Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature - MinIO Storage Connector (AWS + Azure) #997

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"llm": {
"api_key": "${API_KEY}",
"type": "azure_openai_chat",
"deployment_name":"gpt-4o-mini",
"api_base":"https://siasaigc.openai.azure.com/",
"api_version":"gpt-4o-mini-2024-07-18",
"model":"gpt-4o-mini-2024-07-18"

},
"embeddings": {
"api_key": "${API_KEY}",
"type": "azure_openai_chat",
"deployment_name":"text-embedding-3-large",
"api_base":"https://siasaigc.openai.azure.com/",
"api_version":"2024-06-01",
"model":"text-embedding-3-large"

}
}
21 changes: 20 additions & 1 deletion graphrag/config/create_graphrag_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,10 @@ def hydrate_parallelization_params(
connection_string=reader.str(Fragment.conn_string),
storage_account_blob_url=reader.str(Fragment.storage_account_blob_url),
container_name=reader.str(Fragment.container_name),
bucket_name=reader.str(Fragment.bucket_name),
access_key=reader.str(Fragment.access_key),
secret_key=reader.str(Fragment.secret_key),
endpoint = reader.str(Fragment.endpoint),
)
with reader.envvar_prefix(Section.cache), reader.use(values.get("cache")):
c_type = reader.str(Fragment.type)
Expand All @@ -359,6 +363,10 @@ def hydrate_parallelization_params(
storage_account_blob_url=reader.str(Fragment.storage_account_blob_url),
container_name=reader.str(Fragment.container_name),
base_dir=reader.str(Fragment.base_dir) or defs.CACHE_BASE_DIR,
bucket_name=reader.str(Fragment.bucket_name),
access_key=reader.str(Fragment.access_key),
secret_key=reader.str(Fragment.secret_key),
endpoint=reader.str(Fragment.endpoint),
)
with (
reader.envvar_prefix(Section.reporting),
Expand All @@ -371,6 +379,10 @@ def hydrate_parallelization_params(
storage_account_blob_url=reader.str(Fragment.storage_account_blob_url),
container_name=reader.str(Fragment.container_name),
base_dir=reader.str(Fragment.base_dir) or defs.REPORTING_BASE_DIR,
bucket_name=reader.str(Fragment.bucket_name),
access_key=reader.str(Fragment.access_key),
secret_key=reader.str(Fragment.secret_key),
endpoint=reader.str(Fragment.endpoint),
)
with reader.envvar_prefix(Section.storage), reader.use(values.get("storage")):
s_type = reader.str(Fragment.type)
Expand All @@ -380,6 +392,10 @@ def hydrate_parallelization_params(
storage_account_blob_url=reader.str(Fragment.storage_account_blob_url),
container_name=reader.str(Fragment.container_name),
base_dir=reader.str(Fragment.base_dir) or defs.STORAGE_BASE_DIR,
bucket_name=reader.str(Fragment.bucket_name),
access_key=reader.str(Fragment.access_key),
secret_key=reader.str(Fragment.secret_key),
endpoint=reader.str(Fragment.endpoint),
)
with reader.envvar_prefix(Section.chunk), reader.use(values.get("chunks")):
group_by_columns = reader.list("group_by_columns", "BY_COLUMNS")
Expand Down Expand Up @@ -608,7 +624,10 @@ class Fragment(str, Enum):
thread_stagger = "THREAD_STAGGER"
tpm = "TOKENS_PER_MINUTE"
type = "TYPE"

bucket_name="BUCKET_NAME"
access_key = "ACCESS_KEY"
secret_key = "SECRET_KEY" # noqa: S105
endpoint = "ENDPOINT"

class Section(str, Enum):
"""Configuration Sections."""
Expand Down
11 changes: 8 additions & 3 deletions graphrag/config/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ class CacheType(str, Enum):
"""The none cache configuration type."""
blob = "blob"
"""The blob cache configuration type."""

minio = "minio"
"""The minio cache configuration type."""
def __repr__(self):
"""Get a string representation."""
return f'"{self.value}"'
Expand All @@ -45,7 +46,8 @@ class InputType(str, Enum):
"""The file storage type."""
blob = "blob"
"""The blob storage type."""

minio = "minio"
"""The minio storage type."""
def __repr__(self):
"""Get a string representation."""
return f'"{self.value}"'
Expand All @@ -60,6 +62,8 @@ class StorageType(str, Enum):
"""The memory storage type."""
blob = "blob"
"""The blob storage type."""
minio = "minio"
"""The minio storage type."""

def __repr__(self):
"""Get a string representation."""
Expand All @@ -75,7 +79,8 @@ class ReportingType(str, Enum):
"""The console reporting configuration type."""
blob = "blob"
"""The blob reporting configuration type."""

minio = "minio"
"""The minio reporting configuration type."""
def __repr__(self):
"""Get a string representation."""
return f'"{self.value}"'
Expand Down
3 changes: 3 additions & 0 deletions graphrag/config/input_models/input_config_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,6 @@ class InputConfigInput(TypedDict):
title_column: NotRequired[str | None]
document_attribute_columns: NotRequired[list[str] | str | None]
storage_account_blob_url: NotRequired[str | None]
bucket_name: NotRequired[str | None]
access_key: NotRequired[str | None]
endpoint: NotRequired[str | None]
3 changes: 3 additions & 0 deletions graphrag/config/input_models/reporting_config_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@ class ReportingConfigInput(TypedDict):
connection_string: NotRequired[str | None]
container_name: NotRequired[str | None]
storage_account_blob_url: NotRequired[str | None]
bucket_name: NotRequired[str | None]
access_key: NotRequired[str | None]
endpoint: NotRequired[str | None]
3 changes: 3 additions & 0 deletions graphrag/config/input_models/storage_config_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@ class StorageConfigInput(TypedDict):
connection_string: NotRequired[str | None]
container_name: NotRequired[str | None]
storage_account_blob_url: NotRequired[str | None]
bucket_name: NotRequired[str | None]
access_key: NotRequired[str | None]
endpoint: NotRequired[str | None]
16 changes: 16 additions & 0 deletions graphrag/config/models/cache_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,19 @@ class CacheConfig(BaseModel):
storage_account_blob_url: str | None = Field(
description="The storage account blob url to use.", default=None
)
bucket_name: str| None = Field(
description="The bucket name for the input files.", default=None
)
"""The bucket name for the input files."""
access_key: str| None = Field(
description="The access key for the input files.", default=None
)
"""The access key for the input files."""
secret_key: str| None = Field(
description="The secret key for the input files.", default=None
)
"""The secret key for the input files."""
endpoint: str | None = Field(
description="The endpoint for the input files.", default=None
)
"""The endpoint for the input files."""
16 changes: 16 additions & 0 deletions graphrag/config/models/input_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,19 @@ class InputConfig(BaseModel):
document_attribute_columns: list[str] = Field(
description="The document attribute columns to use.", default=[]
)
bucket_name: str| None = Field(
description="The bucket name for the input files.", default=None
)
"""The bucket name for the input files."""
access_key: str| None = Field(
description="The access key for the input files.", default=None
)
"""The access key for the input files."""
secret_key: str| None = Field(
description="The secret key for the input files.", default=None
)
"""The secret key for the input files."""
endpoint: str | None = Field(
description="The endpoint for the input files.", default=None
)
"""The endpoint for the input files."""
12 changes: 12 additions & 0 deletions graphrag/config/models/reporting_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,15 @@ class ReportingConfig(BaseModel):
storage_account_blob_url: str | None = Field(
description="The storage account blob url to use.", default=None
)
bucket_name: str | None = Field(
description="The reporting bucket name to use.", default=None
)
access_key: str | None = Field(
description="The reporting access key to use.", default=None
)
secret_key: str | None = Field(
description="The reporting secret key to use.", default=None
)
endpoint: str | None = Field(
description="The reporting endpoint to use.", default=None
)
16 changes: 16 additions & 0 deletions graphrag/config/models/storage_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,19 @@ class StorageConfig(BaseModel):
storage_account_blob_url: str | None = Field(
description="The storage account blob url to use.", default=None
)
bucket_name: str| None = Field(
description="The bucket name for the input files.", default=None
)
"""The bucket name for the input files."""
access_key: str| None = Field(
description="The access key for the input files.", default=None
)
"""The access key for the input files."""
secret_key: str| None = Field(
description="The secret key for the input files.", default=None
)
"""The secret key for the input files."""
endpoint: str | None = Field(
description="The endpoint for the input files.", default=None
)
"""The endpoint for the input files."""
17 changes: 16 additions & 1 deletion graphrag/index/cache/load_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,13 @@
from graphrag.index.config.cache import (
PipelineBlobCacheConfig,
PipelineFileCacheConfig,
PipelineMinioCacheConfig,
)
from graphrag.index.storage import (
BlobPipelineStorage,
FilePipelineStorage,
MinioPipelineStorage,
)
from graphrag.index.storage import BlobPipelineStorage, FilePipelineStorage

if TYPE_CHECKING:
from graphrag.index.config import (
Expand Down Expand Up @@ -46,6 +51,16 @@ def load_cache(config: PipelineCacheConfig | None, root_dir: str | None):
storage_account_blob_url=config.storage_account_blob_url,
).child(config.base_dir)
return JsonPipelineCache(storage)
case CacheType.minio:
config = cast(PipelineMinioCacheConfig, config)
storage = MinioPipelineStorage(
config.endpoint if config.endpoint is not None else "",
config.access_key if config.access_key is not None else "",
config.secret_key if config.secret_key is not None else "",
config.bucket_name if config.bucket_name is not None else "",
path_prefix=config.base_dir
).child(config.base_dir)
return JsonPipelineCache(storage)
case _:
msg = f"Unknown cache type: {config.type}"
raise ValueError(msg)
5 changes: 5 additions & 0 deletions graphrag/index/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
PipelineCacheConfigTypes,
PipelineFileCacheConfig,
PipelineMemoryCacheConfig,
PipelineMinioCacheConfig,
PipelineNoneCacheConfig,
)
from .input import (
Expand All @@ -22,13 +23,15 @@
PipelineBlobReportingConfig,
PipelineConsoleReportingConfig,
PipelineFileReportingConfig,
PipelineMinioReportingConfig,
PipelineReportingConfig,
PipelineReportingConfigTypes,
)
from .storage import (
PipelineBlobStorageConfig,
PipelineFileStorageConfig,
PipelineMemoryStorageConfig,
PipelineMinioStorageConfig,
PipelineStorageConfig,
PipelineStorageConfigTypes,
)
Expand All @@ -42,6 +45,7 @@
"PipelineBlobCacheConfig",
"PipelineBlobReportingConfig",
"PipelineBlobStorageConfig",
"PipelineMinioReportingConfig",
"PipelineCSVInputConfig",
"PipelineCacheConfig",
"PipelineCacheConfigTypes",
Expand All @@ -66,4 +70,5 @@
"PipelineWorkflowConfig",
"PipelineWorkflowReference",
"PipelineWorkflowStep",
"PipelineMinioCacheConfig"
]
25 changes: 25 additions & 0 deletions graphrag/index/config/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,35 @@ class PipelineBlobCacheConfig(PipelineCacheConfig[Literal[CacheType.blob]]):
)
"""The storage account blob url for cache"""

class PipelineMinioCacheConfig(PipelineCacheConfig[Literal[CacheType.blob]]):
"""Represents the blob cache configuration for the pipeline."""

type: Literal[CacheType.minio] = CacheType.minio
"""The type of cache."""

base_dir: str | None = pydantic_Field(
description="The base directory for the cache.", default=None
)
bucket_name: str| None = pydantic_Field(
description="The bucket name for the input files.", default=None
)
"""The bucket name for the input files."""
access_key: str| None = pydantic_Field(
description="The access key for the input files.", default=None
)
"""The access key for the input files."""
secret_key: str| None = pydantic_Field(
description="The secret key for the input files.", default=None
)
"""The secret key for the input files."""
endpoint: str | None = pydantic_Field(
description="The endpoint for the input files.", default=None
)
"""The endpoint for the input files."""
PipelineCacheConfigTypes = (
PipelineFileCacheConfig
| PipelineMemoryCacheConfig
| PipelineBlobCacheConfig
| PipelineNoneCacheConfig
| PipelineMinioCacheConfig
)
17 changes: 16 additions & 1 deletion graphrag/index/config/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,22 @@ class PipelineInputConfig(BaseModel, Generic[T]):
description="The encoding for the input files.", default=None
)
"""The encoding for the input files."""

bucket_name: str| None = pydantic_Field(
description="The bucket name for the input files.", default=None
)
"""The bucket name for the input files."""
access_key: str| None = pydantic_Field(
description="The access key for the input files.", default=None
)
"""The access key for the input files."""
secret_key: str| None = pydantic_Field(
description="The secret key for the input files.", default=None
)
"""The secret key for the input files."""
endpoint: str | None = pydantic_Field(
description="The endpoint for the input files.", default=None
)
"""The endpoint for the input files."""

class PipelineCSVInputConfig(PipelineInputConfig[Literal[InputFileType.csv]]):
"""Represent the configuration for a CSV input."""
Expand Down
Loading