Python: Azure Cosmos DB NoSQL Vector Store & Collection implementation (microsoft#9296)

### Motivation and Context

This PR implements the Azure Cosmos DB NoSQL vector store and vector collection for the Python Semantic Kernel.

### Description

Adds the Azure Cosmos DB NoSQL vector store and vector collection implementation, wires the Cosmos DB emulator and the new `AZURE_COSMOS_DB_NO_SQL_URL`/`AZURE_COSMOS_DB_NO_SQL_KEY` environment variables into the Python integration-test workflow, and updates the memory sample to exercise the new collection.
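For context, the vector-collection contract this PR implements (batch upsert, batch get by key, similarity search) can be illustrated with a self-contained sketch. All names below are hypothetical placeholders, not the actual semantic-kernel API; the real classes live under `semantic_kernel.connectors.memory.azure_cosmos_db` and delegate storage to Cosmos DB.

```python
import math


class ToyVectorCollection:
    """Illustrative in-memory stand-in for a vector collection.

    Mirrors the shape of the operations the sample in this PR uses
    (upsert_batch, get_batch) plus a cosine-similarity search.
    """

    def __init__(self):
        self._records = {}  # key -> (vector, payload)

    def upsert_batch(self, records):
        # records: iterable of (key, vector, payload) tuples
        for key, vector, payload in records:
            self._records[key] = (vector, payload)
        return [key for key, _, _ in records]

    def get_batch(self, keys):
        return [(k, *self._records[k]) for k in keys if k in self._records]

    def search(self, query, top=1):
        def cosine(a, b):
            dot = sum(x * y for x, y in zip(a, b))
            norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
            return dot / norm

        # Rank all stored vectors by cosine similarity to the query.
        ranked = sorted(
            self._records.items(),
            key=lambda item: cosine(query, item[1][0]),
            reverse=True,
        )
        return [(key, payload) for key, (vec, payload) in ranked[:top]]


collection = ToyVectorCollection()
collection.upsert_batch([
    ("a", [1.0, 0.0], "My text"),
    ("b", [0.0, 1.0], "My other text"),
])
print(collection.search([0.9, 0.1], top=1))  # closest to record "a"
```

The real collection additionally handles embedding generation, index kinds (e.g. `QUANTIZED_FLAT` for Cosmos DB NoSQL), and partition keys, but the request/response shape follows this pattern.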


### Contribution Checklist


- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄

---------

Co-authored-by: Eduard van Valkenburg <[email protected]>
TaoChenOSU and eavanvalkenburg authored Nov 12, 2024
1 parent 051a3d0 commit e6d4b10
Showing 26 changed files with 2,077 additions and 98 deletions.
30 changes: 29 additions & 1 deletion .github/workflows/python-integration-tests.yml
@@ -100,6 +100,8 @@ jobs:
VERTEX_AI_GEMINI_MODEL_ID: ${{ vars.VERTEX_AI_GEMINI_MODEL_ID }}
VERTEX_AI_EMBEDDING_MODEL_ID: ${{ vars.VERTEX_AI_EMBEDDING_MODEL_ID }}
REDIS_CONNECTION_STRING: ${{ vars.REDIS_CONNECTION_STRING }}
AZURE_COSMOS_DB_NO_SQL_URL: ${{ vars.AZURE_COSMOS_DB_NO_SQL_URL }}
AZURE_COSMOS_DB_NO_SQL_KEY: ${{ secrets.AZURE_COSMOS_DB_NO_SQL_KEY }}
steps:
- uses: actions/checkout@v4
- name: Set up uv
@@ -150,6 +152,12 @@ jobs:
run: docker run -d --name redis-stack-server -p 6379:6379 redis/redis-stack-server:latest
- name: Setup Weaviate docker deployment
run: docker run -d -p 8080:8080 -p 50051:50051 cr.weaviate.io/semitechnologies/weaviate:1.26.6
- name: Start Azure Cosmos DB emulator
if: matrix.os == 'windows-latest'
run: |
Write-Host "Launching Cosmos DB Emulator"
Import-Module "$env:ProgramFiles\Azure Cosmos DB Emulator\PSModules\Microsoft.Azure.CosmosDB.Emulator"
Start-CosmosDbEmulator
- name: Azure CLI Login
if: github.event_name != 'pull_request'
uses: azure/login@v2
Expand All @@ -159,31 +167,37 @@ jobs:
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
- name: Run Integration Tests - Completions
id: run_tests_completions
timeout-minutes: 10
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/completions -v --junitxml=pytest-completions.xml
- name: Run Integration Tests - Embeddings
id: run_tests_embeddings
timeout-minutes: 5
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/embeddings -v --junitxml=pytest-embeddings.xml
- name: Run Integration Tests - Memory
id: run_tests_memory
timeout-minutes: 5
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/memory -v --junitxml=pytest-memory.xml
- name: Run Integration Tests - Cross Language
id: run_tests_cross_language
timeout-minutes: 5
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/cross_language -v --junitxml=pytest-cross.xml
- name: Run Integration Tests - Planning
id: run_tests_planning
timeout-minutes: 5
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/planning -v --junitxml=pytest-planning.xml
- name: Run Integration Tests - Samples
id: run_tests_samples
timeout-minutes: 5
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/samples -v --junitxml=pytest-samples.xml
@@ -255,6 +269,8 @@ jobs:
VERTEX_AI_GEMINI_MODEL_ID: ${{ vars.VERTEX_AI_GEMINI_MODEL_ID }}
VERTEX_AI_EMBEDDING_MODEL_ID: ${{ vars.VERTEX_AI_EMBEDDING_MODEL_ID }}
REDIS_CONNECTION_STRING: ${{ vars.REDIS_CONNECTION_STRING }}
AZURE_COSMOS_DB_NO_SQL_URL: ${{ vars.AZURE_COSMOS_DB_NO_SQL_URL }}
AZURE_COSMOS_DB_NO_SQL_KEY: ${{ secrets.AZURE_COSMOS_DB_NO_SQL_KEY }}
steps:
- uses: actions/checkout@v4
- name: Set up uv
@@ -305,6 +321,12 @@ jobs:
run: docker run -d --name redis-stack-server -p 6379:6379 redis/redis-stack-server:latest
- name: Setup Weaviate docker deployment
run: docker run -d -p 8080:8080 -p 50051:50051 cr.weaviate.io/semitechnologies/weaviate:1.26.6
- name: Start Azure Cosmos DB emulator
if: matrix.os == 'windows-latest'
run: |
Write-Host "Launching Cosmos DB Emulator"
Import-Module "$env:ProgramFiles\Azure Cosmos DB Emulator\PSModules\Microsoft.Azure.CosmosDB.Emulator"
Start-CosmosDbEmulator
- name: Azure CLI Login
if: github.event_name != 'pull_request'
uses: azure/login@v2
Expand All @@ -314,31 +336,37 @@ jobs:
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
- name: Run Integration Tests - Completions
id: run_tests_completions
timeout-minutes: 10
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/completions -v --junitxml=pytest-completions.xml
- name: Run Integration Tests - Embeddings
id: run_tests_embeddings
timeout-minutes: 5
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/embeddings -v --junitxml=pytest-embeddings.xml
- name: Run Integration Tests - Memory
id: run_tests_memory
timeout-minutes: 5
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/memory -v --junitxml=pytest-memory.xml
- name: Run Integration Tests - Cross Language
id: run_tests_cross_language
timeout-minutes: 5
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/cross_language -v --junitxml=pytest-cross.xml
- name: Run Integration Tests - Planning
id: run_tests_planning
timeout-minutes: 5
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/integration/planning -v --junitxml=pytest-planning.xml
- name: Run Integration Tests - Samples
id: run_tests_samples
timeout-minutes: 5
shell: bash
run: |
uv run pytest -n logical --dist loadfile --dist worksteal ./tests/samples -v --junitxml=pytest-samples.xml
@@ -418,4 +446,4 @@ jobs:
dry_run: ${{ env.run_type != 'Daily' && env.run_type != 'Manual'}}
job: ${{ toJson(job) }}
steps: ${{ toJson(steps) }}
overwrite: "{title: ` ${{ env.run_type }}: ${{ env.date }} `, text: ` ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`}"
overwrite: "{title: ` ${{ env.run_type }}: ${{ env.date }} `, text: ` ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`}"
1 change: 1 addition & 0 deletions python/.cspell.json
@@ -46,6 +46,7 @@
"mongocluster",
"ndarray",
"nopep",
"NOSQL",
"ollama",
"onyourdatatest",
"OPENAI",
171 changes: 97 additions & 74 deletions python/samples/concepts/memory/new_memory.py
@@ -12,6 +12,9 @@
from semantic_kernel.connectors.ai.open_ai import OpenAIEmbeddingPromptExecutionSettings, OpenAITextEmbedding
from semantic_kernel.connectors.ai.open_ai.services.azure_text_embedding import AzureTextEmbedding
from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_collection import (
AzureCosmosDBNoSQLCollection,
)
from semantic_kernel.connectors.memory.in_memory import InMemoryVectorCollection
from semantic_kernel.connectors.memory.postgres.postgres_collection import PostgresCollection
from semantic_kernel.connectors.memory.qdrant import QdrantCollection
Expand All @@ -25,55 +28,64 @@
VectorStoreRecordVectorField,
vectorstoremodel,
)


@vectorstoremodel
@dataclass
class MyDataModelArray:
vector: Annotated[
np.ndarray | None,
VectorStoreRecordVectorField(
embedding_settings={"embedding": OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)},
index_kind="hnsw",
dimensions=1536,
distance_function="cosine_similarity",
property_type="float",
serialize_function=np.ndarray.tolist,
deserialize_function=np.array,
),
] = None
other: str | None = None
id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
content: Annotated[
str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector", property_type="str")
] = "content1"


@vectorstoremodel
@dataclass
class MyDataModelList:
vector: Annotated[
list[float] | None,
VectorStoreRecordVectorField(
embedding_settings={"embedding": OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)},
index_kind="hnsw",
dimensions=1536,
distance_function="cosine_similarity",
property_type="float",
),
] = None
other: str | None = None
id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
content: Annotated[
str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector", property_type="str")
] = "content1"
from semantic_kernel.data.const import DistanceFunction, IndexKind


def get_data_model_array(index_kind: IndexKind, distance_function: DistanceFunction) -> type:
@vectorstoremodel
@dataclass
class DataModelArray:
vector: Annotated[
np.ndarray | None,
VectorStoreRecordVectorField(
embedding_settings={"embedding": OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)},
index_kind=index_kind,
dimensions=1536,
distance_function=distance_function,
property_type="float",
serialize_function=np.ndarray.tolist,
deserialize_function=np.array,
),
] = None
other: str | None = None
id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
content: Annotated[
str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector", property_type="str")
] = "content1"

return DataModelArray


def get_data_model_list(index_kind: IndexKind, distance_function: DistanceFunction) -> type:
@vectorstoremodel
@dataclass
class DataModelList:
vector: Annotated[
list[float] | None,
VectorStoreRecordVectorField(
embedding_settings={"embedding": OpenAIEmbeddingPromptExecutionSettings(dimensions=1536)},
index_kind=index_kind,
dimensions=1536,
distance_function=distance_function,
property_type="float",
),
] = None
other: str | None = None
id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
content: Annotated[
str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector", property_type="str")
] = "content1"

return DataModelList


collection_name = "test"
MyDataModel = MyDataModelArray
# Depending on the vector database, the index kind and distance function may need to be adjusted,
# since not all combinations are supported by all databases.
DataModel = get_data_model_array(IndexKind.HNSW, DistanceFunction.COSINE)

# A list of VectorStoreRecordCollection that can be used.
# Available stores are:
# Available collections are:
# - ai_search: Azure AI Search
# - postgres: PostgreSQL
# - redis_json: Redis JSON
Expand All @@ -83,63 +95,74 @@ class MyDataModelList:
# - weaviate: Weaviate
# Please either configure the weaviate settings via environment variables or provide them through the constructor.
# Note that embed mode is not supported on Windows: https://github.com/weaviate/weaviate/issues/3315
#
# This is represented as a mapping from the store name to a
# function which returns the store.
# Using a function allows for lazy initialization of the store,
# so that settings for unused stores do not cause validation errors.
stores: dict[str, Callable[[], VectorStoreRecordCollection]] = {
"ai_search": lambda: AzureAISearchCollection[MyDataModel](
data_model_type=MyDataModel,
# - azure_cosmos_nosql: Azure Cosmos NoSQL
# https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/how-to-create-account?tabs=azure-portal
# Please see the link above to learn how to set up an Azure Cosmos NoSQL account.
# https://learn.microsoft.com/en-us/azure/cosmos-db/how-to-develop-emulator?tabs=windows%2Cpython&pivots=api-nosql
# Please see the link above to learn how to set up the Azure Cosmos NoSQL emulator on your machine.
# For this sample to work with Azure Cosmos NoSQL, please adjust the index_kind of the data model to QUANTIZED_FLAT.
# This is represented as a mapping from the collection name to a
# function which returns the collection.
# Using a function allows for lazy initialization of the collection,
# so that settings for unused collections do not cause validation errors.
collections: dict[str, Callable[[], VectorStoreRecordCollection]] = {
"ai_search": lambda: AzureAISearchCollection[DataModel](
data_model_type=DataModel,
),
"postgres": lambda: PostgresCollection[str, MyDataModel](
data_model_type=MyDataModel,
"postgres": lambda: PostgresCollection[str, DataModel](
data_model_type=DataModel,
collection_name=collection_name,
),
"redis_json": lambda: RedisJsonCollection[MyDataModel](
data_model_type=MyDataModel,
"redis_json": lambda: RedisJsonCollection[DataModel](
data_model_type=DataModel,
collection_name=collection_name,
prefix_collection_name_to_key_names=True,
),
"redis_hashset": lambda: RedisHashsetCollection[MyDataModel](
data_model_type=MyDataModel,
"redis_hashset": lambda: RedisHashsetCollection[DataModel](
data_model_type=DataModel,
collection_name=collection_name,
prefix_collection_name_to_key_names=True,
),
"qdrant": lambda: QdrantCollection[MyDataModel](
data_model_type=MyDataModel, collection_name=collection_name, prefer_grpc=True, named_vectors=False
"qdrant": lambda: QdrantCollection[DataModel](
data_model_type=DataModel, collection_name=collection_name, prefer_grpc=True, named_vectors=False
),
"in_memory": lambda: InMemoryVectorCollection[DataModel](
data_model_type=DataModel,
collection_name=collection_name,
),
"in_memory": lambda: InMemoryVectorCollection[MyDataModel](
data_model_type=MyDataModel,
"weaviate": lambda: WeaviateCollection[DataModel](
data_model_type=DataModel,
collection_name=collection_name,
),
"weaviate": lambda: WeaviateCollection[MyDataModel](
data_model_type=MyDataModel,
"azure_cosmos_nosql": lambda: AzureCosmosDBNoSQLCollection(
data_model_type=DataModel,
database_name="sample_database",
collection_name=collection_name,
create_database=True,
),
}


async def main(store: str, use_azure_openai: bool, embedding_model: str):
async def main(collection: str, use_azure_openai: bool, embedding_model: str):
kernel = Kernel()
service_id = "embedding"
if use_azure_openai:
kernel.add_service(AzureTextEmbedding(service_id=service_id, deployment_name=embedding_model))
else:
kernel.add_service(OpenAITextEmbedding(service_id=service_id, ai_model_id=embedding_model))
async with stores[store]() as record_store:
await record_store.create_collection_if_not_exists()
async with collections[collection]() as record_collection:
await record_collection.create_collection_if_not_exists()

record1 = MyDataModel(content="My text", id="e6103c03-487f-4d7d-9c23-4723651c17f4")
record2 = MyDataModel(content="My other text", id="09caec77-f7e1-466a-bcec-f1d51c5b15be")
record1 = DataModel(content="My text", id="e6103c03-487f-4d7d-9c23-4723651c17f4")
record2 = DataModel(content="My other text", id="09caec77-f7e1-466a-bcec-f1d51c5b15be")

records = await VectorStoreRecordUtils(kernel).add_vector_to_records(
[record1, record2], data_model_type=MyDataModel
[record1, record2], data_model_type=DataModel
)
keys = await record_store.upsert_batch(records)
keys = await record_collection.upsert_batch(records)
print(f"upserted {keys=}")

results = await record_store.get_batch([record1.id, record2.id])
results = await record_collection.get_batch([record1.id, record2.id])
if results:
for result in results:
print(f"found {result.id=}")
Expand All @@ -156,7 +179,7 @@ async def main(store: str, use_azure_openai: bool, embedding_model: str):
argparse.ArgumentParser()

parser = argparse.ArgumentParser()
parser.add_argument("--store", default="in_memory", choices=stores.keys(), help="What store to use.")
parser.add_argument("--collection", default="in_memory", choices=collections.keys(), help="What collection to use.")
# Option of whether to use OpenAI or Azure OpenAI.
parser.add_argument("--use-azure-openai", action="store_true", help="Use Azure OpenAI instead of OpenAI.")
# Model
Expand All @@ -165,4 +188,4 @@ async def main(store: str, use_azure_openai: bool, embedding_model: str):
)
args = parser.parse_args()

asyncio.run(main(store=args.store, use_azure_openai=args.use_azure_openai, embedding_model=args.model))
asyncio.run(main(collection=args.collection, use_azure_openai=args.use_azure_openai, embedding_model=args.model))
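The sample above keeps its backends in a name-to-factory mapping rather than a name-to-instance mapping, so that only the collection the user selects is ever constructed and validated. The pattern in isolation can be sketched as follows (placeholder classes; the real entries construct semantic-kernel collections whose settings are read from the environment):

```python
from collections.abc import Callable


class InMemoryBackend:
    """Placeholder for a collection whose constructor is cheap."""

    def __init__(self):
        self.name = "in_memory"


class RemoteBackend:
    """Placeholder for a backend whose constructor validates settings
    (e.g. reads a connection string) and raises if they are missing."""

    def __init__(self):
        raise RuntimeError("missing connection settings")


# Storing zero-argument factories instead of instances defers construction:
# settings for backends the user never selects are never validated, so an
# unset AZURE_COSMOS_DB_NO_SQL_KEY does not break a run that uses in_memory.
backends: dict[str, Callable[[], object]] = {
    "in_memory": lambda: InMemoryBackend(),
    "remote": lambda: RemoteBackend(),
}

selected = backends["in_memory"]()  # only this factory runs
print(selected.name)
```

This is why the sample calls `collections[collection]()` only after parsing the command-line choice.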