Visualization, Powerpoint, Misc. Cleanup (#49)

* wip on visualizing dataset contents and vectors * remove docling * Make datasets represent individual docs * highlight * make the hover point size bigger for stronger highlighting * highlight even stronger! * refactor and add a question entry box to show where the question lies in the vector space * a bit of refactoring and fix unused imports * force mypy into happiness * Update release version to dev-testing * fix types * disable input while loading * mark loading while viz is loading * clear up some warnings * make the active tab sticky on the ds management page * small refactoring * small refactor * "wip on visualization cleanup" * "now we're thinking with UMAP tooltips" * "styling" * "viz layout, continued." * "now we're thinking with dependency hell" * "add dependencies" * add the last dependency needed for powerpoint parsing * add other powerpoints * turn on the no-kb query toggle * tidy up the chunk metadata handling * use models for models * update uv.lock from main * delete log file --------- Co-authored-by: Michael Liu <[email protected]> Co-authored-by: actions-user <[email protected]> Co-authored-by: Elijah Williams <[email protected]>
cloudera · Dec 3, 2024 · e9b721c · e9b721c
1 parent 4f9178e
commit e9b721c
Show file tree

Hide file tree

Showing 21 changed files with 775 additions and 39 deletions.
diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py
@@ -55,6 +55,7 @@
 from .readers.json import JSONReader
 from .readers.simple_file import SimpleFileReader
 from .readers.pdf import PDFReader
+from .readers.pptx import PptxReader
 
 logger = logging.getLogger(__name__)
 
@@ -63,6 +64,9 @@
     ".txt": SimpleFileReader,
     ".md": SimpleFileReader,
     ".docx": DocxReader,
+    ".pptx": PptxReader,
+    ".pptm": PptxReader,
+    ".ppt": PptxReader,
     ".csv": CSVReader,
     ".json": JSONReader,
 }

diff --git a/llm-service/app/ai/indexing/readers/pptx.py b/llm-service/app/ai/indexing/readers/pptx.py
@@ -0,0 +1,59 @@
+#
+#  CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
+#  (C) Cloudera, Inc. 2024
+#  All rights reserved.
+#
+#  Applicable Open Source License: Apache 2.0
+#
+#  NOTE: Cloudera open source products are modular software products
+#  made up of hundreds of individual components, each of which was
+#  individually copyrighted.  Each Cloudera open source product is a
+#  collective work under U.S. Copyright Law. Your license to use the
+#  collective work is as provided in your written agreement with
+#  Cloudera.  Used apart from the collective work, this file is
+#  licensed for your use pursuant to the open source license
+#  identified above.
+#
+#  This code is provided to you pursuant a written agreement with
+#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
+#  this code. If you do not have a written agreement with Cloudera nor
+#  with an authorized and properly licensed third party, you do not
+#  have any rights to access nor to use this code.
+#
+#  Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
+#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
+#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
+#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
+#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
+#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
+#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
+#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
+#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
+#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
+#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
+#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
+#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
+#  DATA.
+#
+
+from pathlib import Path
+from typing import Any, List
+
+from llama_index.core.schema import TextNode
+from llama_index.readers.file import PptxReader as LlamaIndexPptxReader
+
+from .base_reader import BaseReader
+
+
+class PptxReader(BaseReader):
+    """Reader that turns a PowerPoint file into a list of ``TextNode`` chunks.
+
+    Parsing is delegated to LlamaIndex's ``PptxReader``; this class assigns the
+    document id and then hands the document to the ``BaseReader`` helpers
+    ``_add_document_metadata`` and ``_chunks_in_document``.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        """Forward all arguments to ``BaseReader`` and create the inner parser."""
+        super().__init__(*args, **kwargs)
+        self.inner = LlamaIndexPptxReader()
+
+    def load_chunks(self, file_path: Path) -> List[TextNode]:
+        """Parse ``file_path`` and return the chunks of the resulting document.
+
+        Raises:
+            ValueError: if the inner reader does not return exactly one document.
+        """
+        documents = self.inner.load_data(file_path)
+        # Explicit check rather than ``assert``: asserts are stripped under
+        # ``python -O``, which would let extra documents be dropped silently.
+        if len(documents) != 1:
+            raise ValueError(
+                f"Expected exactly one document from {file_path}, got {len(documents)}"
+            )
+        document = documents[0]
+        document.id_ = self.document_id
+        self._add_document_metadata(document, file_path)
+        return self._chunks_in_document(document)
diff --git a/llm-service/app/ai/vector_stores/qdrant.py b/llm-service/app/ai/vector_stores/qdrant.py
@@ -37,18 +37,19 @@
 #
 
 import os
-from typing import Optional
+from typing import Optional, Any
+import umap
 
 import qdrant_client
 from llama_index.core.indices import VectorStoreIndex
 from llama_index.core.vector_stores.types import BasePydanticVectorStore
 from llama_index.vector_stores.qdrant import (
     QdrantVectorStore as LlamaIndexQdrantVectorStore,
 )
-from qdrant_client.http.models import CountResult
+from qdrant_client.http.models import CountResult, Record
 
-from ...services import models
 from .vector_store import VectorStore
+from ...services import models
 
 
 def new_qdrant_client() -> qdrant_client.QdrantClient:
@@ -60,20 +61,20 @@ def new_qdrant_client() -> qdrant_client.QdrantClient:
 class QdrantVectorStore(VectorStore):
     @staticmethod
     def for_chunks(
-        data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None
+            data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None
     ) -> "QdrantVectorStore":
         return QdrantVectorStore(table_name=f"index_{data_source_id}", client=client)
 
     @staticmethod
     def for_summaries(
-        data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None
+            data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None
     ) -> "QdrantVectorStore":
         return QdrantVectorStore(
             table_name=f"summary_index_{data_source_id}", client=client
         )
 
     def __init__(
-        self, table_name: str, client: Optional[qdrant_client.QdrantClient] = None
+            self, table_name: str, client: Optional[qdrant_client.QdrantClient] = None
     ):
         self.client = client or new_qdrant_client()
         self.table_name = table_name
@@ -105,3 +106,26 @@ def exists(self) -> bool:
     def llama_vector_store(self) -> BasePydanticVectorStore:
         vector_store = LlamaIndexQdrantVectorStore(self.table_name, self.client)
         return vector_store
+
+    def visualize(self, user_query: Optional[str] = None) -> list[tuple[tuple[float, float], str]]:
+        """Project the vectors stored in this collection into 2-D using UMAP.
+
+        Args:
+            user_query: optional text; when given it is embedded with the
+                configured embedding model and added to the projection under
+                the label ``"USER_QUERY"``.
+
+        Returns:
+            One ``((x, y), file_name)`` pair per projected vector, or an empty
+            list when the collection holds no points. The label is ``None`` for
+            a record whose payload is missing or has no ``file_name`` key.
+        """
+        records: list[Record]
+        # Only the first scroll page is used; the next-page offset is discarded,
+        # so at most 5000 points are visualized.
+        records, _ = self.client.scroll(self.table_name, limit=5000, with_vectors=True)
+
+        if not records:
+            # Nothing stored, so there is no vector space to project.
+            return []
+
+        if user_query:
+            embedding_model = models.get_embedding_model()
+            user_query_vector = embedding_model.get_query_embedding(user_query)
+            records.append(Record(vector=user_query_vector, id="abc123", payload={"file_name": "USER_QUERY"}))
+
+        # Build exactly one label per record. Skipping records without a payload
+        # would shift every later label onto the wrong coordinate.
+        filenames = [(record.payload or {}).get("file_name") for record in records]
+        embeddings = [record.vector for record in records]
+
+        reducer = umap.UMAP()
+        reduced_embeddings = reducer.fit_transform(embeddings)
+
+        # todo: figure out how to satisfy mypy on this line
+        return [(tuple(coordinate), filename) for coordinate, filename in zip(reduced_embeddings.tolist(), filenames)]  # type: ignore
diff --git a/llm-service/app/ai/vector_stores/vector_store.py b/llm-service/app/ai/vector_stores/vector_store.py
@@ -66,3 +66,7 @@ def llama_vector_store(self) -> BasePydanticVectorStore:
     @abstractmethod
     def exists(self) -> bool:
         """Does the vector store exist?"""
+
+    @abstractmethod
+    def visualize(
+        self, user_query: Optional[str] = None
+    ) -> list[tuple[tuple[float, float], str]]:
+        """Return a 2-d visualization of the vectors in the store.
+
+        Each returned item pairs a two-float coordinate with a string.
+        ``user_query`` is optional; how it is used is left to the implementation.
+        """
diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py
@@ -99,6 +99,23 @@ def chunk_contents(self, chunk_id: str) -> ChunkContentsResponse:
             metadata=node.metadata,
         )
 
+
+    @router.get("/visualize")
+    @exceptions.propagates
+    def visualize(self) -> list[tuple[tuple[float, float], str]]:
+        # Ask the chunk vector store for its 2-d projection; no user query is passed.
+        projection = self.chunks_vector_store.visualize()
+        return projection
+
+
+    # Request body for POST /visualize.
+    class VisualizationRequest(BaseModel):
+        # Query text forwarded to the chunk vector store's visualize() call.
+        user_query: str
+
+
+    @router.post("/visualize")
+    @exceptions.propagates
+    def visualize_with_query(
+        self, request: VisualizationRequest
+    ) -> list[tuple[tuple[float, float], str]]:
+        # Same store call as GET /visualize, but forwarding the caller's query text.
+        question = request.user_query
+        return self.chunks_vector_store.visualize(question)
+
+
     @router.delete(
         "/", summary="Deletes the data source from the index.", response_model=None
     )

diff --git a/llm-service/app/services/evaluators.py b/llm-service/app/services/evaluators.py
@@ -39,21 +39,15 @@
 from llama_index.core.base.response.schema import Response
 from llama_index.core.chat_engine.types import AgentChatResponse
 from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator
-from llama_index.llms.bedrock import Bedrock
 
-from .llama_utils import completion_to_prompt, messages_to_prompt
+from ..services import models
 
 
 def evaluate_response(
-    query: str,
-    chat_response: AgentChatResponse,
+        query: str,
+        chat_response: AgentChatResponse,
 ) -> tuple[float, float]:
-    evaluator_llm = Bedrock(
-        model="meta.llama3-8b-instruct-v1:0",
-        context_size=128000,
-        messages_to_prompt=messages_to_prompt,
-        completion_to_prompt=completion_to_prompt,
-    )
+    evaluator_llm = models.get_llm("meta.llama3-8b-instruct-v1:0")
 
     relevancy_evaluator = RelevancyEvaluator(llm=evaluator_llm)
     relevance = relevancy_evaluator.evaluate_response(

diff --git a/llm-service/app/services/models.py b/llm-service/app/services/models.py
@@ -44,6 +44,7 @@
 from llama_index.core.llms import LLM
 from llama_index.embeddings.bedrock import BedrockEmbedding
 from llama_index.llms.bedrock import Bedrock
+from llama_index.llms.bedrock.utils import BEDROCK_FOUNDATION_LLMS
 
 from .caii import get_caii_embedding_models, get_caii_llm_models
 from .caii import get_embedding_model as caii_embedding
@@ -67,7 +68,7 @@ def get_llm(model_name: str = None) -> LLM:
         )
     return Bedrock(
         model=model_name,
-        context_size=128000,
+        context_size=BEDROCK_FOUNDATION_LLMS.get(model_name, 8192),
         messages_to_prompt=messages_to_prompt,
         completion_to_prompt=completion_to_prompt,
     )

diff --git a/llm-service/pyproject.toml b/llm-service/pyproject.toml
@@ -21,6 +21,11 @@ dependencies = [
     "docx2txt>=0.8",
     "pandas>=2.2.3",
     "fastapi-utils>=0.8.0",
+    "umap-learn>=0.5.7",
+    "python-pptx>=1.0.2",
+    "torch>=2.5.1",
+    "pillow>=10.4.0",
+    "transformers>=4.46.3",
     "docling>=2.7.0",
 ]
 requires-python = "==3.10.*"