Skip to content

Commit

Permalink
Visualization, Powerpoint, Misc. Cleanup (#49)
Browse files Browse the repository at this point in the history
* wip on visualizing dataset contents and vectors

* remove docling

* Make datasets represent individual docs

* hi lite

* make the hover point size bigger for stronger highlighting

* highlight even stronger!

* refactor and add a question entry box to show where the question lies in the vector space

* a bit of refactoring and fix unused imports

* force mypy into happiness

* Update release version to dev-testing

* fix types

* disable input while loading

* mark loading while viz is loading

* clear up some warnings

* make the active tab sticky on the ds management page

* small refactoring

* small refactor

* "wip on visualization cleanup"

* "now we're thinking with UMAP tooltips"

* "styling"

* "viz layout, continued."

* "now we're thinking with dependency hell"

* "add dependencies"

* add the last dependency needed for powerpoint parsing

* add other powerpoints

* turn on the no-kb query toggle

* tidy up the chunk metadata handling

* use models for models

* update uv.lock from main

* delete log file

---------

Co-authored-by: Michael Liu <[email protected]>
Co-authored-by: actions-user <[email protected]>
Co-authored-by: Elijah Williams <[email protected]>
  • Loading branch information
4 people authored Dec 3, 2024
1 parent 4f9178e commit e9b721c
Show file tree
Hide file tree
Showing 21 changed files with 775 additions and 39 deletions.
4 changes: 4 additions & 0 deletions llm-service/app/ai/indexing/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
from .readers.json import JSONReader
from .readers.simple_file import SimpleFileReader
from .readers.pdf import PDFReader
from .readers.pptx import PptxReader

logger = logging.getLogger(__name__)

Expand All @@ -63,6 +64,9 @@
".txt": SimpleFileReader,
".md": SimpleFileReader,
".docx": DocxReader,
".pptx": PptxReader,
".pptm": PptxReader,
".ppt": PptxReader,
".csv": CSVReader,
".json": JSONReader,
}
Expand Down
59 changes: 59 additions & 0 deletions llm-service/app/ai/indexing/readers/pptx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#
# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP)
# (C) Cloudera, Inc. 2024
# All rights reserved.
#
# Applicable Open Source License: Apache 2.0
#
# NOTE: Cloudera open source products are modular software products
# made up of hundreds of individual components, each of which was
# individually copyrighted. Each Cloudera open source product is a
# collective work under U.S. Copyright Law. Your license to use the
# collective work is as provided in your written agreement with
# Cloudera. Used apart from the collective work, this file is
# licensed for your use pursuant to the open source license
# identified above.
#
# This code is provided to you pursuant a written agreement with
# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
# this code. If you do not have a written agreement with Cloudera nor
# with an authorized and properly licensed third party, you do not
# have any rights to access nor to use this code.
#
# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the
# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
# DATA.
#

from pathlib import Path
from typing import Any, List

from llama_index.core.schema import TextNode
from llama_index.readers.file import PptxReader as LlamaIndexPptxReader

from .base_reader import BaseReader


class PptxReader(BaseReader):
    """Reader that turns a PowerPoint file into text chunks.

    Parsing is delegated to the llama-index ``PptxReader``; this class only
    adapts its output to the ``BaseReader`` chunking flow.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Forward all arguments to ``BaseReader`` and create the wrapped parser."""
        super().__init__(*args, **kwargs)
        # Built once per reader instance and reused by every load_chunks() call.
        self.inner = LlamaIndexPptxReader()

    def load_chunks(self, file_path: Path) -> List[TextNode]:
        """Parse ``file_path`` and return its content as a list of text nodes.

        Args:
            file_path: Location of the presentation to read.

        Returns:
            The nodes produced by ``_chunks_in_document`` for the parsed document.

        Raises:
            AssertionError: If the wrapped parser does not yield exactly one document.
        """
        loaded = self.inner.load_data(file_path)
        # The rest of this method handles a single document per file.
        assert len(loaded) == 1
        presentation = loaded[0]

        # Replace the parser-assigned id with this reader's document id before
        # attaching metadata and chunking.
        presentation.id_ = self.document_id
        self._add_document_metadata(presentation, file_path)

        return self._chunks_in_document(presentation)
36 changes: 30 additions & 6 deletions llm-service/app/ai/vector_stores/qdrant.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,18 +37,19 @@
#

import os
from typing import Optional
from typing import Optional, Any
import umap

import qdrant_client
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.vector_stores.types import BasePydanticVectorStore
from llama_index.vector_stores.qdrant import (
QdrantVectorStore as LlamaIndexQdrantVectorStore,
)
from qdrant_client.http.models import CountResult
from qdrant_client.http.models import CountResult, Record

from ...services import models
from .vector_store import VectorStore
from ...services import models


def new_qdrant_client() -> qdrant_client.QdrantClient:
Expand All @@ -60,20 +61,20 @@ def new_qdrant_client() -> qdrant_client.QdrantClient:
class QdrantVectorStore(VectorStore):
@staticmethod
def for_chunks(
data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None
data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None
) -> "QdrantVectorStore":
return QdrantVectorStore(table_name=f"index_{data_source_id}", client=client)

@staticmethod
def for_summaries(
data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None
data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None
) -> "QdrantVectorStore":
return QdrantVectorStore(
table_name=f"summary_index_{data_source_id}", client=client
)

def __init__(
self, table_name: str, client: Optional[qdrant_client.QdrantClient] = None
self, table_name: str, client: Optional[qdrant_client.QdrantClient] = None
):
self.client = client or new_qdrant_client()
self.table_name = table_name
Expand Down Expand Up @@ -105,3 +106,26 @@ def exists(self) -> bool:
def llama_vector_store(self) -> BasePydanticVectorStore:
    # Wrap this store's collection name and client in the llama-index Qdrant adapter.
    return LlamaIndexQdrantVectorStore(self.table_name, self.client)

def visualize(self, user_query: Optional[str] = None) -> list[tuple[tuple[float, float], str]]:
    """Reduce the stored vectors to 2-D points labelled with their file name.

    Args:
        user_query: Optional question text. When given (and non-empty) it is
            embedded with the configured embedding model and added as one
            extra point labelled ``"USER_QUERY"``.

    Returns:
        One ``((x, y), file_name)`` entry per record, in the order the records
        were read. ``file_name`` is ``""`` when a record has no payload or its
        payload has no ``"file_name"`` value. Returns an empty list when there
        are no records to project.
    """
    records: list[Record]
    # A single scroll call capped at 5000 records; the second element of the
    # scroll result is discarded, so nothing beyond that cap is read.
    records, _ = self.client.scroll(self.table_name, limit=5000, with_vectors=True)

    if user_query:
        embedding_model = models.get_embedding_model()
        user_query_vector = embedding_model.get_query_embedding(user_query)
        records.append(
            Record(
                vector=user_query_vector,
                id="abc123",
                payload={"file_name": "USER_QUERY"},
            )
        )

    if not records:
        # Nothing to fit the reducer on; report an empty visualization instead.
        return []

    # Build exactly one label per record so labels and vectors stay aligned
    # even when a record has no payload.
    filenames: list[str] = [
        (record.payload or {}).get("file_name") or "" for record in records
    ]
    embeddings = [record.vector for record in records]

    reducer = umap.UMAP()
    reduced_embeddings = reducer.fit_transform(embeddings)

    # Each reduced row is unpacked into an explicit 2-tuple so the result
    # matches the declared return type.
    return [
        ((x, y), filename)
        for (x, y), filename in zip(reduced_embeddings.tolist(), filenames)
    ]
4 changes: 4 additions & 0 deletions llm-service/app/ai/vector_stores/vector_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,7 @@ def llama_vector_store(self) -> BasePydanticVectorStore:
@abstractmethod
def exists(self) -> bool:
    """Does the vector store exist?

    Returns:
        True if the store exists, False otherwise.
    """

@abstractmethod
def visualize(self, user_query: Optional[str] = None) -> list[tuple[tuple[float,float], str]]:
    """Get a 2-d visualization of the vectors in the store.

    Args:
        user_query: Optional query text; how it is used is left to the
            implementation.

    Returns:
        A list of ``((x, y), label)`` entries: a 2-d coordinate paired with a
        string label.
    """
17 changes: 17 additions & 0 deletions llm-service/app/routers/index/data_source/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,23 @@ def chunk_contents(self, chunk_id: str) -> ChunkContentsResponse:
metadata=node.metadata,
)


@router.get("/visualize")
@exceptions.propagates
def visualize(self) -> list[tuple[tuple[float,float], str]]:
    # GET variant: no query text is supplied, so the chunk vector store is
    # asked for its visualization with the default arguments.
    points = self.chunks_vector_store.visualize()
    return points


# Request model taken by the POST "/visualize" handler, which forwards
# user_query to chunks_vector_store.visualize().
class VisualizationRequest(BaseModel):
    # Query text supplied by the caller.
    user_query: str


@router.post("/visualize")
@exceptions.propagates
def visualize_with_query(self, request: VisualizationRequest) -> list[tuple[tuple[float,float], str]]:
    # POST variant: the caller's query text is handed to the chunk vector
    # store's visualize() call.
    query_text = request.user_query
    return self.chunks_vector_store.visualize(query_text)


@router.delete(
"/", summary="Deletes the data source from the index.", response_model=None
)
Expand Down
14 changes: 4 additions & 10 deletions llm-service/app/services/evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,15 @@
from llama_index.core.base.response.schema import Response
from llama_index.core.chat_engine.types import AgentChatResponse
from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator
from llama_index.llms.bedrock import Bedrock

from .llama_utils import completion_to_prompt, messages_to_prompt
from ..services import models


def evaluate_response(
query: str,
chat_response: AgentChatResponse,
query: str,
chat_response: AgentChatResponse,
) -> tuple[float, float]:
evaluator_llm = Bedrock(
model="meta.llama3-8b-instruct-v1:0",
context_size=128000,
messages_to_prompt=messages_to_prompt,
completion_to_prompt=completion_to_prompt,
)
evaluator_llm = models.get_llm("meta.llama3-8b-instruct-v1:0")

relevancy_evaluator = RelevancyEvaluator(llm=evaluator_llm)
relevance = relevancy_evaluator.evaluate_response(
Expand Down
3 changes: 2 additions & 1 deletion llm-service/app/services/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
from llama_index.core.llms import LLM
from llama_index.embeddings.bedrock import BedrockEmbedding
from llama_index.llms.bedrock import Bedrock
from llama_index.llms.bedrock.utils import BEDROCK_FOUNDATION_LLMS

from .caii import get_caii_embedding_models, get_caii_llm_models
from .caii import get_embedding_model as caii_embedding
Expand All @@ -67,7 +68,7 @@ def get_llm(model_name: str = None) -> LLM:
)
return Bedrock(
model=model_name,
context_size=128000,
context_size=BEDROCK_FOUNDATION_LLMS.get(model_name, 8192),
messages_to_prompt=messages_to_prompt,
completion_to_prompt=completion_to_prompt,
)
Expand Down
5 changes: 5 additions & 0 deletions llm-service/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ dependencies = [
"docx2txt>=0.8",
"pandas>=2.2.3",
"fastapi-utils>=0.8.0",
"umap-learn>=0.5.7",
"python-pptx>=1.0.2",
"torch>=2.5.1",
"pillow>=10.4.0",
"transformers>=4.46.3",
"docling>=2.7.0",
]
requires-python = "==3.10.*"
Expand Down
Loading

0 comments on commit e9b721c

Please sign in to comment.