From 9f53d67f30581cec6bc664c4ae9a9617e9cf55b6 Mon Sep 17 00:00:00 2001
From: Kritin Vongthongsri <73642562+kritinv@users.noreply.github.com>
Date: Wed, 3 Apr 2024 16:14:37 -0400
Subject: [PATCH] ChromaDB Integration

---
 deepeval/synthesizer/context_generator.py | 98 ++++++++++++++++++++---
 tests/test_synthesizer.py                 | 10 ++-
 2 files changed, 94 insertions(+), 14 deletions(-)

diff --git a/deepeval/synthesizer/context_generator.py b/deepeval/synthesizer/context_generator.py
index a38fb79a1..934f72123 100644
--- a/deepeval/synthesizer/context_generator.py
+++ b/deepeval/synthesizer/context_generator.py
@@ -1,5 +1,6 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Tuple
+import chromadb
 import random
 
 from deepeval.synthesizer.doc_chunker import (
@@ -10,6 +11,8 @@
 from deepeval.models.openai_embedding_model import OpenAIEmbeddingModel
 from deepeval.models.base_model import DeepEvalBaseEmbeddingModel
 
+DB_FOLDER = ".vectordb"
+DB_COLLECTION_NAME = "synth_vectordb"
 
 class ContextGenerator:
     def __init__(
@@ -27,6 +30,7 @@ def __init__(
         self.multithreading = multithreading
         self.document_paths: List[str] = document_paths
 
+        self.client = chromadb.PersistentClient(path=DB_FOLDER)
         # TODO: Potential bug, calling generate_goldens_from_docs
         # twice in a notebook environment will not refresh combined chunks
         self.combined_chunks: List[Chunk] = self._load_docs()
@@ -50,17 +54,22 @@ def generate_contexts(
 
     ############### Load Docs #############################
     def _load_docs(self) -> List[Chunk]:
-
-        def process_document(path):
-            doc_chunker = DocumentChunker(
-                self.embedder, self.chunk_size, self.chunk_overlap
-            )
-            return doc_chunker.load_doc(path)
-
+        # check if docs are already in db
+        new_doc_paths = []
+        old_doc_paths = []
+        collection = self.client.get_or_create_collection(name=DB_COLLECTION_NAME)
+        for path in self.document_paths:
+            docs_from_path = collection.get(where={"source": path})
+            if len(docs_from_path['documents']) == 0:
+                new_doc_paths.append(path)
+            else:
+                old_doc_paths.append(path)
+        # create chunks for docs not in db
+        if len(new_doc_paths) != 0:
+            print("Calculating embeddings for: " + str(new_doc_paths))
         combined_chunks = []
-
         if not self.multithreading:
-            for path in self.document_paths:
+            for path in new_doc_paths:
                 doc_chunker = DocumentChunker(
                     self.embedder, self.chunk_size, self.chunk_overlap
                 )
@@ -69,8 +78,8 @@ def process_document(path):
         else:
             with ThreadPoolExecutor() as executor:
                 future_to_path = {
-                    executor.submit(process_document, path): path
-                    for path in self.document_paths
+                    executor.submit(self.process_document, path): path
+                    for path in new_doc_paths
                 }
                 for future in as_completed(future_to_path):
                     path = future_to_path[future]
@@ -79,9 +88,63 @@ def process_document(path):
                         combined_chunks.extend(chunks)
                     except Exception as exc:
                         print(f"{path} generated an exception: {exc}")
-
+        self.save_chunks_to_db(collection, combined_chunks)
+        # create chunks from docs in db
+        saved_chunks = self.get_chunks_from_db(collection, old_doc_paths)
+        combined_chunks.extend(saved_chunks)
         return combined_chunks
 
+
+    def process_document(self, path):
+        doc_chunker = DocumentChunker(
+            self.embedder, self.chunk_size, self.chunk_overlap
+        )
+        return doc_chunker.load_doc(path)
+
+
+    def save_chunks_to_db(self, collection, combined_chunks):
+        if len(combined_chunks) == 0: return
+        documents = []
+        embeddings = []
+        metadatas = []
+        ids = []
+        for i, chunk in enumerate(combined_chunks):
+            documents.append(chunk.content)
+            embeddings.append(chunk.embedding)
+            metadatas.append({"source": chunk.source_file,
+                              "similarity_to_mean": chunk.similarity_to_mean})
+            ids.append(chunk.source_file + str(i))
+
+        collection.add(
+            documents=documents,
+            embeddings=embeddings,
+            metadatas=metadatas,
+            ids=ids
+        )
+
+    def get_chunks_from_db(self, collection, paths):
+        chunks = []
+        filter = {}
+        if len(paths) == 0:
+            return []
+        elif len(paths) == 1:
+            filter["source"] = paths[0]
+        else:
+            filter["$or"] = [{"source": {"$eq": path}} for path in paths]
+        print("Getting cached embeddings for: " + str(paths))
+        results = collection.get(where=filter, include=['embeddings', 'documents', 'metadatas'])
+        for i in range(len(results["ids"])):
+            chunk = Chunk(
+                id=results['ids'][i],
+                content=results['documents'][i],
+                embedding=results['embeddings'][i],
+                source_file=results['metadatas'][i]['source'],
+                similarity_to_mean=results['metadatas'][i]['similarity_to_mean']
+            )
+            chunks.append(chunk)
+        return chunks
+
+
     ############### Search N Chunks ########################
     def _get_n_random_clusters(
         self, n: int, cluster_size: int
@@ -130,3 +193,14 @@ def _get_n_other_similar_chunks(
 
         similar_chunks = [self.combined_chunks[i] for i in top_n_indices]
         return similar_chunks
+
+########################################
+
+import time
+
+if __name__ == "__main__":
+    start_time = time.time()
+    generator = ContextGenerator(document_paths=['data/pdf_example.pdf', 'data/txt_example.txt', 'data/docx_example.docx', 'data/large.pdf'])
+    end_time = time.time()
+
+    print(f"Initialization and loading took {end_time - start_time:.2f} seconds.")
diff --git a/tests/test_synthesizer.py b/tests/test_synthesizer.py
index 79844e917..6c9ca7dde 100644
--- a/tests/test_synthesizer.py
+++ b/tests/test_synthesizer.py
@@ -7,12 +7,18 @@
 def test_synthesizer():
     module_b_dir = os.path.dirname(os.path.realpath(__file__))
 
-    file_path = os.path.join(
+    file_path_1 = os.path.join(
         module_b_dir, "synthesizer_data", "pdf_example.pdf"
    )
+    file_path_2 = os.path.join(
+        module_b_dir, "synthesizer_data", "txt_example.txt"
+    )
     synthesizer = Synthesizer()
+
     synthesizer.generate_goldens_from_docs(
-        document_paths=[file_path],
+        document_paths=[file_path_1, file_path_2],
         max_goldens_per_document=2,
     )
     synthesizer.save_as(file_type="json", directory="./results")
+
+test_synthesizer()
\ No newline at end of file
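
The caching flow this patch adds to _load_docs can be exercised on its own. The sketch below is a minimal, illustrative reproduction of that pattern, assuming only chromadb's PersistentClient API; the source path, chunk texts, embeddings, and similarity_to_mean values are placeholders rather than real DocumentChunker output.

import chromadb

# Illustrative stand-ins for DocumentChunker output (hypothetical values).
path = "data/txt_example.txt"
chunk_texts = ["first chunk of text ...", "second chunk of text ..."]
chunk_embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]

client = chromadb.PersistentClient(path=".vectordb")                  # DB_FOLDER in the patch
collection = client.get_or_create_collection(name="synth_vectordb")   # DB_COLLECTION_NAME

# First run: nothing is cached for this source file, so store the chunks,
# their embeddings, and per-chunk metadata keyed by source file.
if len(collection.get(where={"source": path})["ids"]) == 0:
    collection.add(
        documents=chunk_texts,
        embeddings=chunk_embeddings,
        metadatas=[{"source": path, "similarity_to_mean": 0.0} for _ in chunk_texts],
        ids=[path + str(i) for i in range(len(chunk_texts))],
    )

# Later runs: read the cached chunks back instead of re-embedding the document.
# Chroma's $or operator requires at least two clauses, which is why the patch
# uses a plain {"source": path} filter for a single path and $or for several.
cached = collection.get(
    where={"source": path},
    include=["embeddings", "documents", "metadatas"],
)
for doc, emb, meta in zip(cached["documents"], cached["embeddings"], cached["metadatas"]):
    print(meta["source"], len(emb), doc[:30])

Running the snippet twice against the same .vectordb folder exercises both branches: the first run populates the collection, the second only reads the cached chunks back.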