From 4f2efb98f91a5ea9408c3d550bb8b7b6df4fe373 Mon Sep 17 00:00:00 2001 From: Arslan Saleem Date: Tue, 18 Feb 2025 23:36:57 +0100 Subject: [PATCH] fix(open_embedding): use embedding for openai in batches as well (#53) * fix(open_embedding): use embedding for openai in batches as well * fix: test cases --- backend/app/processing/file_preprocessing.py | 4 ++- backend/app/processing/process_queue.py | 2 +- backend/app/vectorstore/chroma.py | 25 ++++++------------- .../tests/processing/test_process_queue.py | 6 +++-- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/backend/app/processing/file_preprocessing.py b/backend/app/processing/file_preprocessing.py index 0da0ee2..8207912 100644 --- a/backend/app/processing/file_preprocessing.py +++ b/backend/app/processing/file_preprocessing.py @@ -43,7 +43,8 @@ def process_segmentation(project_id: int, asset_id: int, asset_file_name: str): vectorstore.add_docs( docs=docs, - metadatas=metadatas + metadatas=metadatas, + batch_size=100 ) project_repository.update_asset_content_status( @@ -67,6 +68,7 @@ def preprocess_file(asset_id: int): # Get asset details from the database first with SessionLocal() as db: asset = project_repository.get_asset(db=db, asset_id=asset_id) + if asset is None: logger.error(f"Asset with id {asset_id} not found in the database") return diff --git a/backend/app/processing/process_queue.py b/backend/app/processing/process_queue.py index 98b4be5..fa1c3d1 100644 --- a/backend/app/processing/process_queue.py +++ b/backend/app/processing/process_queue.py @@ -406,4 +406,4 @@ def vectorize_extraction_process_step(project_id: int, process_step_id: int, fil ] # Add documents to vectorstore - vectorstore.add_docs(docs=docs, metadatas=metadatas) + vectorstore.add_docs(docs=docs, metadatas=metadatas, batch_size=100) diff --git a/backend/app/vectorstore/chroma.py b/backend/app/vectorstore/chroma.py index e46ea18..5288ce0 100644 --- a/backend/app/vectorstore/chroma.py +++ b/backend/app/vectorstore/chroma.py @@ -101,25 +101,16 @@ def add_docs( filename = metadatas[0].get('filename', 'unknown') logger.info(f"Adding {len(docs)} sentences to the vector store for file {filename}") - # If using OpenAI embeddings, add all documents at once - if self.settings.use_openai_embeddings and self.settings.openai_api_key: - logger.info("Using OpenAI embeddings") + # Batching the document processing + batch_size = batch_size or self._batch_size + + for i in range(0, len(docs), batch_size): + logger.info(f"Processing batch {i} to {i + batch_size}") self._docs_collection.add( - documents=list(docs), - metadatas=metadatas, - ids=ids, + documents=docs[i : i + batch_size], + metadatas=metadatas[i : i + batch_size], + ids=ids[i : i + batch_size], ) - else: - logger.info("Using default embedding function") - batch_size = batch_size or self._batch_size - - for i in range(0, len(docs), batch_size): - logger.info(f"Processing batch {i} to {i + batch_size}") - self._docs_collection.add( - documents=docs[i : i + batch_size], - metadatas=metadatas[i : i + batch_size], - ids=ids[i : i + batch_size], - ) return list(ids) diff --git a/backend/tests/processing/test_process_queue.py b/backend/tests/processing/test_process_queue.py index 4aa6e5e..4796ef5 100644 --- a/backend/tests/processing/test_process_queue.py +++ b/backend/tests/processing/test_process_queue.py @@ -215,7 +215,8 @@ def test_vectorize_extraction_process_step_single_reference(mock_chroma_db): # Assertions mock_vectorstore.add_docs.assert_called_once_with( docs=expected_docs, - metadatas=expected_metadatas + metadatas=expected_metadatas, + batch_size=100 ) @patch('app.processing.process_queue.ChromaDB') @@ -261,7 +262,8 @@ def test_vectorize_extraction_process_step_multiple_references_concatenation(moc # Assertions mock_vectorstore.add_docs.assert_called_once_with( docs=expected_docs, - metadatas=expected_metadatas + metadatas=expected_metadatas, + batch_size=100 ) @patch('app.processing.process_queue.ChromaDB') # Replace with the correct module path