🛠️ fix: Handle File Edge Cases (#27)
* fix(get_loader): add `autodetect_encoding=True` to prevent text file encoding issues

* chore: bump packages

* chore: add required packages for PDF_EXTRACT_IMAGES

* fix: clean pdf page_content (remove null bytes; see the sketch below)
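
A minimal sketch of the failure mode behind that last fix: PostgreSQL text columns cannot store NUL (0x00) bytes, so raw PDF-extracted text containing them is rejected at insert time. The sample string below is hypothetical; the helper mirrors the `clean_text` added in parsers.py.

```python
# Hypothetical PDF-extracted text containing a stray NUL byte; PostgreSQL
# text columns cannot store NUL, so drivers reject such strings on insert.
page_content = "Extracted PDF text\x00with a stray NUL"

def clean_text(text: str) -> str:
    """Remove NUL (0x00) characters (mirrors the helper added in parsers.py)."""
    return text.replace("\x00", "")

assert "\x00" not in clean_text(page_content)
```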
danny-avila authored Apr 22, 2024
1 parent cf92ecc commit a62ffd3
Showing 4 changed files with 51 additions and 18 deletions.
36 changes: 27 additions & 9 deletions main.py
@@ -12,8 +12,16 @@
 from fastapi.middleware.cors import CORSMiddleware
 from langchain_core.runnables.config import run_in_executor
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from fastapi import FastAPI, File, Form, UploadFile, HTTPException, status, Request
-from fastapi import Query
+from fastapi import (
+    FastAPI,
+    File,
+    Form,
+    Query,
+    UploadFile,
+    HTTPException,
+    status,
+    Request,
+)
 from langchain_community.document_loaders import (
     WebBaseLoader,
     TextLoader,
@@ -31,7 +39,7 @@
 from psql import PSQLDatabase, ensure_custom_id_index_on_embedding, pg_health_check
 from middleware import security_middleware
 from pgvector_routes import router as pgvector_router
-from parsers import process_documents
+from parsers import process_documents, clean_text
 from constants import ERROR_MESSAGES
 from store import AsyncPgVector

@@ -196,13 +204,21 @@ def generate_digest(page_content: str):


 async def store_data_in_vector_db(
-    data: Iterable[Document], file_id: str, user_id: str = ""
+    data: Iterable[Document],
+    file_id: str,
+    user_id: str = "",
+    clean_content: bool = False,
 ) -> bool:
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=app.state.CHUNK_SIZE, chunk_overlap=app.state.CHUNK_OVERLAP
     )
     documents = text_splitter.split_documents(data)

+    # If `clean_content` is True, clean the page_content of each document (remove null bytes)
+    if clean_content:
+        for doc in documents:
+            doc.page_content = clean_text(doc.page_content)
+
     # Preparing documents with page content and metadata for insertion.
     docs = [
         Document(
@@ -262,12 +278,12 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
     elif file_ext in known_source_ext or (
         file_content_type and file_content_type.find("text/") >= 0
     ):
-        loader = TextLoader(filepath)
+        loader = TextLoader(filepath, autodetect_encoding=True)
     else:
-        loader = TextLoader(filepath)
+        loader = TextLoader(filepath, autodetect_encoding=True)
         known_type = False

-    return loader, known_type
+    return loader, known_type, file_ext


 @app.post("/local/embed")
@@ -346,11 +362,13 @@ async def embed_file(
     )

     try:
-        loader, known_type = get_loader(
+        loader, known_type, file_ext = get_loader(
             file.filename, file.content_type, temp_file_path
         )
         data = loader.load()
-        result = await store_data_in_vector_db(data, file_id, user_id)
+        result = await store_data_in_vector_db(
+            data=data, file_id=file_id, user_id=user_id, clean_content=file_ext == "pdf"
+        )

         if not result:
             response_status = False
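
For illustration, a hedged sketch of what `autodetect_encoding=True` buys in the main.py changes above: a non-UTF-8 text file that would previously raise a decode error now loads, assuming the charset-detection dependency (`chardet`) is installed. The file name and contents are made up.

```python
from pathlib import Path
from langchain_community.document_loaders import TextLoader

# A latin-1 file whose accented characters are invalid UTF-8 bytes.
path = Path("latin1_sample.txt")  # hypothetical file name
path.write_bytes("café résumé".encode("latin-1"))

# Without autodetect_encoding the load fails with a UnicodeDecodeError;
# with it, TextLoader detects the charset and retries.
docs = TextLoader(str(path), autodetect_encoding=True).load()
print(docs[0].page_content)  # café résumé
```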
21 changes: 16 additions & 5 deletions parsers.py
@@ -2,28 +2,39 @@
 from langchain.schema import Document
 from config import CHUNK_OVERLAP


+def clean_text(text: str) -> str:
+    """
+    Remove NUL (0x00) characters from a string.
+    :param text: The original text with potential NUL characters.
+    :return: Cleaned text without NUL characters.
+    """
+    return text.replace("\x00", "")
+
+
 def process_documents(documents: List[Document]) -> str:
     processed_text = ""
     last_page: Optional[int] = None
     doc_basename = ""

     for doc in documents:
-        if 'source' in doc.metadata:
-            doc_basename = doc.metadata['source'].split('/')[-1]
+        if "source" in doc.metadata:
+            doc_basename = doc.metadata["source"].split("/")[-1]
             break

     processed_text += f"{doc_basename}\n"

     for doc in documents:
-        current_page = doc.metadata.get('page')
+        current_page = doc.metadata.get("page")
         if current_page and current_page != last_page:
             processed_text += f"\n# PAGE {doc.metadata['page']}\n\n"
             last_page = current_page

         new_content = doc.page_content
         if processed_text.endswith(new_content[:CHUNK_OVERLAP]):
             processed_text += new_content[CHUNK_OVERLAP:]
         else:
             processed_text += new_content

     return processed_text.strip()
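
A quick worked example of the overlap-stitching logic in `process_documents` (pre-existing code; this commit only reformats its quoting): when the next chunk starts with the `CHUNK_OVERLAP` characters that already end the accumulated text, only the non-overlapping tail is appended. The values below are illustrative.

```python
CHUNK_OVERLAP = 5  # illustrative; the real value comes from config

processed_text = "Hello world"
new_content = "world again"  # its first 5 chars duplicate the current tail

if processed_text.endswith(new_content[:CHUNK_OVERLAP]):
    processed_text += new_content[CHUNK_OVERLAP:]  # append only " again"
else:
    processed_text += new_content

assert processed_text == "Hello world again"
```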
6 changes: 4 additions & 2 deletions requirements.lite.txt
@@ -1,7 +1,7 @@
 langchain==0.1.12
-langchain_community==0.0.28
+langchain_community==0.0.34
 langchain_openai==0.0.8
-langchain_core==0.1.35
+langchain_core==0.1.45
 sqlalchemy==2.0.28
 python-dotenv==1.0.1
 fastapi==0.110.0
@@ -20,3 +20,5 @@ python-jose==3.3.0
 asyncpg==0.29.0
 python-multipart==0.0.9
 aiofiles==23.2.1
+rapidocr-onnxruntime==1.3.17
+opencv-python-headless==4.9.0.80
6 changes: 4 additions & 2 deletions requirements.txt
@@ -1,7 +1,7 @@
 langchain==0.1.12
-langchain_community==0.0.28
+langchain_community==0.0.34
 langchain_openai==0.0.8
-langchain_core==0.1.35
+langchain_core==0.1.45
 sqlalchemy==2.0.28
 python-dotenv==1.0.1
 fastapi==0.110.0
@@ -21,3 +21,5 @@ asyncpg==0.29.0
 python-multipart==0.0.9
 sentence_transformers==2.5.1
 aiofiles==23.2.1
+rapidocr-onnxruntime==1.3.17
+opencv-python-headless==4.9.0.80
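
The two new pins support PDF image extraction: when `PyPDFLoader` runs with `extract_images=True` (presumably gated by the PDF_EXTRACT_IMAGES setting this commit message mentions), langchain_community OCRs embedded images via rapidocr-onnxruntime, which in turn depends on OpenCV. A hedged usage sketch, with a placeholder file path:

```python
from langchain_community.document_loaders import PyPDFLoader

# extract_images=True routes embedded images through rapidocr-onnxruntime,
# so OCR'd text from figures lands in page_content alongside the page text.
loader = PyPDFLoader("example.pdf", extract_images=True)  # placeholder path
pages = loader.load()
```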
