Skip to content

Commit

Permalink
Merge pull request #689 from ParisNeo/main
Browse files Browse the repository at this point in the history
Use docling for enhanced files loading
  • Loading branch information
LarFii authored Feb 1, 2025
2 parents eafb822 + 3a40772 commit 75b5739
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 84 deletions.
105 changes: 26 additions & 79 deletions lightrag/api/lightrag_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ class DocumentManager:
def __init__(
self,
input_dir: str,
supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"),
supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx"),
):
self.input_dir = Path(input_dir)
self.supported_extensions = supported_extensions
Expand Down Expand Up @@ -973,38 +973,14 @@ async def index_file(file_path: Union[str, Path]) -> None:
async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
content = await f.read()

case ".pdf":
if not pm.is_installed("pypdf2"):
pm.install("pypdf2")
from PyPDF2 import PdfReader

# PDF handling
reader = PdfReader(str(file_path))
content = ""
for page in reader.pages:
content += page.extract_text() + "\n"

case ".docx":
if not pm.is_installed("python-docx"):
pm.install("python-docx")
from docx import Document

# Word document handling
doc = Document(file_path)
content = "\n".join([paragraph.text for paragraph in doc.paragraphs])

case ".pptx":
if not pm.is_installed("pptx"):
pm.install("pptx")
from pptx import Presentation # type: ignore

# PowerPoint handling
prs = Presentation(file_path)
content = ""
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
content += shape.text + "\n"
case ".pdf" | ".docx" | ".pptx" | ".xlsx":
if not pm.is_installed("docling"):
pm.install("docling")
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()

case _:
raise ValueError(f"Unsupported file format: {ext}")
Expand Down Expand Up @@ -1282,55 +1258,26 @@ async def insert_file(file: UploadFile = File(...), description: str = Form(None
text_content = await file.read()
content = text_content.decode("utf-8")

case ".pdf":
if not pm.is_installed("pypdf2"):
pm.install("pypdf2")
from PyPDF2 import PdfReader
from io import BytesIO
case ".pdf" | ".docx" | ".pptx" | ".xlsx":
if not pm.is_installed("docling"):
pm.install("docling")
from docling.document_converter import DocumentConverter

# Read PDF from memory
pdf_content = await file.read()
pdf_file = BytesIO(pdf_content)
reader = PdfReader(pdf_file)
content = ""
for page in reader.pages:
content += page.extract_text() + "\n"

case ".docx":
if not pm.is_installed("python-docx"):
pm.install("python-docx")
from docx import Document
from io import BytesIO

# Read DOCX from memory
docx_content = await file.read()
docx_file = BytesIO(docx_content)
doc = Document(docx_file)
content = "\n".join(
[paragraph.text for paragraph in doc.paragraphs]
)
# Create a temporary file to save the uploaded content
temp_path = Path("temp") / file.filename
temp_path.parent.mkdir(exist_ok=True)

case ".pptx":
if not pm.is_installed("pptx"):
pm.install("pptx")
from pptx import Presentation # type: ignore
from io import BytesIO
# Save the uploaded file
with temp_path.open("wb") as f:
f.write(await file.read())

# Read PPTX from memory
pptx_content = await file.read()
pptx_file = BytesIO(pptx_content)
prs = Presentation(pptx_file)
content = ""
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
content += shape.text + "\n"

case _:
raise HTTPException(
status_code=400,
detail=f"Unsupported file type. Supported types: {doc_manager.supported_extensions}",
)
try:
converter = DocumentConverter()
result = converter.convert(str(temp_path))
content = result.document.export_to_markdown()
finally:
# Clean up the temporary file
temp_path.unlink()

# Insert content into RAG system
if content:
Expand Down
8 changes: 3 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ accelerate
aiofiles
aiohttp
configparser

# File manipulation libraries
docling
graspologic

# database packages
Expand All @@ -11,12 +14,7 @@ networkx
numpy
pipmaster
pydantic

# File manipulation libraries
PyPDF2
python-docx
python-dotenv
python-pptx

setuptools
tenacity
Expand Down

0 comments on commit 75b5739

Please sign in to comment.