Commit
pablonyx committed Mar 8, 2025
1 parent c6b3191 commit a1ab2e4
Showing 7 changed files with 204 additions and 161 deletions.
2 changes: 2 additions & 0 deletions backend/onyx/connectors/confluence/utils.py
@@ -182,6 +182,8 @@ def _process_image_attachment(
media_type=media_type,
file_origin=FileOrigin.CONNECTOR,
)
print(f"Stored image attachment with file name: {file_name}")
logger.info(f"Stored image attachment with file name: {file_name}")

# Return empty text but include the file_name for later processing
return AttachmentProcessingResult(text="", file_name=file_name, error=None)
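For orientation, here is a minimal caller-side sketch (not code from this commit) of how an AttachmentProcessingResult with empty text and a stored file name could be mapped onto the section models changed below; the helper name and import paths are assumptions.

from onyx.connectors.confluence.utils import AttachmentProcessingResult
from onyx.connectors.models import ImageSection, TextSection


def result_to_section(
    result: AttachmentProcessingResult,
) -> ImageSection | TextSection | None:
    # Hypothetical helper, not part of this commit.
    if result.error is not None:
        # The attachment could not be processed; skip it.
        return None
    if result.file_name:
        # Image attachments return empty text plus the stored file name,
        # which becomes an ImageSection for later image processing.
        return ImageSection(image_file_name=result.file_name)
    return TextSection(text=result.text)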
34 changes: 31 additions & 3 deletions backend/onyx/connectors/models.py
@@ -42,14 +42,14 @@ class TextSection(Section):
"""Section containing text content"""

text: str
image_file_name: None = None
link: str | None = None


class ImageSection(Section):
"""Section containing an image reference"""

image_file_name: str
text: None = None
link: str | None = None


class BasicExpertInfo(BaseModel):
@@ -169,7 +169,9 @@ def get_metadata_str_attributes(self) -> list[str] | None:


class Document(DocumentBase):
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
"""Used for Onyx ingestion api, the ID is required"""

id: str
source: DocumentSource

def get_total_char_length(self) -> int:
@@ -207,6 +209,32 @@ def from_base(cls, base: DocumentBase) -> "Document":
)


class IndexingDocument(Document):
"""Document with processed sections for indexing"""

processed_sections: list[Section] = []

def get_total_char_length(self) -> int:
"""Get the total character length of the document including processed sections"""
title_len = len(self.title or self.semantic_identifier)

# Use processed_sections if available, otherwise fall back to original sections
if self.processed_sections:
section_len = sum(
len(section.text) if section.text is not None else 0
for section in self.processed_sections
)
else:
section_len = sum(
len(section.text)
if isinstance(section, TextSection) and section.text is not None
else 0
for section in self.sections
)

return title_len + section_len


class SlimDocument(BaseModel):
id: str
perm_sync_data: Any | None = None
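A minimal sketch of how the reworked section models compose, assuming only the fields visible in this diff (TextSection pins image_file_name to None, ImageSection pins text to None); the summation mirrors the processed_sections branch of IndexingDocument.get_total_char_length.

from onyx.connectors.models import ImageSection, TextSection

processed_sections = [
    TextSection(text="Release notes for Q1", link="https://example.com/page"),
    ImageSection(image_file_name="confluence_attachments/diagram.png"),
]

# ImageSection.text is None, so only text sections contribute characters.
section_len = sum(
    len(section.text) if section.text is not None else 0
    for section in processed_sections
)
print(section_len)  # 20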
45 changes: 0 additions & 45 deletions backend/onyx/connectors/vision_enabled_connector.py

This file was deleted.

4 changes: 4 additions & 0 deletions backend/onyx/db/pg_file_store.py
@@ -67,6 +67,9 @@ def read_lobj(
use_tempfile: bool = False,
) -> IO:
pg_conn = get_pg_conn_from_session(db_session)
# Ensure we're using binary mode by default for large objects
if mode is None:
mode = "rb"
large_object = (
pg_conn.lobject(lobj_oid, mode=mode) if mode else pg_conn.lobject(lobj_oid)
)
@@ -81,6 +84,7 @@ def read_lobj(
temp_file.seek(0)
return temp_file
else:
# Ensure we're getting raw bytes without text decoding
return BytesIO(large_object.read())


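A usage sketch for the binary-mode default added above. The parameter names (db_session, lobj_oid, mode) come from the surrounding code, but the exact signature of read_lobj and the SQLAlchemy Session type are assumptions, not confirmed by this diff.

from sqlalchemy.orm import Session

from onyx.db.pg_file_store import read_lobj


def load_stored_image(db_session: Session, lobj_oid: int) -> bytes:
    # Leaving mode unset lets the new "rb" default apply (assuming mode
    # defaults to None), so the returned IO contains raw, undecoded bytes.
    file_io = read_lobj(lobj_oid=lobj_oid, db_session=db_session)
    return file_io.read()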
68 changes: 44 additions & 24 deletions backend/onyx/indexing/chunker.py
@@ -9,8 +9,8 @@
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
get_metadata_keys_to_ignore,
)
from onyx.connectors.models import Document
from onyx.connectors.models import ImageSection
from onyx.connectors.models import IndexingDocument
from onyx.connectors.models import Section
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.models import DocAwareChunk
from onyx.natural_language_processing.utils import BaseTokenizer
@@ -195,7 +195,7 @@ def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None:
# ADDED: extra param image_url to store in the chunk
def _create_chunk(
self,
document: Document,
document: IndexingDocument,
chunks_list: list[DocAwareChunk],
text: str,
links: dict[int, str],
@@ -226,31 +226,46 @@ def _chunk_document(

def _chunk_document(
self,
document: Document,
document: IndexingDocument,
title_prefix: str,
metadata_suffix_semantic: str,
metadata_suffix_keyword: str,
content_token_limit: int,
) -> list[DocAwareChunk]:
"""
Loops through sections of the document, converting them into one or more chunks.
If a section has an image_link, we treat it as a dedicated chunk.
Legacy method for backward compatibility.
Calls _chunk_document_with_sections with document.processed_sections.
"""
return self._chunk_document_with_sections(
document,
document.processed_sections,
title_prefix,
metadata_suffix_semantic,
metadata_suffix_keyword,
content_token_limit,
)

def _chunk_document_with_sections(
self,
document: IndexingDocument,
sections: list[Section],
title_prefix: str,
metadata_suffix_semantic: str,
metadata_suffix_keyword: str,
content_token_limit: int,
) -> list[DocAwareChunk]:
"""
Loops through sections of the document, converting them into one or more chunks.
Works with processed sections that are base Section objects.
"""
chunks: list[DocAwareChunk] = []
link_offsets: dict[int, str] = {}
chunk_text = ""

for section_idx, section in enumerate(document.sections):
# Handle different section types
if isinstance(section, ImageSection):
# Skip ImageSection in chunking - these should be processed separately
continue

# Get section text - will be None for ImageSection
for section_idx, section in enumerate(sections):
# Get section text and other attributes
section_text = clean_text(section.text or "")
section_link_text = section.link or ""
# Get image file name if present
image_url = section.image_file_name

# If there is no useful content, skip
@@ -261,7 +276,7 @@ def _chunk_document(
)
continue

# CASE 1: If this is an image section, force a separate chunk
# CASE 1: If this section has an image, force a separate chunk
if image_url:
# First, if we have any partially built text chunk, finalize it
if chunk_text.strip():
@@ -278,15 +293,13 @@ def _chunk_document(
chunk_text = ""
link_offsets = {}

# Create a chunk specifically for this image
# (If the section has text describing the image, use that as content)
# Create a chunk specifically for this image section
# (Using the text summary that was generated during processing)
self._create_chunk(
document,
chunks,
section_text,
links={0: section_link_text}
if section_link_text
else {}, # No text offsets needed for images
links={0: section_link_text} if section_link_text else {},
image_file_name=image_url,
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
@@ -391,7 +404,9 @@ def _chunk_document(
)
return chunks

def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
def _handle_single_document(
self, document: IndexingDocument
) -> list[DocAwareChunk]:
# Specifically for reproducing an issue with gmail
if document.source == DocumentSource.GMAIL:
logger.debug(f"Chunking {document.semantic_identifier}")
@@ -427,9 +442,12 @@ def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
title_prefix = ""
metadata_suffix_semantic = ""

# Chunk the document
normal_chunks = self._chunk_document(
# Chunk the processed sections carried by the IndexingDocument
sections_to_chunk = document.processed_sections

normal_chunks = self._chunk_document_with_sections(
document,
sections_to_chunk,
title_prefix,
metadata_suffix_semantic,
metadata_suffix_keyword,
@@ -443,10 +461,12 @@ def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:

return normal_chunks

def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
def chunk(self, documents: list[IndexingDocument]) -> list[DocAwareChunk]:
"""
Takes in a list of documents and chunks them into smaller chunks for indexing
while persisting the document metadata.
Works with both standard Document objects and IndexingDocument objects with processed_sections.
"""
final_chunks: list[DocAwareChunk] = []
for document in documents:
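A sketch of the preprocessing step these chunker changes imply (not code from this commit): building an IndexingDocument whose processed_sections replace image sections with plain Sections carrying a generated text summary, so that _chunk_document_with_sections can emit one dedicated chunk per image. summarize_image is a placeholder, and the Section constructor arguments and model_dump() call are assumptions.

from onyx.connectors.models import Document, IndexingDocument, Section


def summarize_image(image_file_name: str) -> str:
    # Placeholder for whatever vision-model call produces the image summary.
    return f"[image: {image_file_name}]"


def to_indexing_document(doc: Document) -> IndexingDocument:
    processed: list[Section] = []
    for section in doc.sections:
        image_file_name = getattr(section, "image_file_name", None)
        if image_file_name:
            # Keep the image reference but attach a text summary so the
            # chunker has content to index for the image chunk.
            processed.append(
                Section(
                    text=summarize_image(image_file_name),
                    link=section.link,
                    image_file_name=image_file_name,
                )
            )
        else:
            processed.append(section)
    # model_dump() assumes pydantic v2; use .dict() on pydantic v1 models.
    return IndexingDocument(**doc.model_dump(), processed_sections=processed)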