Commit
pablonyx committed Mar 8, 2025
1 parent c6b3191 commit a1ab2e4
Showing 7 changed files with 204 additions and 161 deletions.
2 changes: 2 additions & 0 deletions backend/onyx/connectors/confluence/utils.py
@@ -182,6 +182,8 @@ def _process_image_attachment(
media_type=media_type,
file_origin=FileOrigin.CONNECTOR,
)
print(f"Stored image attachment with file name: {file_name}")
logger.info(f"Stored image attachment with file name: {file_name}")

# Return empty text but include the file_name for later processing
return AttachmentProcessingResult(text="", file_name=file_name, error=None)
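For orientation, here is a minimal caller-side sketch (not code from this commit) of how an AttachmentProcessingResult with empty text and a stored file name could be mapped onto the section models changed below; the helper name and import paths are assumptions.

from onyx.connectors.confluence.utils import AttachmentProcessingResult
from onyx.connectors.models import ImageSection, TextSection


def result_to_section(
    result: AttachmentProcessingResult,
) -> ImageSection | TextSection | None:
    # Hypothetical helper, not part of this commit.
    if result.error is not None:
        # The attachment could not be processed; skip it.
        return None
    if result.file_name:
        # Image attachments return empty text plus the stored file name,
        # which becomes an ImageSection for later image processing.
        return ImageSection(image_file_name=result.file_name)
    return TextSection(text=result.text)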
34 changes: 31 additions & 3 deletions backend/onyx/connectors/models.py
@@ -42,14 +42,14 @@ class TextSection(Section):
"""Section containing text content"""

text: str
image_file_name: None = None
link: str | None = None


class ImageSection(Section):
"""Section containing an image reference"""

image_file_name: str
text: None = None
link: str | None = None


class BasicExpertInfo(BaseModel):
@@ -169,7 +169,9 @@ def get_metadata_str_attributes(self) -> list[str] | None:


class Document(DocumentBase):
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
"""Used for Onyx ingestion api, the ID is required"""

id: str
source: DocumentSource

def get_total_char_length(self) -> int:
@@ -207,6 +209,32 @@ def from_base(cls, base: DocumentBase) -> "Document":
)


class IndexingDocument(Document):
"""Document with processed sections for indexing"""

processed_sections: list[Section] = []

def get_total_char_length(self) -> int:
"""Get the total character length of the document including processed sections"""
title_len = len(self.title or self.semantic_identifier)

# Use processed_sections if available, otherwise fall back to original sections
if self.processed_sections:
section_len = sum(
len(section.text) if section.text is not None else 0
for section in self.processed_sections
)
else:
section_len = sum(
len(section.text)
if isinstance(section, TextSection) and section.text is not None
else 0
for section in self.sections
)

return title_len + section_len


class SlimDocument(BaseModel):
id: str
perm_sync_data: Any | None = None
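A minimal sketch of how the reworked section models compose, assuming only the fields visible in this diff (TextSection pins image_file_name to None, ImageSection pins text to None); the summation mirrors the processed_sections branch of IndexingDocument.get_total_char_length.

from onyx.connectors.models import ImageSection, TextSection

processed_sections = [
    TextSection(text="Release notes for Q1", link="https://example.com/page"),
    ImageSection(image_file_name="confluence_attachments/diagram.png"),
]

# ImageSection.text is None, so only text sections contribute characters.
section_len = sum(
    len(section.text) if section.text is not None else 0
    for section in processed_sections
)
print(section_len)  # 20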
45 changes: 0 additions & 45 deletions backend/onyx/connectors/vision_enabled_connector.py

This file was deleted.

4 changes: 4 additions & 0 deletions backend/onyx/db/pg_file_store.py
@@ -67,6 +67,9 @@ def read_lobj(
use_tempfile: bool = False,
) -> IO:
pg_conn = get_pg_conn_from_session(db_session)
# Ensure we're using binary mode by default for large objects
if mode is None:
mode = "rb"
large_object = (
pg_conn.lobject(lobj_oid, mode=mode) if mode else pg_conn.lobject(lobj_oid)
)
@@ -81,6 +84,7 @@ def read_lobj(
temp_file.seek(0)
return temp_file
else:
# Ensure we're getting raw bytes without text decoding
return BytesIO(large_object.read())


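A usage sketch for the binary-mode default added above. The parameter names (db_session, lobj_oid, mode) come from the surrounding code, but the exact signature of read_lobj and the SQLAlchemy Session type are assumptions, not confirmed by this diff.

from sqlalchemy.orm import Session

from onyx.db.pg_file_store import read_lobj


def load_stored_image(db_session: Session, lobj_oid: int) -> bytes:
    # Leaving mode unset lets the new "rb" default apply (assuming mode
    # defaults to None), so the returned IO contains raw, undecoded bytes.
    file_io = read_lobj(lobj_oid=lobj_oid, db_session=db_session)
    return file_io.read()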
68 changes: 44 additions & 24 deletions backend/onyx/indexing/chunker.py
@@ -9,8 +9,8 @@
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
get_metadata_keys_to_ignore,
)
from onyx.connectors.models import Document
from onyx.connectors.models import ImageSection
from onyx.connectors.models import IndexingDocument
from onyx.connectors.models import Section
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.models import DocAwareChunk
from onyx.natural_language_processing.utils import BaseTokenizer
@@ -195,7 +195,7 @@ def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None:
# ADDED: extra param image_url to store in the chunk
def _create_chunk(
self,
document: Document,
document: IndexingDocument,
chunks_list: list[DocAwareChunk],
text: str,
links: dict[int, str],
@@ -226,31 +226,46 @@ def _chunk_document(

def _chunk_document(
self,
document: Document,
document: IndexingDocument,
title_prefix: str,
metadata_suffix_semantic: str,
metadata_suffix_keyword: str,
content_token_limit: int,
) -> list[DocAwareChunk]:
"""
Loops through sections of the document, converting them into one or more chunks.
If a section has an image_link, we treat it as a dedicated chunk.
Legacy method for backward compatibility.
Calls _chunk_document_with_sections with document.processed_sections.
"""
return self._chunk_document_with_sections(
document,
document.processed_sections,
title_prefix,
metadata_suffix_semantic,
metadata_suffix_keyword,
content_token_limit,
)

def _chunk_document_with_sections(
self,
document: IndexingDocument,
sections: list[Section],
title_prefix: str,
metadata_suffix_semantic: str,
metadata_suffix_keyword: str,
content_token_limit: int,
) -> list[DocAwareChunk]:
"""
Loops through sections of the document, converting them into one or more chunks.
Works with processed sections that are base Section objects.
"""
chunks: list[DocAwareChunk] = []
link_offsets: dict[int, str] = {}
chunk_text = ""

for section_idx, section in enumerate(document.sections):
# Handle different section types
if isinstance(section, ImageSection):
# Skip ImageSection in chunking - these should be processed separately
continue

# Get section text - will be None for ImageSection
for section_idx, section in enumerate(sections):
# Get section text and other attributes
section_text = clean_text(section.text or "")
section_link_text = section.link or ""
# Get image file name if present
image_url = section.image_file_name

# If there is no useful content, skip
@@ -261,7 +276,7 @@ def _chunk_document(
)
continue

# CASE 1: If this is an image section, force a separate chunk
# CASE 1: If this section has an image, force a separate chunk
if image_url:
# First, if we have any partially built text chunk, finalize it
if chunk_text.strip():
@@ -278,15 +293,13 @@ def _chunk_document(
chunk_text = ""
link_offsets = {}

# Create a chunk specifically for this image
# (If the section has text describing the image, use that as content)
# Create a chunk specifically for this image section
# (Using the text summary that was generated during processing)
self._create_chunk(
document,
chunks,
section_text,
links={0: section_link_text}
if section_link_text
else {}, # No text offsets needed for images
links={0: section_link_text} if section_link_text else {},
image_file_name=image_url,
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
@@ -391,7 +404,9 @@ def _chunk_document(
)
return chunks

def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
def _handle_single_document(
self, document: IndexingDocument
) -> list[DocAwareChunk]:
# Specifically for reproducing an issue with gmail
if document.source == DocumentSource.GMAIL:
logger.debug(f"Chunking {document.semantic_identifier}")
@@ -427,9 +442,12 @@ def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
title_prefix = ""
metadata_suffix_semantic = ""

# Chunk the document
normal_chunks = self._chunk_document(
# Chunk the processed sections carried by the IndexingDocument
sections_to_chunk = document.processed_sections

normal_chunks = self._chunk_document_with_sections(
document,
sections_to_chunk,
title_prefix,
metadata_suffix_semantic,
metadata_suffix_keyword,
@@ -443,10 +461,12 @@ def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:

return normal_chunks

def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
def chunk(self, documents: list[IndexingDocument]) -> list[DocAwareChunk]:
"""
Takes in a list of documents and chunks them into smaller chunks for indexing
while persisting the document metadata.
Works with both standard Document objects and IndexingDocument objects with processed_sections.
"""
final_chunks: list[DocAwareChunk] = []
for document in documents:
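A sketch of the preprocessing step these chunker changes imply (not code from this commit): building an IndexingDocument whose processed_sections replace image sections with plain Sections carrying a generated text summary, so that _chunk_document_with_sections can emit one dedicated chunk per image. summarize_image is a placeholder, and the Section constructor arguments and model_dump() call are assumptions.

from onyx.connectors.models import Document, IndexingDocument, Section


def summarize_image(image_file_name: str) -> str:
    # Placeholder for whatever vision-model call produces the image summary.
    return f"[image: {image_file_name}]"


def to_indexing_document(doc: Document) -> IndexingDocument:
    processed: list[Section] = []
    for section in doc.sections:
        image_file_name = getattr(section, "image_file_name", None)
        if image_file_name:
            # Keep the image reference but attach a text summary so the
            # chunker has content to index for the image chunk.
            processed.append(
                Section(
                    text=summarize_image(image_file_name),
                    link=section.link,
                    image_file_name=image_file_name,
                )
            )
        else:
            processed.append(section)
    # model_dump() assumes pydantic v2; use .dict() on pydantic v1 models.
    return IndexingDocument(**doc.model_dump(), processed_sections=processed)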