k

onyx-dot-app · Mar 8, 2025 · c6b3191 · c6b3191
1 parent a201d2a
commit c6b3191
Show file tree

Hide file tree

Showing 27 changed files with 253 additions and 319 deletions.
diff --git a/backend/onyx/background/indexing/run_indexing.py b/backend/onyx/background/indexing/run_indexing.py
@@ -28,6 +28,7 @@
 from onyx.connectors.models import ConnectorFailure
 from onyx.connectors.models import Document
 from onyx.connectors.models import IndexAttemptMetadata
+from onyx.connectors.models import TextSection
 from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
 from onyx.db.connector_credential_pair import get_last_successful_attempt_time
 from onyx.db.connector_credential_pair import update_connector_credential_pair
@@ -154,14 +155,12 @@ def strip_null_characters(doc_batch: list[Document]) -> list[Document]:
             )
 
         for section in cleaned_doc.sections:
-            if section.link and "\x00" in section.link:
-                logger.warning(
-                    f"NUL characters found in document link for document: {cleaned_doc.id}"
-                )
+            if section.link is not None:
                 section.link = section.link.replace("\x00", "")
 
             # since text can be longer, just replace to avoid double scan
-            section.text = section.text.replace("\x00", "")
+            if isinstance(section, TextSection) and section.text is not None:
+                section.text = section.text.replace("\x00", "")
 
         cleaned_batch.append(cleaned_doc)
 
@@ -479,7 +478,11 @@ def _run_indexing(
 
                     doc_size = 0
                     for section in doc.sections:
-                        doc_size += len(section.text)
+                        if (
+                            isinstance(section, TextSection)
+                            and section.text is not None
+                        ):
+                            doc_size += len(section.text)
 
                     if doc_size > INDEXING_SIZE_WARNING_THRESHOLD:
                         logger.warning(

diff --git a/backend/onyx/connectors/airtable/airtable_connector.py b/backend/onyx/connectors/airtable/airtable_connector.py
@@ -384,7 +384,7 @@ def _process_record(
 
         return Document(
             id=f"airtable__{record_id}",
-            sections=sections,
+            sections=sections,  # type: ignore # TextSection is a subclass of Section
             source=DocumentSource.AIRTABLE,
             semantic_identifier=semantic_id,
             metadata=metadata,

diff --git a/backend/onyx/connectors/blob/connector.py b/backend/onyx/connectors/blob/connector.py
@@ -341,7 +341,12 @@ def validate_connector_settings(self) -> None:
                 print("Sections:")
                 for section in doc.sections:
                     print(f"  - Link: {section.link}")
-                    print(f"  - Text: {section.text[:100]}...")
+                    if isinstance(section, TextSection) and section.text is not None:
+                        print(f"  - Text: {section.text[:100]}...")
+                    elif (
+                        hasattr(section, "image_file_name") and section.image_file_name
+                    ):
+                        print(f"  - Image: {section.image_file_name}")
                 print("---")
             break
 

diff --git a/backend/onyx/connectors/confluence/connector.py b/backend/onyx/connectors/confluence/connector.py
@@ -242,7 +242,9 @@ def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
             )
 
             # Create the main section for the page content
-            sections = [TextSection(text=page_content, link=page_url)]
+            sections: list[TextSection | ImageSection] = [
+                TextSection(text=page_content, link=page_url)
+            ]
 
             # Process comments if available
             comment_text = self._get_comment_string_for_page_id(page_id)
@@ -260,6 +262,7 @@ def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
                 for attachment in attachments.get("results", []):
                     print("zattachment", attachment)
                     # Process each attachment
+                    result = None
                     try:
                         result = process_attachment(
                             self.confluence_client,
@@ -270,24 +273,23 @@ def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
                         print("error", e)
                     print("result", result)
 
-                    if result.text:
+                    if result and result.text:
                         print("result.text", result.text)
                         # Create a section for the attachment text
                         attachment_section = TextSection(
                             text=result.text,
                             link=f"{page_url}#attachment-{attachment['id']}",
                         )
                         sections.append(attachment_section)
-                    elif result.file_name:
+                    elif result and result.file_name:
                         print("result.file_name", result.file_name)
                         # Create an ImageSection for image attachments
                         image_section = ImageSection(
-                            text="",
                             link=f"{page_url}#attachment-{attachment['id']}",
                             image_file_name=result.file_name,
                         )
                         sections.append(image_section)
-                    elif result.error:
+                    elif result and result.error:
                         logger.warning(
                             f"Error processing attachment '{attachment.get('title')}': {result.error}"
                         )

diff --git a/backend/onyx/connectors/discord/connector.py b/backend/onyx/connectors/discord/connector.py
@@ -78,7 +78,7 @@ def _convert_message_to_document(
         semantic_identifier=semantic_identifier,
         doc_updated_at=message.edited_at,
         title=title,
-        sections=sections,
+        sections=sections,  # type: ignore # TextSection is a subclass of Section
         metadata=metadata,
     )
 

diff --git a/backend/onyx/connectors/discourse/connector.py b/backend/onyx/connectors/discourse/connector.py
@@ -129,7 +129,7 @@ def _get_doc_from_topic(self, topic_id: int) -> Document:
 
         doc = Document(
             id="_".join([DocumentSource.DISCOURSE.value, str(topic["id"])]),
-            sections=sections,
+            sections=sections,  # type: ignore # TextSection is a subclass of Section
             source=DocumentSource.DISCOURSE,
             semantic_identifier=topic["title"],
             doc_updated_at=time_str_to_utc(topic["last_posted_at"]),

diff --git a/backend/onyx/connectors/file/connector.py b/backend/onyx/connectors/file/connector.py
@@ -97,7 +97,7 @@ def _create_image_section(
     except Exception as e:
         logger.error(f"Failed to store image {display_name}: {e}")
         # Return an empty section with no file name
-        return ImageSection(text="", image_file_name=""), None
+        return ImageSection(image_file_name=""), None
 
 
 def _process_file(
@@ -232,7 +232,7 @@ def _process_file(
     )
 
     # Build sections: first the text as a single Section
-    sections = []
+    sections: list[TextSection | ImageSection] = []
     link_in_meta = metadata.get("link")
     if text_content.strip():
         sections.append(TextSection(link=link_in_meta, text=text_content.strip()))

diff --git a/backend/onyx/connectors/fireflies/connector.py b/backend/onyx/connectors/fireflies/connector.py
@@ -94,7 +94,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
 
     return Document(
         id=fireflies_id,
-        sections=sections,
+        sections=sections,  # type: ignore # TextSection is a subclass of Section
         source=DocumentSource.FIREFLIES,
         semantic_identifier=meeting_title,
         metadata={},

diff --git a/backend/onyx/connectors/gmail/connector.py b/backend/onyx/connectors/gmail/connector.py
@@ -28,8 +28,8 @@
 from onyx.connectors.interfaces import SlimConnector
 from onyx.connectors.models import BasicExpertInfo
 from onyx.connectors.models import Document
-from onyx.connectors.models import TextSection
 from onyx.connectors.models import SlimDocument
+from onyx.connectors.models import TextSection
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.utils.logger import setup_logger
 from onyx.utils.retry_wrapper import retry_builder
@@ -192,7 +192,7 @@ def thread_to_document(full_thread: Dict[str, Any]) -> Document | None:
     return Document(
         id=id,
         semantic_identifier=semantic_identifier,
-        sections=sections,
+        sections=sections,  # type: ignore # TextSection is a subclass of Section
         source=DocumentSource.GMAIL,
         # This is used to perform permission sync
         primary_owners=primary_owners,