Skip to content

Commit

Permalink
k
Browse files: browse the repository at this point in the history
  • Loading branch information
pablonyx committed Mar 8, 2025
1 parent a201d2a commit c6b3191
Show file tree
Hide file tree
Showing 27 changed files with 253 additions and 319 deletions.
15 changes: 9 additions & 6 deletions backend/onyx/background/indexing/run_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import IndexAttemptMetadata
from onyx.connectors.models import TextSection
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
from onyx.db.connector_credential_pair import get_last_successful_attempt_time
from onyx.db.connector_credential_pair import update_connector_credential_pair
Expand Down Expand Up @@ -154,14 +155,12 @@ def strip_null_characters(doc_batch: list[Document]) -> list[Document]:
)

for section in cleaned_doc.sections:
if section.link and "\x00" in section.link:
logger.warning(
f"NUL characters found in document link for document: {cleaned_doc.id}"
)
if section.link is not None:
section.link = section.link.replace("\x00", "")

# since text can be longer, just replace to avoid double scan
section.text = section.text.replace("\x00", "")
if isinstance(section, TextSection) and section.text is not None:
section.text = section.text.replace("\x00", "")

cleaned_batch.append(cleaned_doc)

Expand Down Expand Up @@ -479,7 +478,11 @@ def _run_indexing(

doc_size = 0
for section in doc.sections:
doc_size += len(section.text)
if (
isinstance(section, TextSection)
and section.text is not None
):
doc_size += len(section.text)

if doc_size > INDEXING_SIZE_WARNING_THRESHOLD:
logger.warning(
Expand Down
2 changes: 1 addition & 1 deletion backend/onyx/connectors/airtable/airtable_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def _process_record(

return Document(
id=f"airtable__{record_id}",
sections=sections,
sections=sections, # type: ignore # TextSection is a subclass of Section
source=DocumentSource.AIRTABLE,
semantic_identifier=semantic_id,
metadata=metadata,
Expand Down
7 changes: 6 additions & 1 deletion backend/onyx/connectors/blob/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,12 @@ def validate_connector_settings(self) -> None:
print("Sections:")
for section in doc.sections:
print(f" - Link: {section.link}")
print(f" - Text: {section.text[:100]}...")
if isinstance(section, TextSection) and section.text is not None:
print(f" - Text: {section.text[:100]}...")
elif (
hasattr(section, "image_file_name") and section.image_file_name
):
print(f" - Image: {section.image_file_name}")
print("---")
break

Expand Down
12 changes: 7 additions & 5 deletions backend/onyx/connectors/confluence/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,9 @@ def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
)

# Create the main section for the page content
sections = [TextSection(text=page_content, link=page_url)]
sections: list[TextSection | ImageSection] = [
TextSection(text=page_content, link=page_url)
]

# Process comments if available
comment_text = self._get_comment_string_for_page_id(page_id)
Expand All @@ -260,6 +262,7 @@ def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
for attachment in attachments.get("results", []):
print("zattachment", attachment)
# Process each attachment
result = None
try:
result = process_attachment(
self.confluence_client,
Expand All @@ -270,24 +273,23 @@ def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
print("error", e)
print("result", result)

if result.text:
if result and result.text:
print("result.text", result.text)
# Create a section for the attachment text
attachment_section = TextSection(
text=result.text,
link=f"{page_url}#attachment-{attachment['id']}",
)
sections.append(attachment_section)
elif result.file_name:
elif result and result.file_name:
print("result.file_name", result.file_name)
# Create an ImageSection for image attachments
image_section = ImageSection(
text="",
link=f"{page_url}#attachment-{attachment['id']}",
image_file_name=result.file_name,
)
sections.append(image_section)
elif result.error:
elif result and result.error:
logger.warning(
f"Error processing attachment '{attachment.get('title')}': {result.error}"
)
Expand Down
2 changes: 1 addition & 1 deletion backend/onyx/connectors/discord/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def _convert_message_to_document(
semantic_identifier=semantic_identifier,
doc_updated_at=message.edited_at,
title=title,
sections=sections,
sections=sections, # type: ignore # TextSection is a subclass of Section
metadata=metadata,
)

Expand Down
2 changes: 1 addition & 1 deletion backend/onyx/connectors/discourse/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def _get_doc_from_topic(self, topic_id: int) -> Document:

doc = Document(
id="_".join([DocumentSource.DISCOURSE.value, str(topic["id"])]),
sections=sections,
sections=sections, # type: ignore # TextSection is a subclass of Section
source=DocumentSource.DISCOURSE,
semantic_identifier=topic["title"],
doc_updated_at=time_str_to_utc(topic["last_posted_at"]),
Expand Down
4 changes: 2 additions & 2 deletions backend/onyx/connectors/file/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def _create_image_section(
except Exception as e:
logger.error(f"Failed to store image {display_name}: {e}")
# Return an empty section with no file name
return ImageSection(text="", image_file_name=""), None
return ImageSection(image_file_name=""), None


def _process_file(
Expand Down Expand Up @@ -232,7 +232,7 @@ def _process_file(
)

# Build sections: first the text as a single Section
sections = []
sections: list[TextSection | ImageSection] = []
link_in_meta = metadata.get("link")
if text_content.strip():
sections.append(TextSection(link=link_in_meta, text=text_content.strip()))
Expand Down
2 changes: 1 addition & 1 deletion backend/onyx/connectors/fireflies/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:

return Document(
id=fireflies_id,
sections=sections,
sections=sections, # type: ignore # TextSection is a subclass of Section
source=DocumentSource.FIREFLIES,
semantic_identifier=meeting_title,
metadata={},
Expand Down
4 changes: 2 additions & 2 deletions backend/onyx/connectors/gmail/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder
Expand Down Expand Up @@ -192,7 +192,7 @@ def thread_to_document(full_thread: Dict[str, Any]) -> Document | None:
return Document(
id=id,
semantic_identifier=semantic_identifier,
sections=sections,
sections=sections, # type: ignore # TextSection is a subclass of Section
source=DocumentSource.GMAIL,
# This is used to perform permission sync
primary_owners=primary_owners,
Expand Down
Loading

0 comments on commit c6b3191

Please sign in to comment.