Skip to content

Commit

Permalink
Remove timestamp optimization for full syncs
Browse files Browse the repository at this point in the history
  • Loading branch information
seanstory committed Nov 16, 2023
1 parent c827b4d commit d2d52d4
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 23 deletions.
18 changes: 2 additions & 16 deletions connectors/es/sink.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,22 +420,8 @@ async def get_docs(self, generator):
continue

if doc_id in existing_ids:
# pop out of existing_ids
ts = existing_ids.pop(doc_id)

# If the doc has a timestamp, we can use it to see if it has
# been modified. This reduces the bulk size a *lot*
#
# Some backends do not know how to do this, so it's optional.
# For these, we update the docs in any case.
if TIMESTAMP_FIELD in doc and ts == doc[TIMESTAMP_FIELD]:
# cancel the download
if (
self.content_extraction_enabled
and lazy_download is not None
):
await lazy_download(doit=False)
continue
# pop the doc out of existing_ids, so that it does not get deleted
existing_ids.pop(doc_id)

self.total_docs_updated += 1
else:
Expand Down
18 changes: 11 additions & 7 deletions tests/test_sink.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,14 +481,18 @@ async def setup_extractor(
total_downloads(0),
),
(
# doc 1 is present, data source also has doc 1 with the same timestamp -> nothing happens
# doc 1 is present, data source also has doc 1 with the same timestamp -> doc 1 is updated
[DOC_ONE],
[(DOC_ONE, None, "index")],
NO_FILTERING,
SYNC_RULES_ENABLED,
CONTENT_EXTRACTION_ENABLED,
[end_docs_operation()],
updated(0),
[
# update happens through overwriting
index_operation(DOC_ONE),
end_docs_operation(),
],
updated(1),
created(0),
deleted(0),
total_downloads(0),
Expand Down Expand Up @@ -584,17 +588,17 @@ async def setup_extractor(
total_downloads(1),
),
(
# doc 1 present, data source has doc 1 -> no lazy download if timestamps are the same for the docs
# doc 1 present, data source has doc 1 -> lazy download occurs
[DOC_ONE],
[(DOC_ONE, lazy_download_fake(DOC_ONE), "index")],
NO_FILTERING,
SYNC_RULES_ENABLED,
CONTENT_EXTRACTION_ENABLED,
[end_docs_operation()],
updated(0),
[index_operation(DOC_ONE), end_docs_operation()],
updated(1),
created(0),
deleted(0),
total_downloads(0),
total_downloads(1),
),
(
# doc 1 present, data source has doc 1 with different timestamp
Expand Down

0 comments on commit d2d52d4

Please sign in to comment.