Skip to content

Commit

Permalink
fix: page_number for pdfs with pages not containing any text (#7271)
Browse files Browse the repository at this point in the history
* update version

* ci: fix rest-api tests (#7256)

* fix: page_number for pdfs with pages not containing any text

* Revert "fix: page_number for pdfs with pages not containing any text"

This reverts commit ae20bae.

* Reapply "fix: page_number for pdfs with pages not containing any text"

This reverts commit 18fd55b.

* add test

---------

Co-authored-by: Vladimir Blagojevic <[email protected]>
Co-authored-by: Tobias Wochinger <[email protected]>
  • Loading branch information
3 people authored Mar 4, 2024
1 parent d03d5b5 commit 7be8f1e
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
3 changes: 3 additions & 0 deletions haystack/nodes/preprocessor/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,9 @@ def _concatenate_units(
else:
num_page_breaks = len(processed_units)
cur_page += num_page_breaks
else:
if self.add_page_number and split_at == "\f":
cur_page += 1

return text_splits, splits_pages, splits_start_idxs

Expand Down
14 changes: 14 additions & 0 deletions test/nodes/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,20 @@ def test_preprocess_page_split_and_split_overlap():
assert output[1].meta["page"] == 2


@pytest.mark.unit
def test_preprocess_page_split_with_empty_pages():
    """Check page numbering when splitting by page a text that contains empty pages.

    The input places two consecutive form-feed characters between each pair of
    sentences, so pages 2 and 4 hold no text. The test expects one split per
    non-empty page, each tagged with the page number written in its sentence
    (1, 3 and 5).
    """
    # Pages 2 and 4 are empty: each pair of sentences is separated by two form feeds.
    source_doc = Document(
        content="This is a document on page 1.\f\fThis is a document on page 3.\f\fThis is a document on page 5."
    )
    splitter = PreProcessor(
        split_by="page", split_length=1, split_respect_sentence_boundary=False, split_overlap=0, add_page_number=True
    )
    output = splitter.run([source_doc])[0]["documents"]

    # One expected split per page that actually contains text.
    expected_splits = [
        Document(content="This is a document on page 1.", meta={"_split_id": 0, "page": 1}),
        Document(content="This is a document on page 3.", meta={"_split_id": 1, "page": 3}),
        Document(content="This is a document on page 5.", meta={"_split_id": 2, "page": 5}),
    ]

    # Check the count first so the pairwise comparison below cannot hide a missing split.
    assert len(output) == 3
    for produced, expected in zip(output, expected_splits):
        assert produced == expected


@pytest.mark.unit
def test_preprocess_tiktoken_token_split(mock_tiktoken_tokenizer):
raw_docs = [
Expand Down

0 comments on commit 7be8f1e

Please sign in to comment.