fix: page_number for pdfs with pages not containing any text (#7271)

* update version
* ci: fix rest-api tests (#7256)
* fix: page_number for pdfs with pages not containing any text
* Revert "fix: page_number for pdfs with pages not containing any text"
  This reverts commit ae20bae.
* Reapply "fix: page_number for pdfs with pages not containing any text"
  This reverts commit 18fd55b.
* add test

---------

Co-authored-by: Vladimir Blagojevic <[email protected]>
Co-authored-by: Tobias Wochinger <[email protected]>
deepset-ai · Mar 4, 2024 · 7be8f1e · 7be8f1e
1 parent d03d5b5
commit 7be8f1e
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 0 deletions.
diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py
@@ -641,6 +641,9 @@ def _concatenate_units(
                     else:
                         num_page_breaks = len(processed_units)
                     cur_page += num_page_breaks
+            else:
+                if self.add_page_number and split_at == "\f":
+                    cur_page += 1
 
         return text_splits, splits_pages, splits_start_idxs
 

diff --git a/test/nodes/test_preprocessor.py b/test/nodes/test_preprocessor.py
@@ -265,6 +265,20 @@ def test_preprocess_page_split_and_split_overlap():
     assert output[1].meta["page"] == 2
 
 
+@pytest.mark.unit
+def test_preprocess_page_split_with_empty_pages():
+    doc = Document(
+        content="This is a document on page 1.\f\fThis is a document on page 3.\f\fThis is a document on page 5."
+    )
+    output = PreProcessor(
+        split_by="page", split_length=1, split_respect_sentence_boundary=False, split_overlap=0, add_page_number=True
+    ).run([doc])[0]["documents"]
+    assert len(output) == 3
+    assert output[0] == Document(content="This is a document on page 1.", meta={"_split_id": 0, "page": 1})
+    assert output[1] == Document(content="This is a document on page 3.", meta={"_split_id": 1, "page": 3})
+    assert output[2] == Document(content="This is a document on page 5.", meta={"_split_id": 2, "page": 5})
+
+
 @pytest.mark.unit
 def test_preprocess_tiktoken_token_split(mock_tiktoken_tokenizer):
     raw_docs = [