graft: fix invisible text appearing after strip_invisible_text

strip_invisible_text resets the text render mode on each `BT` (begin text) operator. However, the text state is not actually reset for each text object, only for each page. The PDF reference says:

> The text state operators can appear outside text objects, and the values they set
> are retained across text objects in a single content stream. Like other graphics
> state parameters, these parameters are initialized to their default values at the
> beginning of each page.
>
> -- https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf#page=397

With the current implementation, a text object is only deleted if it contains a `3 Tr` command (setting the text rendering mode to invisible). However, the rendering mode may be set once and then left unchanged for multiple text objects, or set outside of a text object. In that case only the first text object (which contains the `3 Tr` command) is removed. This not only leaves the other text objects in the PDF, but also makes them visible, since the text object that contained the `3 Tr` command is removed.

This PR updates `strip_invisible_text` to not reset the rendering mode for each text object and to keep track of the rendering mode when the graphics state is pushed/popped (`q`/`Q`).
ocrmypdf · Dec 11, 2024 · e8ea642 · e8ea642
1 parent 02d85ff
commit e8ea642
Show file tree

Hide file tree

Showing 2 changed files with 83 additions and 5 deletions.
diff --git a/src/ocrmypdf/_graft.py b/src/ocrmypdf/_graft.py
@@ -57,34 +57,43 @@ def _update_resources(
         fonts[font_key] = font
 
 
+
 def strip_invisible_text(pdf: Pdf, page: Page):
     stream = []
     in_text_obj = False
-    render_mode = 0
+    render_mode_stack = [0]
     text_objects = []
 
     for operands, operator in parse_content_stream(page, ''):
+        if operator == Operator('Tr'):
+            render_mode_stack[-1] = operands[0]
+
+        if operator == Operator('q'):
+            render_mode_stack.append(render_mode_stack[-1])
+
+        if operator == Operator('Q'):
+            render_mode_stack.pop()
+
         if not in_text_obj:
             if operator == Operator('BT'):
                 in_text_obj = True
-                render_mode = 0
                 text_objects.append((operands, operator))
             else:
                 stream.append((operands, operator))
         else:
-            if operator == Operator('Tr'):
-                render_mode = operands[0]
             text_objects.append((operands, operator))
             if operator == Operator('ET'):
                 in_text_obj = False
-                if render_mode != 3:
+                if render_mode_stack[-1] != 3:
                     stream.extend(text_objects)
                 text_objects.clear()
 
     content_stream = unparse_content_stream(stream)
     page.Contents = Stream(pdf, content_stream)
 
 
+
+
 class OcrGrafter:
     """Manages grafting text-only PDFs onto regular PDFs."""
 

diff --git a/tests/test_graft.py b/tests/test_graft.py
@@ -40,3 +40,72 @@ def test_links(resources, outpdf):
         p2 = pdf.pages[1]
         assert p1.Annots[0].A.D[0].objgen == p2.objgen
         assert p2.Annots[0].A.D[0].objgen == p1.objgen
+
+
+def test_strip_invisble_text():
+    pdf = pikepdf.Pdf.new()
+    print(pikepdf.parse_content_stream(pikepdf.Stream(pdf, b'3 Tr')))
+    page = pdf.add_blank_page()
+    visible_text = [
+        pikepdf.ContentStreamInstruction((), pikepdf.Operator('BT')),
+        pikepdf.ContentStreamInstruction(
+            (pikepdf.Name('/F0'), 12), pikepdf.Operator('Tf')
+        ),
+        pikepdf.ContentStreamInstruction((288, 720), pikepdf.Operator('Td')),
+        pikepdf.ContentStreamInstruction(
+            (pikepdf.String('visible'),), pikepdf.Operator('Tj')
+        ),
+        pikepdf.ContentStreamInstruction((), pikepdf.Operator('ET')),
+    ]
+    invisible_text = [
+        pikepdf.ContentStreamInstruction((), pikepdf.Operator('BT')),
+        pikepdf.ContentStreamInstruction(
+            (pikepdf.Name('/F0'), 12), pikepdf.Operator('Tf')
+        ),
+        pikepdf.ContentStreamInstruction((288, 720), pikepdf.Operator('Td')),
+        pikepdf.ContentStreamInstruction(
+            (pikepdf.String('invisible'),), pikepdf.Operator('Tj')
+        ),
+        pikepdf.ContentStreamInstruction((), pikepdf.Operator('ET')),
+    ]
+    invisible_text_setting_tr = [
+        pikepdf.ContentStreamInstruction((), pikepdf.Operator('BT')),
+        pikepdf.ContentStreamInstruction([3], pikepdf.Operator('Tr')),
+        pikepdf.ContentStreamInstruction(
+            (pikepdf.Name('/F0'), 12), pikepdf.Operator('Tf')
+        ),
+        pikepdf.ContentStreamInstruction((288, 720), pikepdf.Operator('Td')),
+        pikepdf.ContentStreamInstruction(
+            (pikepdf.String('invisible'),), pikepdf.Operator('Tj')
+        ),
+        pikepdf.ContentStreamInstruction((), pikepdf.Operator('ET')),
+    ]
+    stream = [
+        pikepdf.ContentStreamInstruction([], pikepdf.Operator('q')),
+        pikepdf.ContentStreamInstruction([3], pikepdf.Operator('Tr')),
+        *invisible_text,
+        pikepdf.ContentStreamInstruction([], pikepdf.Operator('Q')),
+        *visible_text,
+        *invisible_text_setting_tr,
+        *invisible_text,
+    ]
+    content_stream = pikepdf.unparse_content_stream(stream)
+    page.Contents = pikepdf.Stream(pdf, content_stream)
+
+    def count(string, page):
+        return len(
+            [
+                True
+                for operands, operator in pikepdf.parse_content_stream(page)
+                if operator == pikepdf.Operator('Tj')
+                and operands[0] == pikepdf.String(string)
+            ]
+        )
+
+    nr_visible_pre = count('visible', page)
+    ocrmypdf._graft.strip_invisible_text(pdf, page)
+    nr_visible_post = count('visible', page)
+    assert (
+        nr_visible_pre == nr_visible_post
+    ), 'Number of visible text elements did not change'
+    assert count('invisible', page) == 0, 'No invisible elems left'