Skip to content

Commit

Permalink
ENH: Add capability to filter text extraction by orientation (#1175)
Browse files Browse the repository at this point in the history
Closes #1071
  • Loading branch information
pubpub-zz authored Jul 30, 2022
1 parent 8c532a0 commit 8a27fa4
Show file tree
Hide file tree
Showing 2 changed files with 172 additions and 37 deletions.
130 changes: 94 additions & 36 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,7 @@ def _extract_text(
self,
obj: Any,
pdf: Any,
orientations: Tuple[int, ...] = (0, 90, 180, 270),
space_width: float = 200.0,
content_key: Optional[str] = PG.CONTENTS,
) -> str:
Expand All @@ -1117,6 +1118,9 @@ def _extract_text(
this function, as it will change if this function is made more
sophisticated.
:param Tuple[int, ...] orientations: list of orientations text_extraction will look for
default = (0, 90, 180, 270)
note: currently only 0(Up),90(turned Left), 180(upside Down), 270 (turned Right)
:param float space_width: force default space width
(if not extracted from font (default 200)
:param Optional[str] content_key: indicate the default key where to extract data
Expand Down Expand Up @@ -1195,7 +1199,7 @@ def current_spacewidth() -> float:
return _space_width / 1000.0

def process_operation(operator: bytes, operands: List) -> None:
nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap
nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap, orientations
check_crlf_space: bool = False
# Table 5.4 page 405
if operator == b"BT":
Expand Down Expand Up @@ -1301,34 +1305,37 @@ def process_operation(operator: bytes, operands: List) -> None:

elif operator == b"Tj":
check_crlf_space = True
if isinstance(operands[0], str):
text += operands[0]
else:
t: str = ""
tt: bytes = (
encode_pdfdocencoding(operands[0])
if isinstance(operands[0], str)
else operands[0]
)
if isinstance(cmap[0], str):
try:
t = tt.decode(
cmap[0], "surrogatepass"
) # apply str encoding
except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
t = tt.decode(
"utf-16-be" if cmap[0] == "charmap" else "charmap",
"surrogatepass",
) # apply str encoding
else: # apply dict encoding
t = "".join(
[
cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
for x in tt
]
m = mult(tm_matrix, cm_matrix)
o = orient(m)
if o in orientations:
if isinstance(operands[0], str):
text += operands[0]
else:
t: str = ""
tt: bytes = (
encode_pdfdocencoding(operands[0])
if isinstance(operands[0], str)
else operands[0]
)

text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
if isinstance(cmap[0], str):
try:
t = tt.decode(
cmap[0], "surrogatepass"
) # apply str encoding
except Exception: # the data does not match the expectation, we use the alternative ; text extraction may not be good
t = tt.decode(
"utf-16-be" if cmap[0] == "charmap" else "charmap",
"surrogatepass",
) # apply str encoding
else: # apply dict encoding
t = "".join(
[
cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
for x in tt
]
)

text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
else:
return None
if check_crlf_space:
Expand All @@ -1339,6 +1346,8 @@ def process_operation(operator: bytes, operands: List) -> None:
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
f = font_size * k
tm_prev = m
if o not in orientations:
return
try:
if o == 0:
if deltaY < -0.8 * f:
Expand Down Expand Up @@ -1418,7 +1427,7 @@ def process_operation(operator: bytes, operands: List) -> None:
xobj = resources_dict["/XObject"]
if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore
# output += text
text = self.extract_xform_text(xobj[operands[0]], space_width) # type: ignore
text = self.extract_xform_text(xobj[operands[0]], orientations, space_width) # type: ignore
output += text
except Exception:
warnings.warn(
Expand All @@ -1433,7 +1442,12 @@ def process_operation(operator: bytes, operands: List) -> None:
return output

def extract_text(
    self,
    *args: Any,
    Tj_sep: Optional[str] = None,
    TJ_sep: Optional[str] = None,
    orientations: Union[int, Tuple[int, ...]] = (0, 90, 180, 270),
    space_width: float = 200.0,
) -> str:
    """
    Extract the text of the page, optionally filtered by character orientation.

    Text drawing commands are handled in the order they are provided in the
    content stream. Do not rely on the order of text coming out of this
    function, as it will change if this function is made more sophisticated.

    Positional arguments are accepted in two layouts, selected by the type
    of the first one:

    * legacy (first argument is a ``str``):
      ``extract_text(Tj_sep, TJ_sep, orientations, space_width)``
    * current (first argument is an ``int`` or a ``tuple``):
      ``extract_text(orientations, space_width)``

    :param Tj_sep: deprecated, kept for compatibility. The value is not
        used by this method; supplying it only triggers a
        ``DeprecationWarning``.
    :param TJ_sep: deprecated, kept for compatibility. Same handling as
        ``Tj_sep``.
    :param orientations: (tuple of) orientations of the characters to keep
        (default: ``(0, 90, 180, 270)``). A single int is equivalent to a
        one-element tuple (``0 == (0,)``).
        note: currently only 0 (up), 90 (turned left), 180 (upside down)
        and 270 (turned right)
    :param space_width: force default space width
        (if not extracted from font) (default: 200)
    :return: The extracted text
    :raises TypeError: if a positional argument does not have the type
        expected at its position
    """
    if args:
        first = args[0]
        if isinstance(first, str):
            # Legacy layout: (Tj_sep, TJ_sep, orientations, space_width)
            Tj_sep = first
            if len(args) >= 2:
                if not isinstance(args[1], str):
                    raise TypeError(f"Invalid positional parameter {args[1]}")
                TJ_sep = args[1]
            if len(args) >= 3:
                if not isinstance(args[2], (tuple, int)):
                    raise TypeError(f"Invalid positional parameter {args[2]}")
                orientations = args[2]
            if len(args) >= 4:
                if not isinstance(args[3], (float, int)):
                    raise TypeError(f"Invalid positional parameter {args[3]}")
                space_width = args[3]
        elif isinstance(first, (tuple, int)):
            # Current layout: (orientations, space_width)
            orientations = first
            if len(args) >= 2:
                if not isinstance(args[1], (float, int)):
                    raise TypeError(f"Invalid positional parameter {args[1]}")
                space_width = args[1]
        else:
            raise TypeError(f"Invalid positional parameter {first}")

    if Tj_sep is not None or TJ_sep is not None:
        warnings.warn(
            "parameters Tj_sep, TJ_sep deprecated, and will be removed in PyPDF2 3.0.0.",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller's line
        )

    # Normalise a single orientation to a tuple so membership tests work.
    if isinstance(orientations, int):
        orientations = (orientations,)

    return self._extract_text(
        self, self.pdf, orientations, space_width, PG.CONTENTS
    )

def extract_xform_text(
    self,
    xform: EncodedStreamObject,
    orientations: Tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
) -> str:
    """
    Extract text from an XObject.

    :param xform: the XObject whose text is extracted
    :param orientations: orientations of the characters to keep
        (default: ``(0, 90, 180, 270)``, the same default as
        ``extract_text``)
    :param space_width: force default space width
        (if not extracted from font) (default: 200)
    :return: The extracted text
    """
    # content_key is None here (extract_text passes PG.CONTENTS instead).
    return self._extract_text(xform, self.pdf, orientations, space_width, None)

def extractText(
self, Tj_sep: str = "", TJ_sep: str = ""
Expand All @@ -1473,7 +1531,7 @@ def extractText(
Use :meth:`extract_text` instead.
"""
deprecate_with_replacement("extractText", "extract_text")
return self.extract_text(Tj_sep=Tj_sep, TJ_sep=TJ_sep)
return self.extract_text()

def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
"""
Expand Down
79 changes: 78 additions & 1 deletion tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
from io import BytesIO
from pathlib import Path
from re import findall

import pytest

Expand Down Expand Up @@ -44,7 +45,7 @@ def test_PdfReaderFileLoad():
with open(os.path.join(RESOURCE_ROOT, "crazyones.txt"), "rb") as pdftext_file:
pdftext = pdftext_file.read()

text = page.extract_text(Tj_sep="", TJ_sep="").encode("utf-8")
text = page.extract_text().encode("utf-8")

# Compare the text of the PDF to a known source
for expected_line, actual_line in zip(text.split(b"\n"), pdftext.split(b"\n")):
Expand Down Expand Up @@ -209,6 +210,82 @@ def test_extract_textbench(enable, url, pages, print_result=False):
pass


def test_orientations():
    """
    Check the argument handling and orientation filtering of extract_text.

    Covers the deprecated separator arguments (positional and keyword),
    rejection of positional arguments with an unexpected type, and the text
    returned for each requested orientation.
    """
    p = PdfReader(os.path.join(RESOURCE_ROOT, "test Orient.pdf")).pages[0]
    marker = r"\((.)\)"  # single character enclosed in parentheses

    # Supplying the legacy separators must emit a DeprecationWarning.
    # pytest.warns works whether or not warnings are escalated to errors.
    with pytest.warns(DeprecationWarning):
        p.extract_text("", "")
    with pytest.warns(DeprecationWarning):
        p.extract_text("", "", 0)
    with pytest.warns(DeprecationWarning):
        p.extract_text("", "", 0, 200)
    with pytest.warns(DeprecationWarning):
        p.extract_text(Tj_sep="", TJ_sep="")

    assert findall(marker, p.extract_text()) == ["T", "B", "L", "R"]

    # Positional arguments of the wrong type are rejected with TypeError.
    with pytest.raises(TypeError):
        p.extract_text(None)  # argument 1
    with pytest.raises(TypeError):
        p.extract_text("", 0)  # argument 2
    with pytest.raises(TypeError):
        p.extract_text("", "", None)  # argument 3
    with pytest.raises(TypeError):
        p.extract_text("", "", 0, "")  # argument 4
    with pytest.raises(TypeError):
        p.extract_text(0, "")  # argument 2, new syntax

    # New syntax: orientations first, optionally followed by space_width.
    p.extract_text(0, 0)
    p.extract_text(orientations=0)

    for req, rst in (
        (0, ["T"]),
        (90, ["L"]),
        (180, ["B"]),
        (270, ["R"]),
        ((0,), ["T"]),
        ((0, 180), ["T", "B"]),
        ((45,), []),
    ):
        assert (
            findall(marker, p.extract_text(req)) == rst
        ), f"extract_text({req}) => {rst}"


@pytest.mark.parametrize(
("base_path", "overlay_path"),
[
Expand Down

0 comments on commit 8a27fa4

Please sign in to comment.