Skip to content

Commit

Permalink
fix: linting and small bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
Coniferish committed Jan 18, 2025
1 parent 5fc2f7e commit c252fc1
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 23 deletions.
31 changes: 15 additions & 16 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1521,23 +1521,22 @@ def test_document_to_element_list_sets_category_depth_titles():

@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
("strategy", "origin"),
"strategy",
# fast: can't capture the "intentionally left blank page" page
# others: will ignore the actual blank page
[
(PartitionStrategy.FAST, {"pdfminer"}),
(PartitionStrategy.HI_RES, {"yolox", "pdfminer", "ocr_tesseract"}),
(PartitionStrategy.OCR_ONLY, {"ocr_tesseract"}),
PartitionStrategy.FAST,
PartitionStrategy.HI_RES,
PartitionStrategy.OCR_ONLY,
],
)
def test_partition_pdf_with_password(
file_mode,
strategy,
origin,
filename=example_doc_path("pdf/password.pdf"),
file_mode: str,
strategy: str,
filename: str = example_doc_path("pdf/password.pdf"),
):
# Test that partition_pdf can handle a password-protected PDF in each file mode
def _test(result):
def _test(result: list[Element]):
# validate that the result is a single element with the expected text
assert len(result) == 1
assert result[0].text == "File with password"
Expand All @@ -1551,10 +1550,10 @@ def _test(result):
_test(result)
else:
with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
result = pdf.partition_pdf(
file=spooled_temp_file, strategy=strategy, password="password"
)
_test(result)
with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
result = pdf.partition_pdf(
file=spooled_temp_file, strategy=strategy, password="password"
)
_test(result)
13 changes: 6 additions & 7 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ def extractable_elements(
languages: Optional[list[str]] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
password:Optional[str] = None,
password: Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
if isinstance(file, bytes):
Expand All @@ -386,7 +386,7 @@ def _partition_pdf_with_pdfminer(
languages: list[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
password:Optional[str] = None,
password: Optional[str] = None,
**kwargs: Any,
) -> list[list[Element]]:
"""Partitions a PDF using PDFMiner instead of using a layout model. Used for faster
Expand Down Expand Up @@ -445,7 +445,7 @@ def _process_pdfminer_pages(

for page_number, (page, page_layout) in enumerate(
open_pdfminer_pages_generator(fp, password=password),
start=starting_page_number,
start=starting_page_number,
):
width, height = page_layout.width, page_layout.height

Expand Down Expand Up @@ -567,7 +567,7 @@ def _partition_pdf_or_image_local(
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
pdf_hi_res_max_pages: Optional[int] = None,
password:Optional[str] = None,
password: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partition using package installed locally"""
Expand Down Expand Up @@ -703,7 +703,7 @@ def _partition_pdf_or_image_local(
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
password=password
password=password,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
Expand Down Expand Up @@ -879,8 +879,7 @@ def _partition_pdf_or_image_with_ocr(
elements.extend(page_elements)
else:
for page_number, image in enumerate(
convert_pdf_to_images(filename, file, password=password),
start=starting_page_number
convert_pdf_to_images(filename, file, password=password), start=starting_page_number
):
page_elements = _partition_pdf_or_image_with_ocr_from_image(
image=image,
Expand Down

0 comments on commit c252fc1

Please sign in to comment.