diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 15cc6d4839..a600de7f46 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1521,23 +1521,22 @@ def test_document_to_element_list_sets_category_depth_titles(): @pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"]) @pytest.mark.parametrize( - ("strategy", "origin"), + "strategy", # fast: can't capture the "intentionally left blank page" page # others: will ignore the actual blank page [ - (PartitionStrategy.FAST, {"pdfminer"}), - (PartitionStrategy.HI_RES, {"yolox", "pdfminer", "ocr_tesseract"}), - (PartitionStrategy.OCR_ONLY, {"ocr_tesseract"}), + PartitionStrategy.FAST, + PartitionStrategy.HI_RES, + PartitionStrategy.OCR_ONLY, ], ) def test_partition_pdf_with_password( - file_mode, - strategy, - origin, - filename=example_doc_path("pdf/password.pdf"), + file_mode: str, + strategy: str, + filename: str = example_doc_path("pdf/password.pdf"), ): # Test that the partition_pdf function can handle filename - def _test(result): + def _test(result: list[Element]): # validate that the result is a non-empty list of dicts assert len(result) == 1 assert result[0].text == "File with password" @@ -1551,10 +1550,10 @@ def _test(result): _test(result) else: with open(filename, "rb") as test_file: - spooled_temp_file = SpooledTemporaryFile() - spooled_temp_file.write(test_file.read()) - spooled_temp_file.seek(0) - result = pdf.partition_pdf( - file=spooled_temp_file, strategy=strategy, password="password" - ) - _test(result) + with SpooledTemporaryFile() as spooled_temp_file: + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + result = pdf.partition_pdf( + file=spooled_temp_file, strategy=strategy, password="password" + ) + _test(result) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 68eae727ec..4643b38c0a 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -364,7 +364,7 @@ def extractable_elements( languages: Optional[list[str]] = None, metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, - password:Optional[str] = None, + password: Optional[str] = None, **kwargs: Any, ) -> list[list[Element]]: if isinstance(file, bytes): @@ -386,7 +386,7 @@ def _partition_pdf_with_pdfminer( languages: list[str], metadata_last_modified: Optional[str], starting_page_number: int = 1, - password:Optional[str] = None, + password: Optional[str] = None, **kwargs: Any, ) -> list[list[Element]]: """Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster @@ -445,7 +445,7 @@ def _process_pdfminer_pages( for page_number, (page, page_layout) in enumerate( open_pdfminer_pages_generator(fp, password=password), - start=starting_page_number, + start=starting_page_number, ): width, height = page_layout.width, page_layout.height @@ -567,7 +567,7 @@ def _partition_pdf_or_image_local( extract_forms: bool = False, form_extraction_skip_tables: bool = True, pdf_hi_res_max_pages: Optional[int] = None, - password:Optional[str] = None, + password: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partition using package installed locally""" @@ -703,7 +703,7 @@ def _partition_pdf_or_image_local( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, - password=password + password=password, pdf_image_dpi=pdf_image_dpi, ocr_layout_dumper=ocr_layout_dumper, ) @@ -879,8 +879,7 @@ def _partition_pdf_or_image_with_ocr( elements.extend(page_elements) else: for page_number, image in enumerate( - convert_pdf_to_images(filename, file, password=password), - start=starting_page_number + convert_pdf_to_images(filename, file, password=password), start=starting_page_number ): page_elements = _partition_pdf_or_image_with_ocr_from_image( image=image,