diff --git a/CHANGELOG.md b/CHANGELOG.md index dca55238..99713c33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.0.72 + +* Fix certain filetypes failing mimetype lookup in the new base image + ## 0.0.71 * replace rockylinux with chainguard/wolfi as a base image for `amd64` diff --git a/prepline_general/api/app.py b/prepline_general/api/app.py index b566cf6d..81b465d2 100644 --- a/prepline_general/api/app.py +++ b/prepline_general/api/app.py @@ -13,7 +13,7 @@ app = FastAPI( title="Unstructured Pipeline API", summary="Partition documents with the Unstructured library", - version="0.0.71", + version="0.0.72", docs_url="/general/docs", openapi_url="/general/openapi.json", servers=[ diff --git a/prepline_general/api/filetypes.py b/prepline_general/api/filetypes.py new file mode 100644 index 00000000..8f0b3510 --- /dev/null +++ b/prepline_general/api/filetypes.py @@ -0,0 +1,107 @@ +import mimetypes +import os +from fastapi import UploadFile, HTTPException +from typing import Optional + +DEFAULT_MIMETYPES = ( + "application/pdf,application/msword,image/jpeg,image/png,text/markdown," + "text/x-markdown,text/html," + "application/vnd.openxmlformats-officedocument.wordprocessingml.document," + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet," + "application/vnd.ms-excel,application/vnd.openxmlformats-officedocument." + "presentationml.presentation," + "application/json," + "application/vnd.ms-powerpoint," + "text/html,message/rfc822,text/plain,image/png," + "application/epub,application/epub+zip," + "application/rtf,text/rtf," + "application/vnd.oasis.opendocument.text," + "text/csv,text/x-csv,application/csv,application/x-csv," + "text/comma-separated-values,text/x-comma-separated-values," + "application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst," + "text/tsv,text/tab-separated-values," + "application/x-ole-storage,application/vnd.ms-outlook," + "application/yaml," + "application/x-yaml," + "text/x-yaml," + "text/yaml," + "image/bmp," + "image/heic," + "image/tiff," + "text/org," +) + +if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None): + os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES + + +def _load_mimetypes() -> None: + """Call this on startup to ensure that all expected file extensions are present in the mimetypes + lib""" + expected_mimetypes = [ + (".bmp", "image/bmp"), + (".csv", "application/csv"), + (".doc", "application/msword"), + (".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), + (".eml", "message/rfc822"), + (".epub", "application/epub"), + (".gz", "application/gzip"), + (".heic", "image/heic"), + (".html", "text/html"), + (".jpeg", "image/jpeg"), + (".jpg", "image/jpeg"), + (".json", "application/json"), + (".md", "text/markdown"), + (".msg", "application/x-ole-storage"), + (".odt", "application/vnd.oasis.opendocument.text"), + (".org", "text/org"), + (".pdf", "application/pdf"), + (".png", "image/png"), + (".ppt", "application/vnd.ms-powerpoint"), + (".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"), + (".rst", "text/prs.fallenstein.rst"), + (".rtf", "application/rtf"), + (".tiff", "image/tiff"), + (".tsv", "text/tab-separated-values"), + (".txt", "text/plain"), + (".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), + (".xml", "text/xml"), + ] + + for extension, mimetype in expected_mimetypes: + mimetypes.add_type(mimetype, extension) + + +_load_mimetypes() + + +def get_validated_mimetype(file: UploadFile) -> Optional[str]: + """The MIME-type of `file`. + + The mimetype is computed based on `file.content_type`, or the mimetypes lib if that's too + generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and + return HTTP 400 for an invalid type. + """ + content_type = file.content_type + filename = str(file.filename) # -- "None" when file.filename is None -- + if not content_type or content_type == "application/octet-stream": + content_type = mimetypes.guess_type(filename)[0] + + # Some filetypes missing for this library, just hardcode them for now + if not content_type: + if filename.endswith(".md"): + content_type = "text/markdown" + elif filename.endswith(".msg"): + content_type = "message/rfc822" + + allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") + if allowed_mimetypes_str is not None: + allowed_mimetypes = allowed_mimetypes_str.split(",") + + if content_type not in allowed_mimetypes: + raise HTTPException( + status_code=400, + detail=(f"File type {content_type} is not supported."), + ) + + return content_type diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index a02f8f41..cff47142 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -34,6 +34,7 @@ from starlette.types import Send from prepline_general.api.models.form_params import GeneralFormParams +from prepline_general.api.filetypes import get_validated_mimetype from unstructured.documents.elements import Element from unstructured.partition.auto import partition from unstructured.staging.base import ( @@ -59,37 +60,6 @@ def is_compatible_response_type(media_type: str, response_type: type) -> bool: logger = logging.getLogger("unstructured_api") -DEFAULT_MIMETYPES = ( - "application/pdf,application/msword,image/jpeg,image/png,text/markdown," - "text/x-markdown,text/html," - "application/vnd.openxmlformats-officedocument.wordprocessingml.document," - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet," - "application/vnd.ms-excel,application/vnd.openxmlformats-officedocument." - "presentationml.presentation," - "application/json," - "application/vnd.ms-powerpoint," - "text/html,message/rfc822,text/plain,image/png," - "application/epub,application/epub+zip," - "application/rtf,text/rtf," - "application/vnd.oasis.opendocument.text," - "text/csv,text/x-csv,application/csv,application/x-csv," - "text/comma-separated-values,text/x-comma-separated-values," - "application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst," - "text/tsv,text/tab-separated-values," - "application/x-ole-storage,application/vnd.ms-outlook," - "application/yaml," - "application/x-yaml," - "text/x-yaml," - "text/yaml," - "image/bmp," - "image/heic," - "image/tiff," - "text/org," -) - -if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None): - os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES - def get_pdf_splits(pdf_pages: Sequence[PageObject], split_size: int = 1): """Given a pdf (PdfReader) with n pages, split it into pdfs each with split_size # of pages. @@ -609,38 +579,6 @@ def _set_pdf_infer_table_structure( return strategy in ("hi_res", "auto") and pdf_infer_table_structure -def get_validated_mimetype(file: UploadFile) -> Optional[str]: - """The MIME-type of `file`. - - The mimetype is computed based on `file.content_type`, or the mimetypes lib if that's too - generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and - return HTTP 400 for an invalid type. - """ - content_type = file.content_type - filename = str(file.filename) # -- "None" when file.filename is None -- - if not content_type or content_type == "application/octet-stream": - content_type = mimetypes.guess_type(filename)[0] - - # Some filetypes missing for this library, just hardcode them for now - if not content_type: - if filename.endswith(".md"): - content_type = "text/markdown" - elif filename.endswith(".msg"): - content_type = "message/rfc822" - - allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") - if allowed_mimetypes_str is not None: - allowed_mimetypes = allowed_mimetypes_str.split(",") - - if content_type not in allowed_mimetypes: - raise HTTPException( - status_code=400, - detail=(f"File type {content_type} is not supported."), - ) - - return content_type - - class MultipartMixedResponse(StreamingResponse): CRLF = b"\r\n" @@ -713,7 +651,7 @@ def return_content_type(filename: str): @router.get("/general/v0/general", include_in_schema=False) -@router.get("/general/v0.0.71/general", include_in_schema=False) +@router.get("/general/v0.0.72/general", include_in_schema=False) async def handle_invalid_get_request(): raise HTTPException( status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported." @@ -728,7 +666,7 @@ async def handle_invalid_get_request(): description="Description", operation_id="partition_parameters", ) -@router.post("/general/v0.0.71/general", include_in_schema=False) +@router.post("/general/v0.0.72/general", include_in_schema=False) def general_partition( request: Request, # cannot use annotated type here because of a bug described here: diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml index f09f9458..5156b2ae 100644 --- a/preprocessing-pipeline-family.yaml +++ b/preprocessing-pipeline-family.yaml @@ -1,2 +1,2 @@ name: general -version: 0.0.71 +version: 0.0.72 diff --git a/sample-docs/DA-1p.bmp b/sample-docs/DA-1p.bmp new file mode 100644 index 00000000..b018300f Binary files /dev/null and b/sample-docs/DA-1p.bmp differ diff --git a/sample-docs/DA-1p.heic b/sample-docs/DA-1p.heic new file mode 100644 index 00000000..908c9a6e Binary files /dev/null and b/sample-docs/DA-1p.heic differ diff --git a/sample-docs/layout-parser-paper-fast.tiff b/sample-docs/layout-parser-paper-fast.tiff new file mode 100644 index 00000000..b4b21a24 Binary files /dev/null and b/sample-docs/layout-parser-paper-fast.tiff differ diff --git a/scripts/smoketest.py b/scripts/smoketest.py index 1b150ed8..37bb4c61 100644 --- a/scripts/smoketest.py +++ b/scripts/smoketest.py @@ -49,72 +49,96 @@ def send_document( @pytest.mark.parametrize( - "example_filename, content_type", + ("extension", "example_filename", "content_type"), [ - # Note(yuming): Please sort filetypes alphabetically according to - # https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/auto.py#L14 - ("stanley-cups.csv", "application/csv"), - ("fake.doc", "application/msword"), - ("fake.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), - ("alert.eml", "message/rfc822"), - ("announcement.eml", "message/rfc822"), - ("fake-email-attachment.eml", "message/rfc822"), - ("fake-email-image-embedded.eml", "message/rfc822"), - ("fake-email.eml", "message/rfc822"), - ("family-day.eml", "message/rfc822"), - ("winter-sports.epub", "application/epub"), - ("fake-html.html", "text/html"), - pytest.param( - "layout-parser-paper-fast.jpg", - "image/jpeg", - marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"), - ), - ("spring-weather.html.json", "application/json"), - ("README.md", "text/markdown"), - ("fake-email.msg", "application/x-ole-storage"), - ("fake.odt", "application/vnd.oasis.opendocument.text"), - # Note(austin) The two inference calls will hang on mac with unsupported hardware error - # Skip these with SKIP_INFERENCE_TESTS=true make docker-test - pytest.param( - "layout-parser-paper.pdf.gz", - "application/gzip", - marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"), - ), - pytest.param( - "layout-parser-paper.pdf", - "application/pdf", - marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"), + (".bmp", "DA-1p.bmp", "image/bmp"), + (".csv", "stanley-cups.csv", "application/csv"), + (".doc", "fake.doc", "application/msword"), + ( + ".docx", + "fake.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ), - ("fake-power-point.ppt", "application/vnd.ms-powerpoint"), + (".eml", "fake-email-attachment.eml", "message/rfc822"), + (".epub", "winter-sports.epub", "application/epub"), + (".heic", "DA-1p.heic", "image/heic"), + (".html", "fake-html.html", "text/html"), + (".jpeg", "layout-parser-paper-fast.jpg", "image/jpeg"), + (".md", "README.md", "text/markdown"), + (".msg", "fake-email.msg", "application/x-ole-storage"), + (".odt", "fake.odt", "application/vnd.oasis.opendocument.text"), + (".pdf", "layout-parser-paper.pdf", "application/pdf"), + (".png", "english-and-korean.png", "image/png"), + (".ppt", "fake-power-point.ppt", "application/vnd.ms-powerpoint"), ( + ".pptx", "fake-power-point.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", ), - ("README.rst", "text/prs.fallenstein.rst"), - ("fake-doc.rtf", "application/rtf"), - ("fake-text.txt", "text/plain"), - ("stanley-cups.tsv", "text/tab-separated-values"), + (".rst", "README.rst", "text/prs.fallenstein.rst"), + (".rtf", "fake-doc.rtf", "application/rtf"), + (".tiff", "layout-parser-paper-fast.tiff", "image/tiff"), + (".tsv", "stanley-cups.tsv", "text/tab-separated-values"), + (".txt", "fake-text.txt", "text/plain"), ( + ".xlsx", "stanley-cups.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ), - ("fake-xml.xml", "text/xml"), + (".xml", "fake-xml.xml", "text/xml"), + (".json", "spring-weather.html.json", "application/json"), + ( + ".gz", + "layout-parser-paper.pdf.gz", + "application/gzip", + ), ], ) -def test_happy_path(example_filename: str, content_type: str): +def test_happy_path_all_types(extension, example_filename: str, content_type: str): """ For the files in sample-docs, verify that we get a 200 and some structured response """ + # The auto strategy will run ocr on these files + # This doesn't always work on our macs + if skip_inference_tests and extension in [ + ".bmp", + ".heic", + ".jpeg", + ".pdf", + ".png", + ".tiff", + ".gz", # Since we're using a gzipped pdf... + ]: + pytest.skip("emulated hardware") + test_file = str(Path("sample-docs") / example_filename) - print(f"sending {content_type}") - json_response = send_document(filenames=[test_file], content_type=content_type) - assert json_response.status_code == 200 - assert len(json_response.json()) > 0 - assert len("".join(elem["text"] for elem in json_response.json())) > 20 + # Verify we can send with explicit content type + response = send_document(filenames=[test_file], content_type=content_type) + + if response.status_code != 200: + assert False, response.text + + assert len(response.json()) > 0 + assert len("".join(elem["text"] for elem in response.json())) > 20 + + # Verify we can infer the filetype on the server + response = send_document(filenames=[test_file], content_type=None) + + if response.status_code != 200: + assert False, response.text + + assert len(response.json()) > 0 + assert len("".join(elem["text"] for elem in response.json())) > 20 + + json_response = response + + # Verify we can set output type to csv csv_response = send_document( - filenames=[test_file], content_type=content_type, output_format="text/csv" + filenames=[test_file], + content_type=content_type, + output_format="text/csv", ) assert csv_response.status_code == 200 assert len(csv_response.text) > 0