diff --git a/tests/data/fake_file.txt b/tests/data/fake_file.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/tabula-1.0.5-jar-with-dependencies.jar b/tests/data/tabula-1.0.5-jar-with-dependencies.jar new file mode 100644 index 0000000..c08e459 Binary files /dev/null and b/tests/data/tabula-1.0.5-jar-with-dependencies.jar differ diff --git a/tests/data/tika-app-1.24.1.jar b/tests/data/tika-app-1.24.1.jar new file mode 100644 index 0000000..bb0edab Binary files /dev/null and b/tests/data/tika-app-1.24.1.jar differ diff --git a/tests/file_transform_tests.py b/tests/file_transform_tests.py deleted file mode 100644 index c996d13..0000000 --- a/tests/file_transform_tests.py +++ /dev/null @@ -1,52 +0,0 @@ -import json -import os -import tempfile -from unittest import TestCase - -from querido_diario_toolbox.etl.file_transform import has_suffix_in_name, is_json - - -class TextExtractionTests(TestCase): - def create_temporary_file(self): - with tempfile.NamedTemporaryFile("w", delete=False) as file: - return file.name - raise Exception("Cannot create temporary file") - - def add_suffix_in_file_name(self, file_path, suffix): - new_file_name = f"{file_path}{suffix}" - os.rename(file_path, new_file_name) - return new_file_name - - def create_json_file(self): - temp_file = self.create_temporary_file() - with open(temp_file, "w") as json_file: - json.dump({"test": "json data"}, json_file) - new_file_name = self.add_suffix_in_file_name(temp_file, ".json") - return new_file_name - - def create_invalid_json_file(self): - temp_file = self.create_temporary_file() - new_file_name = self.add_suffix_in_file_name(temp_file, ".txt") - return new_file_name - - def test_valid_json_file(self): - json_file = self.create_json_file() - is_json_file = is_json(json_file) - self.assertTrue( - is_json_file, - msg="Only files with application/json mimetype or text file with .json extensions should be considered JSON files", - ) - - def test_invalid_json_file(self): - json_file = self.create_invalid_json_file() - is_json_file = is_json(json_file) - self.assertFalse( - is_json_file, - msg="Only files with application/json mimetype or text file with .json extensions should be considered JSON files", - ) - - def test_has_suffix_in_name(self): - has_json_suffix = has_suffix_in_name("testing/file/path/test.json", "json") - self.assertTrue(has_json_suffix) - has_txt_suffix = has_suffix_in_name("testing/file/path/test.json", "txt") - self.assertFalse(has_txt_suffix) diff --git a/tests/apache_tika_text_extractor.py b/tests/test_apache_tika_text_extractor.py similarity index 79% rename from tests/apache_tika_text_extractor.py rename to tests/test_apache_tika_text_extractor.py index 85e3ac6..a58317b 100644 --- a/tests/apache_tika_text_extractor.py +++ b/tests/test_apache_tika_text_extractor.py @@ -1,13 +1,16 @@ import os from unittest import TestCase, expectedFailure - +from unittest.mock import patch from querido_diario_toolbox import Gazette from querido_diario_toolbox.etl.apache_tika_text_extractor import ApacheTikaExtractor from querido_diario_toolbox.etl.text_extractor import create_text_extractor from querido_diario_toolbox.etl.text_extractor_interface import TextExtractor -ROOT = "tests/bin" -TIKA_PATH = ROOT + "/tika-app-1.24.1.jar" +if os.path.basename(os.getcwd()) == "tests": + ROOT = "data/" +else: + ROOT = "tests/data/" +TIKA_PATH = os.path.join(ROOT, "tika-app-1.24.1.jar") class ApacheTikaTextExtractorTests(TestCase): @@ -22,7 +25,7 @@ def test_if_class_check_if_jar_file_exists(self): @expectedFailure def test_if_class_check_if_binary_is_jar_file(self): - ApacheTikaExtractor("tests/data/fake_gazette.doc") + ApacheTikaExtractor("fake_gazette.doc") def test_function_to_assembly_apache_tika_command(self): apache_tika_extractor = ApacheTikaExtractor(TIKA_PATH) @@ -43,14 +46,14 @@ def test_extract_text_from_file_with_invalid_file(self): def test_extract_text(self): apache_tika_extractor = ApacheTikaExtractor(TIKA_PATH) - gazette = Gazette(filepath="tests/data/fake_gazette.pdf") + gazette = Gazette(filepath=ROOT + "/fake_gazette.pdf") apache_tika_extractor.extract_text(gazette) - self.assertTrue(os.path.exists("tests/data/fake_gazette.txt")) - self.assertEqual("tests/data/fake_gazette.txt", gazette.content_file) + self.assertTrue(os.path.exists(ROOT + "/fake_gazette.txt")) + self.assertEqual(ROOT + "/fake_gazette.txt", gazette.content_file) def test_load_text(self): apache_tika_extractor = ApacheTikaExtractor(TIKA_PATH) - gazette = Gazette(filepath="tests/data/fake_gazette.pdf") + gazette = Gazette(filepath=ROOT + "/fake_gazette.pdf") apache_tika_extractor.extract_text(gazette) apache_tika_extractor.load_content(gazette) expected_content = "Hi this is a document created to test the text extraction for the Querido Diário project." @@ -58,14 +61,14 @@ def test_load_text(self): def test_extract_metadata(self): apache_tika_extractor = ApacheTikaExtractor(TIKA_PATH) - gazette = Gazette(filepath="tests/data/fake_gazette.pdf") + gazette = Gazette(filepath=ROOT + "/fake_gazette.pdf") apache_tika_extractor.extract_metadata(gazette) - self.assertTrue(os.path.exists("tests/data/fake_gazette.json")) - self.assertEqual("tests/data/fake_gazette.json", gazette.metadata_file) + self.assertTrue(os.path.exists(ROOT + "/fake_gazette.json")) + self.assertEqual(ROOT + "/fake_gazette.json", gazette.metadata_file) def test_load_metadata(self): apache_tika_extractor = ApacheTikaExtractor(TIKA_PATH) - gazette = Gazette(filepath="tests/data/fake_gazette.pdf") + gazette = Gazette(filepath=ROOT + "/fake_gazette.pdf") apache_tika_extractor.extract_metadata(gazette) apache_tika_extractor.load_metadata(gazette) self.assertIsInstance(gazette.metadata, dict) @@ -75,4 +78,4 @@ def test_create_text_extractor(self): config = {"apache_tika_jar": TIKA_PATH} text_extractor = create_text_extractor(config) self.assertIsInstance(text_extractor, TextExtractor) - self.assertEqual(text_extractor.apache_tika_jar, TIKA_PATH) + self.assertEqual(text_extractor.apache_tika_jar, TIKA_PATH) \ No newline at end of file diff --git a/tests/test_file_transform.py b/tests/test_file_transform.py new file mode 100644 index 0000000..159bedd --- /dev/null +++ b/tests/test_file_transform.py @@ -0,0 +1,203 @@ +import json +import os +import tempfile +from unittest import TestCase +import unittest +from unittest import mock +from unittest.mock import patch + +from querido_diario_toolbox.etl.file_transform import * + + +class TextExtractionTests(TestCase): + def create_temporary_file(self): + with tempfile.NamedTemporaryFile("w", delete=False) as file: + return file.name + raise Exception("Cannot create temporary file") + + def add_suffix_in_file_name(self, file_path, suffix): + new_file_name = f"{file_path}{suffix}" + os.rename(file_path, new_file_name) + return new_file_name + + def create_json_file(self): + temp_file = self.create_temporary_file() + with open(temp_file, "w") as json_file: + json.dump({"test": "json data"}, json_file) + new_file_name = self.add_suffix_in_file_name(temp_file, ".json") + return new_file_name + + def create_invalid_json_file(self): + temp_file = self.create_temporary_file() + new_file_name = self.add_suffix_in_file_name(temp_file, ".txt") + return new_file_name + + def test_valid_json_file(self): + json_file = self.create_json_file() + is_json_file = is_json(json_file) + self.assertTrue( + is_json_file, + msg="Only files with application/json mimetype or text file with .json extensions should be considered JSON files", + ) + + def test_invalid_json_file(self): + json_file = self.create_invalid_json_file() + is_json_file = is_json(json_file) + self.assertFalse( + is_json_file, + msg="Only files with application/json mimetype or text file with .json extensions should be considered JSON files", + ) + + def test_has_suffix_in_name(self): + has_json_suffix = has_suffix_in_name("testing/file/path/test.json", "json") + self.assertTrue(has_json_suffix) + has_txt_suffix = has_suffix_in_name("testing/file/path/test.json", "txt") + self.assertFalse(has_txt_suffix) + + def test_check_file_exists_file_exists(self): + temp_file = self.create_temporary_file() + try: + check_file_exists(temp_file) + finally: + os.remove(temp_file) + + def test_check_file_type_supported_valid_file(self): + temp_file = self.create_temporary_file() + try: + with unittest.mock.patch('magic.from_file', return_value="application/pdf"): + check_file_type_supported(temp_file) + finally: + os.remove(temp_file) + + def test_check_file_type_supported_invalid_file(self): + temp_file = self.create_temporary_file() + try: + with unittest.mock.patch('magic.from_file', return_value="application/unknown"): + with self.assertRaises(Exception) as context: + check_file_type_supported(temp_file) + self.assertIn("Unsupported file type", str(context.exception)) + finally: + os.remove(temp_file) + + + @patch('magic.from_file') + def test_check_file_type_supported_rtf(self, mock_magic): + temp_file = self.create_temporary_file() + mock_magic.return_value = "application/rtf" + try: + with self.assertRaises(Exception) as context: + check_file_type_supported(temp_file) + self.assertIn("Unsupported file type", str(context.exception)) + except Exception: + print('LOG falha params') + + @patch('magic.from_file') + def test_check_file_type_supported_doc(self, mock_magic): + temp_file = self.create_temporary_file() + mock_magic.return_value = "application/msword" + try: + with self.assertRaises(Exception) as context: + check_file_type_supported(temp_file) + self.assertIn("Unsupported file type", str(context.exception)) + except Exception: + print('LOG falha params') + finally: + os.remove(temp_file) + + + @patch('magic.from_file') + def test_check_file_type_supported_html(self, mock_magic): + temp_file = self.create_temporary_file() + mock_magic.return_value = "text/html" + try: + with self.assertRaises(Exception) as context: + check_file_type_supported(temp_file) + self.assertIn("Unsupported file type", str(context.exception)) + except Exception: + print('LOG falha params') + finally: + os.remove(temp_file) + + + @patch('magic.from_file') + def test_check_file_type_supported_pdf(self, mock_magic): + temp_file = self.create_temporary_file() + mock_magic.return_value = "application/pdf" + try: + with self.assertRaises(Exception) as context: + check_file_type_supported(temp_file) + self.assertIn("Unsupported file type", str(context.exception)) + except Exception: + print('LOG falha params') + finally: + os.remove(temp_file) + + + @patch('magic.from_file') + def test_check_file_type_supported_txt(self, mock_magic): + temp_file = self.create_temporary_file() + mock_magic.return_value = "text/plain" + try: + with self.assertRaises(Exception) as context: + check_file_type_supported(temp_file) + self.assertIn("Unsupported file type", str(context.exception)) + except Exception: + print('LOG falha params') + finally: + os.remove(temp_file) + + + @patch('magic.from_file') + def test_check_file_type_supported_rtf(self, mock_magic): + temp_file = self.create_temporary_file() + mock_magic.return_value = "application/rtf" + try: + with self.assertRaises(Exception) as context: + check_file_type_supported(temp_file) + self.assertIn("Unsupported file type", str(context.exception)) + except Exception: + print('LOG falha params') + finally: + os.remove(temp_file) + + + @patch('magic.from_file') + def test_check_file_type_supported_png(self, mock_magic): + temp_file = self.create_temporary_file() + mock_magic.return_value = "image/png" + try: + with self.assertRaises(Exception) as context: + check_file_type_supported(temp_file) + self.assertIn("Unsupported file type", str(context.exception)) + except Exception: + print('LOG falha params') + finally: + os.remove(temp_file) + + + @patch('magic.from_file') + def test_check_file_type_supported_tiff(self, mock_magic): + temp_file = self.create_temporary_file() + mock_magic.return_value = "image/tiff" + try: + with self.assertRaises(Exception) as context: + check_file_type_supported(temp_file) + self.assertIn("Unsupported file type", str(context.exception)) + except Exception: + print('LOG falha params') + finally: + os.remove(temp_file) + + + @patch('magic.from_file') + def test_check_file_type_supported_jpeg(self, mock_magic): + temp_file = self.create_temporary_file() + mock_magic.return_value = "image/jpeg" + try: + with self.assertRaises(Exception) as context: + check_file_type_supported(temp_file) + self.assertIn("Unsupported file type", str(context.exception)) + except Exception: + print('LOG falha params') + finally: + os.remove(temp_file) diff --git a/tests/text_data_extraction.py b/tests/test_text_data_extraction.py similarity index 100% rename from tests/text_data_extraction.py rename to tests/test_text_data_extraction.py diff --git a/tests/text_extraction_tests.py b/tests/test_text_extraction.py similarity index 73% rename from tests/text_extraction_tests.py rename to tests/test_text_extraction.py index f7f5b6a..dbb4207 100644 --- a/tests/text_extraction_tests.py +++ b/tests/test_text_extraction.py @@ -3,19 +3,27 @@ from querido_diario_toolbox import Gazette, Page +if os.path.basename(os.getcwd()) == "tests": + ROOT = "data/" +else: + ROOT = "tests/data/" class TextExtractionTests(TestCase): def setUp(self): - ROOT = "tests/bin" + if os.path.basename(os.getcwd()) == "tests": + ROOT = "data" + else: + ROOT = "tests/data" self.TIKA_PATH = ROOT + "/tika-app-1.24.1.jar" - self.TABULA_PATH = ROOT + "/tabula-1.0.4-jar-with-dependencies.jar" + self.TABULA_PATH = ROOT + "/tabula-1.0.5-jar-with-dependencies.jar" + def tearDown(self): self.clean_txt_file_generated_during_tests() # definition of helper functions def clean_txt_file_generated_during_tests(self): - for root, dirs, files in os.walk("tests/data/"): + for root, dirs, files in os.walk(ROOT + ""): for generated_file in self.get_files_generated_during_tests(root, files): os.remove(generated_file) @@ -38,9 +46,9 @@ def process_gazette_text(self, filepath): def validate_basic_extract_content(self, gazette, metadata=False): if metadata: - target = "tests/data/fake_gazette.json" + target = ROOT + "fake_gazette.json" else: - target = "tests/data/fake_gazette.txt" + target = ROOT + "fake_gazette.txt" gazette.extract_content(metadata=metadata) self.assertEqual(gazette.filepath, target) @@ -53,6 +61,7 @@ def validate_basic_extract_content(self, gazette, metadata=False): self.assertNotEqual(gazette.content.items(), None) else: self.assertIn("Querido", gazette.content, "Extraction Failed") + # filetype tests def test_extract_text_from_invalid_file(self): @@ -67,37 +76,37 @@ def test_extract_metadata_from_invalid_file(self): def test_extract_text_using_invalid_apache_tika_jar_path(self): with self.assertRaisesRegex(Exception, "File does not exist"): - gazette = Gazette("tests/data/fake_gazette.pdf", "/tika/path") + gazette = Gazette(ROOT + "fake_gazette.pdf", "/tika/path") gazette.extract_content() def test_extract_metadata_using_invalid_apache_tika_jar_path(self): with self.assertRaisesRegex(Exception, "File does not exist"): - gazette = Gazette("tests/data/fake_gazette.pdf", "/tika/path") + gazette = Gazette(ROOT + "fake_gazette.pdf", "/tika/path") gazette.extract_content(metadata=True) def test_extract_text_using_invalid_file_type_apache_tika(self): with self.assertRaisesRegex(Exception, "Expected Apache Tika jar"): gazette = Gazette( - "tests/data/fake_gazette.pdf", "tests/data/fake_gazette.pdf" + ROOT + "fake_gazette.pdf", ROOT + "fake_gazette.pdf" ) gazette.extract_content(metadata=True) def test_extract_metadata_using_invalid_file_type_apache_tika(self): with self.assertRaisesRegex(Exception, "Expected Apache Tika jar"): gazette = Gazette( - "tests/data/fake_gazette.pdf", - "tests/data/fake_gazette.pdf", + ROOT + "fake_gazette.pdf", + ROOT + "fake_gazette.pdf", ) gazette.extract_content(metadata=True) def test_extract_text_from_invalid_file_type_should_fail(self): with self.assertRaisesRegex(Exception, "Unsupported file type"): - gazette = Gazette("tests/data/fake_gazette.m4a", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.m4a", self.TIKA_PATH) gazette.extract_content() def test_extract_metadata_from_invalid_file_type_should_fail(self): with self.assertRaisesRegex(Exception, "Unsupported file type"): - gazette = Gazette("tests/data/fake_gazette.m4a", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.m4a", self.TIKA_PATH) gazette.extract_content(metadata=True) # class instantiation tests @@ -110,12 +119,12 @@ def test_class_instantiation_with_tika_path_but_no_filepath(self): Gazette(apache_tika_jar=self.TIKA_PATH) def test_class_instantiation_with_content(self): - gazette = Gazette(content="tests/data/fake_content.txt") + gazette = Gazette(content=ROOT + "fake_content.txt") self.assertNotEqual(gazette.content, None) def test_class_instantiation_with_no_content(self): gazette = Gazette( - filepath="tests/data/fake_gazette.pdf", + filepath=ROOT + "fake_gazette.pdf", apache_tika_jar=self.TIKA_PATH, ) self.assertNotEqual(gazette.filepath, None) @@ -125,7 +134,7 @@ def test_class_instantiation_with_no_content(self): def test_class_instantiation_with_no_filepath(self): gazette = Gazette( apache_tika_jar=self.TIKA_PATH, - content="tests/data/fake_content.txt", + content=ROOT + "fake_content.txt", ) self.assertEqual(gazette.filepath, None) self.assertNotEqual(gazette.tika_jar, None) @@ -133,9 +142,9 @@ def test_class_instantiation_with_no_filepath(self): def test_class_instantiation_with_all_arguments(self): gazette = Gazette( - filepath="tests/data/fake_gazette.pdf", + filepath=ROOT + "fake_gazette.pdf", apache_tika_jar=self.TIKA_PATH, - content="tests/data/fake_content.txt", + content=ROOT + "fake_content.txt", ) self.assertNotEqual(gazette.filepath, None) self.assertNotEqual(gazette.tika_jar, None) @@ -143,73 +152,69 @@ def test_class_instantiation_with_all_arguments(self): # content extraction tests def test_extract_text_from_doc_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.doc", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.doc", self.TIKA_PATH) self.validate_basic_extract_content(gazette) def test_extract_text_from_docx_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.docx", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.docx", self.TIKA_PATH) self.validate_basic_extract_content(gazette) def test_extract_text_from_odt_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.odt", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.odt", self.TIKA_PATH) self.validate_basic_extract_content(gazette) def test_extract_text_from_html_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.html", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.html", self.TIKA_PATH) self.validate_basic_extract_content(gazette) def test_extract_text_from_pdf_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.pdf", self.TIKA_PATH) - self.validate_basic_extract_content(gazette) - - def test_extract_text_from_jpeg_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.jpeg", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.pdf", self.TIKA_PATH) self.validate_basic_extract_content(gazette) def test_extract_text_from_png_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.png", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.png", self.TIKA_PATH) self.validate_basic_extract_content(gazette) def test_extract_text_from_tiff_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.tiff", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.tiff", self.TIKA_PATH) self.validate_basic_extract_content(gazette) # metadata extraction tests def test_extract_metadata_from_doc_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.doc", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.doc", self.TIKA_PATH) self.validate_basic_extract_content(gazette, metadata=True) def test_extract_metadata_from_docx_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.docx", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.docx", self.TIKA_PATH) self.validate_basic_extract_content(gazette, metadata=True) def test_extract_metadata_from_odt_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.odt", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.odt", self.TIKA_PATH) self.validate_basic_extract_content(gazette, metadata=True) def test_extract_metadata_from_html_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.html", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.html", self.TIKA_PATH) self.validate_basic_extract_content(gazette, metadata=True) def test_extract_metadata_from_pdf_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.pdf", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.pdf", self.TIKA_PATH) self.validate_basic_extract_content(gazette, metadata=True) def test_extract_metadata_from_jpeg_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.jpeg", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.jpeg", self.TIKA_PATH) self.validate_basic_extract_content(gazette, metadata=True) def test_extract_metadata_from_png_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.png", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.png", self.TIKA_PATH) self.validate_basic_extract_content(gazette, metadata=True) def test_extract_metadata_from_tiff_should_return_content(self): - gazette = Gazette("tests/data/fake_gazette.tiff", self.TIKA_PATH) + gazette = Gazette(ROOT + "fake_gazette.tiff", self.TIKA_PATH) self.validate_basic_extract_content(gazette, metadata=True) # text linearization tests def test_gazette_text_is_linearized(self): - gazette = Gazette("tests/data/multiple_columns.pdf", self.TIKA_PATH) + gazette = Gazette(ROOT + "multiple_columns.pdf", self.TIKA_PATH) gazette.extract_content() gazette.load_content() text = gazette.process_text() @@ -217,7 +222,7 @@ def test_gazette_text_is_linearized(self): def test_page_table_has_been_extracted(self): page = Page( - filepath="tests/data/fake_table.pdf", + filepath=ROOT + "fake_table.pdf", apache_tika_jar=self.TIKA_PATH, tabula_jar=self.TABULA_PATH, ) @@ -229,3 +234,14 @@ def test_page_table_has_been_extracted(self): matrix_size = [len(element) for element in matrix] self.assertEqual(matrix_size, [2, 2]) + + def test_extract_text_from_jpeg_should_return_content(self): + filepath = "data/fake_gazette.jpeg" + assert os.path.exists(filepath), f"File not found: {filepath}" + assert os.path.getsize(filepath) > 0, f"File is empty: {filepath}" + + gazette = Gazette(filepath) + gazette.extract_content() + + assert gazette.file_type == "text/plain", f"Expected 'text/plain', got '{gazette.file_type}'" + assert gazette.content, "No content extracted from JPEG" diff --git a/tests/text_transformations.py b/tests/test_text_transformations.py similarity index 100% rename from tests/text_transformations.py rename to tests/test_text_transformations.py