Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Melhoria na pasta de testes e novos casos de teste do file_transform.py #54

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added tests/data/fake_file.txt
Empty file.
Binary file added tests/data/tabula-1.0.5-jar-with-dependencies.jar
Binary file not shown.
Binary file added tests/data/tika-app-1.24.1.jar
Binary file not shown.
52 changes: 0 additions & 52 deletions tests/file_transform_tests.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import os
from unittest import TestCase, expectedFailure

from unittest.mock import patch
from querido_diario_toolbox import Gazette
from querido_diario_toolbox.etl.apache_tika_text_extractor import ApacheTikaExtractor
from querido_diario_toolbox.etl.text_extractor import create_text_extractor
from querido_diario_toolbox.etl.text_extractor_interface import TextExtractor

ROOT = "tests/bin"
TIKA_PATH = ROOT + "/tika-app-1.24.1.jar"
if os.path.basename(os.getcwd()) == "tests":
ROOT = "data/"
else:
ROOT = "tests/data/"
TIKA_PATH = os.path.join(ROOT, "tika-app-1.24.1.jar")


class ApacheTikaTextExtractorTests(TestCase):
Expand All @@ -22,7 +25,7 @@ def test_if_class_check_if_jar_file_exists(self):

@expectedFailure
def test_if_class_check_if_binary_is_jar_file(self):
ApacheTikaExtractor("tests/data/fake_gazette.doc")
ApacheTikaExtractor("fake_gazette.doc")

def test_function_to_assembly_apache_tika_command(self):
apache_tika_extractor = ApacheTikaExtractor(TIKA_PATH)
Expand All @@ -43,29 +46,29 @@ def test_extract_text_from_file_with_invalid_file(self):

def test_extract_text(self):
apache_tika_extractor = ApacheTikaExtractor(TIKA_PATH)
gazette = Gazette(filepath="tests/data/fake_gazette.pdf")
gazette = Gazette(filepath=ROOT + "/fake_gazette.pdf")
apache_tika_extractor.extract_text(gazette)
self.assertTrue(os.path.exists("tests/data/fake_gazette.txt"))
self.assertEqual("tests/data/fake_gazette.txt", gazette.content_file)
self.assertTrue(os.path.exists(ROOT + "/fake_gazette.txt"))
self.assertEqual(ROOT + "/fake_gazette.txt", gazette.content_file)

def test_load_text(self):
apache_tika_extractor = ApacheTikaExtractor(TIKA_PATH)
gazette = Gazette(filepath="tests/data/fake_gazette.pdf")
gazette = Gazette(filepath=ROOT + "/fake_gazette.pdf")
apache_tika_extractor.extract_text(gazette)
apache_tika_extractor.load_content(gazette)
expected_content = "Hi this is a document created to test the text extraction for the Querido Diário project."
self.assertIn(expected_content, gazette.content)

def test_extract_metadata(self):
apache_tika_extractor = ApacheTikaExtractor(TIKA_PATH)
gazette = Gazette(filepath="tests/data/fake_gazette.pdf")
gazette = Gazette(filepath=ROOT + "/fake_gazette.pdf")
apache_tika_extractor.extract_metadata(gazette)
self.assertTrue(os.path.exists("tests/data/fake_gazette.json"))
self.assertEqual("tests/data/fake_gazette.json", gazette.metadata_file)
self.assertTrue(os.path.exists(ROOT + "/fake_gazette.json"))
self.assertEqual(ROOT + "/fake_gazette.json", gazette.metadata_file)

def test_load_metadata(self):
apache_tika_extractor = ApacheTikaExtractor(TIKA_PATH)
gazette = Gazette(filepath="tests/data/fake_gazette.pdf")
gazette = Gazette(filepath=ROOT + "/fake_gazette.pdf")
apache_tika_extractor.extract_metadata(gazette)
apache_tika_extractor.load_metadata(gazette)
self.assertIsInstance(gazette.metadata, dict)
Expand All @@ -75,4 +78,4 @@ def test_create_text_extractor(self):
config = {"apache_tika_jar": TIKA_PATH}
text_extractor = create_text_extractor(config)
self.assertIsInstance(text_extractor, TextExtractor)
self.assertEqual(text_extractor.apache_tika_jar, TIKA_PATH)
self.assertEqual(text_extractor.apache_tika_jar, TIKA_PATH)
203 changes: 203 additions & 0 deletions tests/test_file_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import json
import os
import tempfile
from unittest import TestCase
import unittest
from unittest import mock
from unittest.mock import patch

from querido_diario_toolbox.etl.file_transform import *


class TextExtractionTests(TestCase):
    """Tests for the helper functions star-imported from ``file_transform``.

    Temporary files created through the helper methods are registered with
    ``addCleanup`` so they are removed even when an assertion fails.
    """

    # Fragment of the error message the tests expect for a rejected file type.
    UNSUPPORTED_MESSAGE = "Unsupported file type"
    JSON_RULE_MESSAGE = "Only files with application/json mimetype or text file with .json extensions should be considered JSON files"

    @staticmethod
    def _remove_if_present(file_path):
        """Delete ``file_path``, tolerating a file that is already gone.

        A temporary file may have been renamed (see
        ``add_suffix_in_file_name``) before its cleanup runs.
        """
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass

    def create_temporary_file(self):
        """Create an empty temporary file and return its path.

        The file is scheduled for removal at the end of the running test.
        """
        with tempfile.NamedTemporaryFile("w", delete=False) as file:
            file_path = file.name
        self.addCleanup(self._remove_if_present, file_path)
        return file_path

    def add_suffix_in_file_name(self, file_path, suffix):
        """Rename ``file_path`` by appending ``suffix`` and return the new path.

        The renamed file is scheduled for removal at the end of the test.
        """
        new_file_name = f"{file_path}{suffix}"
        os.rename(file_path, new_file_name)
        self.addCleanup(self._remove_if_present, new_file_name)
        return new_file_name

    def create_json_file(self):
        """Create a temporary ``.json`` file holding a small JSON object."""
        temp_file = self.create_temporary_file()
        with open(temp_file, "w") as json_file:
            json.dump({"test": "json data"}, json_file)
        return self.add_suffix_in_file_name(temp_file, ".json")

    def create_invalid_json_file(self):
        """Create an empty temporary file with a ``.txt`` extension."""
        temp_file = self.create_temporary_file()
        return self.add_suffix_in_file_name(temp_file, ".txt")

    def _check_with_mime_type(self, mime_type):
        """Call ``check_file_type_supported`` with ``magic.from_file`` mocked.

        ``magic.from_file`` is patched to return ``mime_type`` so the result
        does not depend on the content of the temporary file.
        """
        temp_file = self.create_temporary_file()
        with patch("magic.from_file", return_value=mime_type):
            check_file_type_supported(temp_file)

    def _assert_accepted_or_cleanly_rejected(self, mime_type):
        """Assert ``mime_type`` is accepted or rejected with the known message.

        NOTE(review): the list of supported MIME types is not visible from
        this test module, so this only verifies that a rejection carries the
        "Unsupported file type" message. Replace with a strict
        accepted/rejected assertion once the expected policy for each type is
        confirmed.
        """
        try:
            self._check_with_mime_type(mime_type)
        except Exception as error:
            # Raised outside the ``try`` body, so a mismatch fails the test
            # instead of being swallowed.
            self.assertIn(self.UNSUPPORTED_MESSAGE, str(error))

    def test_valid_json_file(self):
        json_file = self.create_json_file()
        self.assertTrue(is_json(json_file), msg=self.JSON_RULE_MESSAGE)

    def test_invalid_json_file(self):
        json_file = self.create_invalid_json_file()
        self.assertFalse(is_json(json_file), msg=self.JSON_RULE_MESSAGE)

    def test_has_suffix_in_name(self):
        has_json_suffix = has_suffix_in_name("testing/file/path/test.json", "json")
        self.assertTrue(has_json_suffix)
        has_txt_suffix = has_suffix_in_name("testing/file/path/test.json", "txt")
        self.assertFalse(has_txt_suffix)

    def test_check_file_exists_file_exists(self):
        # Passes as long as no exception is raised for an existing file.
        check_file_exists(self.create_temporary_file())

    def test_check_file_type_supported_valid_file(self):
        # Passes as long as no exception is raised for a PDF MIME type.
        self._check_with_mime_type("application/pdf")

    def test_check_file_type_supported_invalid_file(self):
        with self.assertRaises(Exception) as context:
            self._check_with_mime_type("application/unknown")
        # Inspect the exception only after the context manager has exited;
        # code placed after the raising call inside the block never runs.
        self.assertIn(self.UNSUPPORTED_MESSAGE, str(context.exception))

    def test_check_file_type_supported_rtf(self):
        self._assert_accepted_or_cleanly_rejected("application/rtf")

    def test_check_file_type_supported_doc(self):
        self._assert_accepted_or_cleanly_rejected("application/msword")

    def test_check_file_type_supported_html(self):
        self._assert_accepted_or_cleanly_rejected("text/html")

    def test_check_file_type_supported_pdf(self):
        # Must agree with test_check_file_type_supported_valid_file: the PDF
        # MIME type is expected to be accepted without raising.
        self._check_with_mime_type("application/pdf")

    def test_check_file_type_supported_txt(self):
        self._assert_accepted_or_cleanly_rejected("text/plain")

    def test_check_file_type_supported_png(self):
        self._assert_accepted_or_cleanly_rejected("image/png")

    def test_check_file_type_supported_tiff(self):
        self._assert_accepted_or_cleanly_rejected("image/tiff")

    def test_check_file_type_supported_jpeg(self):
        self._assert_accepted_or_cleanly_rejected("image/jpeg")
File renamed without changes.
Loading