diff --git a/CHANGELOG.md b/CHANGELOG.md
index 002f1eaae3..658f4d23f4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,14 +1,17 @@
-## 0.15.2-dev2
+## 0.15.2-dev3

 ### Enhancements

 ### Features

+* **Added per-class Object Detection metrics in the evaluation.** The metrics include average precision, precision, recall, and f1-score for each class in the dataset.
+
 ### Fixes

 * **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
 * **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimeters).
 * **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves problem partitioning PPTX files having an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive.
+* **Fixes an issue in Object Detection metrics.** Corrects the preprocessing/validation of ground truth and prediction data used to compute object detection metrics.

 ## 0.15.1

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index dc814f281f..24b428a1eb 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.2-dev2"  # pragma: no cover
+__version__ = "0.15.2-dev3"  # pragma: no cover
diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py
index ba162b5336..c6446ac9d1 100755
--- a/unstructured/ingest/evaluate.py
+++ b/unstructured/ingest/evaluate.py
@@ -6,7 +6,8 @@
 from unstructured.metrics.evaluate import (
     ElementTypeMetricsCalculator,
-    ObjectDetectionMetricsCalculator,
+    ObjectDetectionAggregatedMetricsCalculator,
+    ObjectDetectionPerClassMetricsCalculator,
     TableStructureMetricsCalculator,
     TextExtractionMetricsCalculator,
     filter_metrics,
@@ -291,14 +292,23 @@ def measure_object_detection_metrics_command(
     output_list: Optional[List[str]] = None,
     source_list: Optional[List[str]] = None,
 ):
-    return (
-        ObjectDetectionMetricsCalculator(
+    aggregated_df = (
+        ObjectDetectionAggregatedMetricsCalculator(
+            documents_dir=output_dir,
+            ground_truths_dir=source_dir,
+        )
+        .on_files(document_paths=output_list, ground_truth_paths=source_list)
+        .calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True)
+    )
+    per_class_df = (
+        ObjectDetectionPerClassMetricsCalculator(
             documents_dir=output_dir,
             ground_truths_dir=source_dir,
         )
         .on_files(document_paths=output_list, ground_truth_paths=source_list)
         .calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True)
     )
+    return aggregated_df, per_class_df


 @main.command()
diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py
index 4eb4f6f01f..05eab09c8e 100755
--- a/unstructured/metrics/evaluate.py
+++ b/unstructured/metrics/evaluate.py
@@ -3,6 +3,7 @@
 from __future__ import annotations

 import concurrent.futures
+import json
 import logging
 import os
 import sys
@@ -18,7 +19,9 @@
     calculate_element_type_percent_match,
     get_element_type_frequency,
 )
-from unstructured.metrics.object_detection import ObjectDetectionEvalProcessor
+from unstructured.metrics.object_detection import (
+    ObjectDetectionEvalProcessor,
+)
 from unstructured.metrics.table.table_eval import TableEvalProcessor
 from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text
 from unstructured.metrics.utils import (
@@ -68,10 +71,14 @@ def __post_init__(self):

         # -- auto-discover all files in the directories --
         self._document_paths = [
-            path.relative_to(self.documents_dir) for path in self.documents_dir.rglob("*")
+            path.relative_to(self.documents_dir)
+            for path in self.documents_dir.glob("*")
+            if path.is_file()
         ]
         self._ground_truth_paths = [
-            path.relative_to(self.ground_truths_dir) for path in self.ground_truths_dir.rglob("*")
+            path.relative_to(self.ground_truths_dir)
+            for path in self.ground_truths_dir.glob("*")
+            if path.is_file()
         ]

     @property
@@ -147,7 +154,13 @@ def calculate(
     def _default_executor(cls):
         max_processors = int(os.environ.get("MAX_PROCESSES", os.cpu_count()))
         logger.info(f"Configuring a pool of {max_processors} processors for parallel processing.")
-        return concurrent.futures.ProcessPoolExecutor(max_workers=max_processors)
+        return cls._get_executor_class()(max_workers=max_processors)
+
+    @classmethod
+    def _get_executor_class(
+        cls,
+    ) -> type[concurrent.futures.ThreadPoolExecutor] | type[concurrent.futures.ProcessPoolExecutor]:
+        return concurrent.futures.ProcessPoolExecutor

     def _process_all_documents(
         self, executor: concurrent.futures.Executor, visualize_progress: bool
@@ -336,6 +349,17 @@ def _validate_inputs(self):
                 "Specified file type under `documents_dir` or `output_list` should be one of "
                 f"`json` or `txt`. The given file type is {self.document_type}, exiting."
             )
+        for path in self._document_paths:
+            try:
+                path.suffixes[-1]
+            except IndexError:
+                logger.error(f"File {path} does not have a suffix, skipping")
+                continue
+            if path.suffixes[-1] != f".{self.document_type}":
+                logger.warning(
+                    "The directory contains file type inconsistent with the given input. "
+                    "Please note that some files will be skipped."
+                )
         if not all(path.suffixes[-1] == f".{self.document_type}" for path in self._document_paths):
             logger.warning(
                 "The directory contains file type inconsistent with the given input. "
                 "Please note that some files will be skipped."
             )
@@ -598,7 +622,7 @@ def filter_metrics(


 @dataclass
-class ObjectDetectionMetricsCalculator(BaseMetricsCalculator):
+class ObjectDetectionMetricsCalculatorBase(BaseMetricsCalculator, ABC):
     """
     Calculates object detection metrics for each document:
     - f1 score
@@ -613,6 +637,7 @@ def __post_init__(self):
         self._document_paths = [
             path.relative_to(self.documents_dir)
             for path in self.documents_dir.rglob("analysis/*/layout_dump/object_detection.json")
+            if path.is_file()
         ]

     @property
@@ -643,8 +668,9 @@ def _find_file_in_ground_truth(self, file_stem: str) -> Optional[Path]:
                 return path
         return None

-    def _process_document(self, doc: Path) -> Optional[list]:
-        """Calculate metrics for a single document.
+    def _get_paths(self, doc: Path) -> tuple[str, Path, Path]:
+        """Resolves the doctype, prediction file path, and ground truth file path.
+
+        As the OD dump directory structure differs from other, simpler outputs, it needs
+        specific processing to match the output OD dump file with the corresponding
+        OD GT file.
@@ -667,7 +693,7 @@ def _process_document(self, doc: Path) -> Optional[list]:
             doc (Path): path to the OD dump file

         Returns:
-            list: a list of metrics (representing a single row) for a single document
+            tuple: doctype, prediction file path, ground truth path
         """
         od_dump_path = Path(doc)
         file_stem = od_dump_path.parts[-3]  # we take the `document_name` - so the filename stem
@@ -675,31 +701,21 @@ def _process_document(self, doc: Path) -> Optional[list]:
         src_gt_filename = self._find_file_in_ground_truth(file_stem)

         if src_gt_filename not in self._ground_truth_paths:
-            return None
+            raise ValueError(f"Ground truth file {src_gt_filename} not found in list of GT files")

         doctype = Path(src_gt_filename.stem).suffix[1:]

         prediction_file = self.documents_dir / doc
         if not prediction_file.exists():
             logger.warning(f"Prediction file {prediction_file} does not exist, skipping")
-            return None
+            raise ValueError(f"Prediction file {prediction_file} does not exist")

         ground_truth_file = self.ground_truths_dir / src_gt_filename
         if not ground_truth_file.exists():
             logger.warning(f"Ground truth file {ground_truth_file} does not exist, skipping")
-            return None
+            raise ValueError(f"Ground truth file {ground_truth_file} does not exist")

-        processor = ObjectDetectionEvalProcessor.from_json_files(
-            prediction_file_path=prediction_file,
-            ground_truth_file_path=ground_truth_file,
-        )
-        metrics = processor.get_metrics()
-
-        return [
-            src_gt_filename.stem,
-            doctype,
-            None,  # connector
-        ] + [getattr(metrics, metric) for metric in self.supported_metric_names]
+        return doctype, prediction_file, ground_truth_file

     def _generate_dataframes(self, rows) -> tuple[pd.DataFrame, pd.DataFrame]:
         headers = ["filename", "doctype", "connector"] + self.supported_metric_names
@@ -722,3 +738,122 @@ def _generate_dataframes(self, rows) -> tuple[pd.DataFrame, pd.DataFrame]:
         agg_df.columns = AGG_HEADERS

         return df, agg_df
+
+
+class ObjectDetectionPerClassMetricsCalculator(ObjectDetectionMetricsCalculatorBase):
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.per_class_metric_names: list[str] | None = None
+        self._set_supported_metrics()
+
+    @property
+    def supported_metric_names(self):
+        if self.per_class_metric_names:
+            return self.per_class_metric_names
+        else:
+            raise ValueError("per_class_metrics not initialized - cannot get class names")
+
+    @property
+    def default_tsv_name(self):
+        return "all-docs-object-detection-metrics-per-class.tsv"
+
+    @property
+    def default_agg_tsv_name(self):
+        return "aggregate-object-detection-metrics-per-class.tsv"
+
+    def _process_document(self, doc: Path) -> Optional[list]:
+        """Calculate per-class metrics for a single document.
+
+        Args:
+            doc (Path): path to the OD dump file
+
+        Returns:
+            list: a list of per-class metrics (a single row) for a single document
+        """
+        try:
+            doctype, prediction_file, ground_truth_file = self._get_paths(doc)
+        except ValueError as e:
+            logger.error(f"Failed to process document {doc}: {e}")
+            return None
+
+        processor = ObjectDetectionEvalProcessor.from_json_files(
+            prediction_file_path=prediction_file,
+            ground_truth_file_path=ground_truth_file,
+        )
+        _, per_class_metrics = processor.get_metrics()
+
+        per_class_metrics_row = [
+            ground_truth_file.stem,
+            doctype,
+            None,  # connector
+        ]
+
+        for combined_metric_name in self.supported_metric_names:
+            metric = "_".join(combined_metric_name.split("_")[:-1])
+            class_name = combined_metric_name.split("_")[-1]
+            class_metrics = getattr(per_class_metrics, metric)
+            per_class_metrics_row.append(class_metrics[class_name])
+        return per_class_metrics_row
+
+    def _set_supported_metrics(self):
+        """Sets the supported metrics based on the classes found in the ground truth files.
+        The difference from the aggregated calculator is that the list of classes
+        (and so the list of metrics) is based on the contents of the GT / prediction files.
+        """
+        metrics = ["f1_score", "precision", "recall", "m_ap"]
+        classes = set()
+        for gt_file in self._ground_truth_paths:
+            gt_file_path = self.ground_truths_dir / gt_file
+            with open(gt_file_path) as f:
+                gt = json.load(f)
+            gt_classes = gt["object_detection_classes"]
+            classes.update(gt_classes)
+        per_class_metric_names = []
+        for metric in metrics:
+            for class_name in classes:
+                per_class_metric_names.append(f"{metric}_{class_name}")
+        self.per_class_metric_names = sorted(per_class_metric_names)
+
+
+class ObjectDetectionAggregatedMetricsCalculator(ObjectDetectionMetricsCalculatorBase):
+    """Calculates object detection metrics for each document and aggregates them over all classes"""
+
+    @property
+    def supported_metric_names(self):
+        return ["f1_score", "precision", "recall", "m_ap"]
+
+    @property
+    def default_tsv_name(self):
+        return "all-docs-object-detection-metrics.tsv"
+
+    @property
+    def default_agg_tsv_name(self):
+        return "aggregate-object-detection-metrics.tsv"
+
+    def _process_document(self, doc: Path) -> Optional[list]:
+        """Calculate class-aggregated metrics for a single document.
+ + Args: + doc (Path): path to the OD dump file + + Returns: + list: a list of aggregated metrics for a single document + """ + try: + doctype, prediction_file, ground_truth_file = self._get_paths(doc) + except ValueError as e: + logger.error(f"Failed to process document {doc}: {e}") + return None + + processor = ObjectDetectionEvalProcessor.from_json_files( + prediction_file_path=prediction_file, + ground_truth_file_path=ground_truth_file, + ) + metrics, _ = processor.get_metrics() + + return [ + ground_truth_file.stem, + doctype, + None, # connector + ] + [getattr(metrics, metric) for metric in self.supported_metric_names] diff --git a/unstructured/metrics/object_detection.py b/unstructured/metrics/object_detection.py index 0c08ae8fce..7c28721518 100644 --- a/unstructured/metrics/object_detection.py +++ b/unstructured/metrics/object_detection.py @@ -17,8 +17,8 @@ @dataclass -class ObjectDetectionEvaluation: - """Class representing a gathered table metrics.""" +class ObjectDetectionAggregatedEvaluation: + """Class representing a gathered class-aggregated object detection metrics""" f1_score: float precision: float @@ -26,8 +26,26 @@ class ObjectDetectionEvaluation: m_ap: float -class ObjectDetectionEvalProcessor: +@dataclass +class ObjectDetectionPerClassEvaluation: + """Class representing a gathered object detection metrics per-class""" + + f1_score: dict[str, float] + precision: dict[str, float] + recall: dict[str, float] + m_ap: dict[str, float] + + @classmethod + def from_tensors(cls, ap, precision, recall, f1, class_labels): + f1_score = {class_labels[i]: f1[i] for i in range(len(class_labels))} + precision = {class_labels[i]: precision[i] for i in range(len(class_labels))} + recall = {class_labels[i]: recall[i] for i in range(len(class_labels))} + m_ap = {class_labels[i]: ap[i] for i in range(len(class_labels))} + return cls(f1_score, precision, recall, m_ap) + + +class ObjectDetectionEvalProcessor: iou_thresholds = IOU_THRESHOLDS score_threshold = SCORE_THRESHOLD recall_thresholds = RECALL_THRESHOLDS @@ -62,7 +80,7 @@ def __init__( self.document_targets = [target.to(device) for target in document_targets] self.pages_height = pages_height self.pages_width = pages_width - self.num_cls = len(class_labels) + self.class_labels = class_labels @classmethod def from_json_files( @@ -85,17 +103,30 @@ def from_json_files( with open(ground_truth_file_path) as f: ground_truth_data = json.load(f) - assert ( - predictions_data["object_detection_classes"] - == ground_truth_data["object_detection_classes"] + assert sorted(predictions_data["object_detection_classes"]) == sorted( + ground_truth_data["object_detection_classes"] ), "Classes in predictions and ground truth do not match." assert len(predictions_data["pages"]) == len( ground_truth_data["pages"] ), "Pages number in predictions and ground truth do not match." - for pred_page, gt_page in zip(predictions_data["pages"], ground_truth_data["pages"]): - assert ( - pred_page["size"] == gt_page["size"] - ), "Page sizes in predictions and ground truth do not match." + for pred_page, gt_page in zip( + sorted(predictions_data["pages"], key=lambda p: p["number"]), + sorted(ground_truth_data["pages"], key=lambda p: p["number"]), + ): + assert pred_page["number"] == gt_page["number"], ( + f"Page numbers in predictions {prediction_file_path.name} " + f"({pred_page['number']}) and ground truth {ground_truth_file_path.name} " + f"({gt_page['number']}) do not match." 
+ ) + page_num = pred_page["number"] + + # TODO: translate the bboxes instead of raising error + assert pred_page["size"] == gt_page["size"], ( + f"Page sizes in predictions {prediction_file_path.name} " + f"({pred_page['size'][0]} x {pred_page['size'][1]}) " + f"and ground truth {ground_truth_file_path.name} ({gt_page['size'][0]} x " + f"{gt_page['size'][1]}) do not match for page {page_num}." + ) class_labels = predictions_data["object_detection_classes"] document_preds = cls._process_data(predictions_data, class_labels, prediction=True) @@ -104,6 +135,98 @@ def from_json_files( return cls(document_preds, document_targets, pages_height, pages_width, class_labels) + def get_metrics( + self, + ) -> tuple[ObjectDetectionAggregatedEvaluation, ObjectDetectionPerClassEvaluation]: + """Get per document OD metrics. + + Returns: + tuple: Tuple of ObjectDetectionAggregatedEvaluation and + ObjectDetectionPerClassEvaluation + """ + document_matchings = [] + for preds, targets, height, width in zip( + self.document_preds, self.document_targets, self.pages_height, self.pages_width + ): + # iterate over each page + page_matching_tensors = self._compute_page_detection_matching( + preds=preds, + targets=targets, + height=height, + width=width, + ) + document_matchings.append(page_matching_tensors) + + # compute metrics for all detections and targets + mean_ap, mean_precision, mean_recall, mean_f1 = ( + -1.0, + -1.0, + -1.0, + -1.0, + ) + + num_cls = len(self.class_labels) + mean_ap_per_class = np.full(num_cls, np.nan) + mean_precision_per_class = np.full(num_cls, np.nan) + mean_recall_per_class = np.full(num_cls, np.nan) + mean_f1_per_class = np.full(num_cls, np.nan) + + if len(document_matchings): + matching_info_tensors = [torch.cat(x, 0) for x in list(zip(*document_matchings))] + + # shape (n_class, nb_iou_thresh) + ( + ap_per_present_classes, + precision_per_present_classes, + recall_per_present_classes, + f1_per_present_classes, + present_classes, + ) = self._compute_detection_metrics( + *matching_info_tensors, + ) + + # Precision, recall and f1 are computed for IoU threshold range, averaged over classes + # results before version 3.0.4 (Dec 11 2022) were computed only for smallest value + # (i.e IoU 0.5 if metric is @0.5:0.95) + mean_precision, mean_recall, mean_f1 = ( + precision_per_present_classes.mean(), + recall_per_present_classes.mean(), + f1_per_present_classes.mean(), + ) + + # MaP is averaged over IoU thresholds and over classes + mean_ap = ap_per_present_classes.mean() + + # Fill array of per-class AP scores with values for classes that were present in the + # dataset + ap_per_class = ap_per_present_classes.mean(1) + precision_per_class = precision_per_present_classes.mean(1) + recall_per_class = recall_per_present_classes.mean(1) + f1_per_class = f1_per_present_classes.mean(1) + for i, class_index in enumerate(present_classes): + mean_ap_per_class[class_index] = float(ap_per_class[i]) + + mean_precision_per_class[class_index] = float(precision_per_class[i]) + mean_recall_per_class[class_index] = float(recall_per_class[i]) + mean_f1_per_class[class_index] = float(f1_per_class[i]) + + od_per_class_evaluation = ObjectDetectionPerClassEvaluation.from_tensors( + ap=mean_ap_per_class, + precision=mean_precision_per_class, + recall=mean_recall_per_class, + f1=mean_f1_per_class, + class_labels=self.class_labels, + ) + + od_evaluation = ObjectDetectionAggregatedEvaluation( + f1_score=float(mean_f1), + precision=float(mean_precision), + recall=float(mean_recall), + m_ap=float(mean_ap), + ) + + 
return od_evaluation, od_per_class_evaluation + @staticmethod def _parse_page_dimensions(data: dict) -> tuple[list, list]: """ @@ -573,86 +696,6 @@ def _compute_detection_metrics_per_cls( return ap, precision, recall - def get_metrics(self) -> ObjectDetectionEvaluation: - """Get per document OD metrics. - - Returns: - output_dict: dict with OD metrics - """ - document_matchings = [] - for preds, targets, height, width in zip( - self.document_preds, self.document_targets, self.pages_height, self.pages_width - ): - # iterate over each page - page_matching_tensors = self._compute_page_detection_matching( - preds=preds, - targets=targets, - height=height, - width=width, - ) - document_matchings.append(page_matching_tensors) - - # compute metrics for all detections and targets - mean_ap, mean_precision, mean_recall, mean_f1 = ( - -1.0, - -1.0, - -1.0, - -1.0, - ) - mean_ap_per_class = np.zeros(self.num_cls) - - mean_precision_per_class = np.zeros(self.num_cls) - mean_recall_per_class = np.zeros(self.num_cls) - mean_f1_per_class = np.zeros(self.num_cls) - - if len(document_matchings): - matching_info_tensors = [torch.cat(x, 0) for x in list(zip(*document_matchings))] - - # shape (n_class, nb_iou_thresh) - ( - ap_per_present_classes, - precision_per_present_classes, - recall_per_present_classes, - f1_per_present_classes, - present_classes, - ) = self._compute_detection_metrics( - *matching_info_tensors, - ) - - # Precision, recall and f1 are computed for IoU threshold range, averaged over classes - # results before version 3.0.4 (Dec 11 2022) were computed only for smallest value - # (i.e IoU 0.5 if metric is @0.5:0.95) - mean_precision, mean_recall, mean_f1 = ( - precision_per_present_classes.mean(), - recall_per_present_classes.mean(), - f1_per_present_classes.mean(), - ) - - # MaP is averaged over IoU thresholds and over classes - mean_ap = ap_per_present_classes.mean() - - # Fill array of per-class AP scores with values for classes that were present in the - # dataset - ap_per_class = ap_per_present_classes.mean(1) - precision_per_class = precision_per_present_classes.mean(1) - recall_per_class = recall_per_present_classes.mean(1) - f1_per_class = f1_per_present_classes.mean(1) - for i, class_index in enumerate(present_classes): - mean_ap_per_class[class_index] = float(ap_per_class[i]) - - mean_precision_per_class[class_index] = float(precision_per_class[i]) - mean_recall_per_class[class_index] = float(recall_per_class[i]) - mean_f1_per_class[class_index] = float(f1_per_class[i]) - - od_evaluation = ObjectDetectionEvaluation( - f1_score=float(mean_f1), - precision=float(mean_precision), - recall=float(mean_recall), - m_ap=float(mean_ap), - ) - - return od_evaluation - if __name__ == "__main__": from dataclasses import asdict @@ -671,5 +714,6 @@ def get_metrics(self) -> ObjectDetectionEvaluation: prediction_file_path, ground_truth_file_path ) - metrics: ObjectDetectionEvaluation = eval_processor.get_metrics() + metrics, per_class_metrics = eval_processor.get_metrics() print(f"Metrics for {ground_truth_file_path.name}:\n{asdict(metrics)}") + print(f"Per class Metrics for {ground_truth_file_path.name}:\n{asdict(per_class_metrics)}")
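
For reviewers, here is a minimal usage sketch of the two new calculators, mirroring `measure_object_detection_metrics_command` in `unstructured/ingest/evaluate.py`. The directory names below are hypothetical placeholders; the sketch assumes `documents_dir` contains the `analysis/<document_name>/layout_dump/object_detection.json` dumps and `ground_truths_dir` the matching OD ground-truth JSON files (each with an `object_detection_classes` key), as implied by this diff.

```python
# Usage sketch (not part of the diff); directory paths are hypothetical placeholders.
from pathlib import Path

from unstructured.metrics.evaluate import (
    ObjectDetectionAggregatedMetricsCalculator,
    ObjectDetectionPerClassMetricsCalculator,
)

output_dir = Path("structured-output")  # contains analysis/*/layout_dump/object_detection.json
source_dir = Path("gold-standard")      # contains the matching OD ground-truth JSON files
export_dir = Path("od-metrics")         # per-document and aggregate TSVs are written here

# Class-aggregated metrics: one row per document with f1_score, precision, recall, m_ap.
aggregated_df = ObjectDetectionAggregatedMetricsCalculator(
    documents_dir=output_dir,
    ground_truths_dir=source_dir,
).calculate(export_dir=export_dir, visualize_progress=False, display_agg_df=False)

# Per-class metrics: one column per f"{metric}_{class_name}" combination, where class
# names are collected from `object_detection_classes` in the ground-truth files.
per_class_df = ObjectDetectionPerClassMetricsCalculator(
    documents_dir=output_dir,
    ground_truths_dir=source_dir,
).calculate(export_dir=export_dir, visualize_progress=False, display_agg_df=False)

print(aggregated_df.head())
print(per_class_df.head())
```

As in the CLI command, `.on_files(document_paths=..., ground_truth_paths=...)` can be chained before `.calculate(...)` to restrict the evaluation to an explicit subset of files.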