
Commit

updated the code to export layout after re-annotating the DP-Bench dataset

Signed-off-by: Peter Staar <[email protected]>
PeterStaar-IBM committed Jan 10, 2025
1 parent 3006842 commit 29fa042
Showing 5 changed files with 320 additions and 39 deletions.
155 changes: 152 additions & 3 deletions docling_eval/benchmarks/annotation_formats/create.py
@@ -33,10 +33,24 @@
)

from docling_eval.docling.utils import from_pil_to_base64uri, crop_bounding_box
from docling_eval.docling.utils import insert_images
from docling_eval.docling.utils import (
insert_images,
extract_images,
docling_version,
get_binary,
save_shard_to_disk
)

from docling_eval.benchmarks.constants import BenchMarkColumns
from docling_eval.benchmarks.utils import draw_clusters_with_reading_order, save_inspection_html
from docling_eval.benchmarks.utils import (
draw_clusters_with_reading_order,
save_inspection_html,
save_comparison_html_with_clusters,
write_datasets_info,
)

from docling_eval.docling.conversion import create_converter

# Configure logging
logging.basicConfig(
@@ -669,6 +683,131 @@ def parse_args():
DocItemLabel.FOOTNOTE,
}

PRED_HTML_EXPORT_LABELS = {
DocItemLabel.TITLE,
DocItemLabel.DOCUMENT_INDEX,
DocItemLabel.SECTION_HEADER,
DocItemLabel.PARAGRAPH,
DocItemLabel.TABLE,
DocItemLabel.PICTURE,
DocItemLabel.FORMULA,
DocItemLabel.CHECKBOX_UNSELECTED,
DocItemLabel.CHECKBOX_SELECTED,
DocItemLabel.TEXT,
DocItemLabel.LIST_ITEM,
DocItemLabel.CODE,
DocItemLabel.REFERENCE,
# Additional
DocItemLabel.PAGE_HEADER,
DocItemLabel.PAGE_FOOTER,
DocItemLabel.FOOTNOTE,
}

def create_layout_dataset_from_annotations(input_dir: Path, annot_file: Path):

output_dir = input_dir / "layout"

imgs_dir = input_dir / "imgs"
page_imgs_dir = input_dir / "page_imgs"
pdfs_dir = input_dir / "pdfs"

json_true_dir = input_dir / "json-groundtruth"
json_pred_dir = input_dir / "json-predictions"
json_anno_dir = input_dir / "json-annotations"

html_anno_dir = input_dir / "html-annotations"
html_viz_dir = input_dir / "html-annotatations-viz"

overview_file = input_dir / "overview_map.json"

with open(overview_file, "r") as fr:
overview = json.load(fr)

for _ in [input_dir, output_dir,
imgs_dir, page_imgs_dir, pdfs_dir,
json_true_dir, json_pred_dir, json_anno_dir,
html_anno_dir, html_viz_dir]:
os.makedirs(_, exist_ok=True)

image_scale = 2.0

# Create Converter
doc_converter = create_converter(page_image_scale=image_scale)

records = []
for desc, true_doc in tqdm(from_cvat_to_docling_document(annotation_filenames = [annot_file],
overview=overview,
pdfs_dir=pdfs_dir,
imgs_dir=imgs_dir),
total=len(overview),
ncols=128,
desc="Creating documents from annotations"):

basename = desc["basename"]

"""
save_inspection_html(filename=str(html_viz_dir / f"{basename}.html"), doc = true_doc,
labels=TRUE_HTML_EXPORT_LABELS)
"""

pdf_file = desc["pdf_file"]

# Create the predicted Document
conv_results = doc_converter.convert(source=pdf_file, raises_on_error=True)
pred_doc = conv_results.document

true_doc, true_pictures, true_page_images = extract_images(
document=true_doc,
pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, # pictures_column,
page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, # page_images_column,
)

pred_doc, pred_pictures, pred_page_images = extract_images(
document=pred_doc,
pictures_column=BenchMarkColumns.PREDICTION_PICTURES.value, # pictures_column,
page_images_column=BenchMarkColumns.PREDICTION_PAGE_IMAGES.value, # page_images_column,
)

if True:
save_comparison_html_with_clusters(
filename=html_viz_dir / f"{basename}-clusters.html",
true_doc=true_doc,
pred_doc=pred_doc,
page_image=true_page_images[0],
true_labels=TRUE_HTML_EXPORT_LABELS,
pred_labels=PRED_HTML_EXPORT_LABELS,
)

record = {
BenchMarkColumns.DOCLING_VERSION: docling_version(),
BenchMarkColumns.STATUS: str(conv_results.status),
BenchMarkColumns.DOC_ID: str(basename),

BenchMarkColumns.GROUNDTRUTH: json.dumps(true_doc.export_to_dict()),
BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES: true_page_images,
BenchMarkColumns.GROUNDTRUTH_PICTURES: true_pictures,

BenchMarkColumns.PREDICTION: json.dumps(pred_doc.export_to_dict()),
BenchMarkColumns.PREDICTION_PAGE_IMAGES: pred_page_images,
BenchMarkColumns.PREDICTION_PICTURES: pred_pictures,

BenchMarkColumns.ORIGINAL: get_binary(pdf_file),
BenchMarkColumns.MIMETYPE: "application/pdf",
}
records.append(record)

test_dir = output_dir / "test"
os.makedirs(test_dir, exist_ok=True)

save_shard_to_disk(items=records, dataset_path=test_dir)

write_datasets_info(
name="DPBench: end-to-end",
output_dir=output_dir,
num_train_rows=0,
num_test_rows=len(records),
)

def main():

input_dir, preannot_file = parse_args()
@@ -692,6 +831,11 @@ def main():
for _ in [input_dir, imgs_dir, page_imgs_dir, pdfs_dir, json_true_dir, json_pred_dir, json_anno_dir, html_anno_dir, html_viz_dir]:
os.makedirs(_, exist_ok=True)

image_scale = 2.0

# Create Converter
doc_converter = create_converter(page_image_scale=image_scale)

for desc, true_doc in tqdm(from_cvat_to_docling_document(annotation_filenames = [preannot_file],
overview=overview,
pdfs_dir=pdfs_dir,
Expand All @@ -701,10 +845,15 @@ def main():
desc="Creating documents from annotations"):

basename = desc["basename"]


save_inspection_html(filename=str(html_viz_dir / f"{basename}.html"), doc = true_doc,
labels=TRUE_HTML_EXPORT_LABELS)


if __name__ == "__main__":
main()
#main()

input_dir, annot_file = parse_args()

create_layout_dataset_from_annotations(input_dir=input_dir, annot_file=annot_file)
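
For orientation, here is a minimal sketch (not part of this commit) of driving the new export entry point directly from Python, mirroring the __main__ block above. The directory and file names are hypothetical placeholders; in the script itself they come from parse_args().

from pathlib import Path

from docling_eval.benchmarks.annotation_formats.create import (
    create_layout_dataset_from_annotations,
)

# Hypothetical locations; the real values are supplied by parse_args().
input_dir = Path("benchmarks/dpbench")      # expects pdfs/, imgs/, overview_map.json, ...
annot_file = input_dir / "annotations.xml"  # exported CVAT annotations (placeholder name)

# Converts each annotated PDF, pairs it with its ground truth, and writes a dataset
# shard under <input_dir>/layout/test plus dataset info under <input_dir>/layout.
create_layout_dataset_from_annotations(input_dir=input_dir, annot_file=annot_file)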
9 changes: 9 additions & 0 deletions docling_eval/cli/main.py
@@ -154,6 +154,15 @@ def visualise(
+ tabulate(data, headers=headers, tablefmt="github")
)

data, headers = layout_evaluation.mAP_stats.to_table()
logging.info(
"TEDS table: \n\n" + tabulate(data, headers=headers, tablefmt="github")
)

figname = odir / f"evaluation_{benchmark.value}_{modality.value}.png"
layout_evaluation.mAP_stats.save_histogram(figname=figname, name="struct-with-text")


elif modality == EvaluationModality.TABLEFORMER:

with open(filename, "r") as fd:
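
To illustrate what the new visualise branch emits for the layout modality, here is a small self-contained sketch (not part of this commit) that prints a GitHub-style per-class mAP table and saves a histogram of per-image mAP values. It assumes only tabulate and matplotlib; it is not the actual DatasetStatistics.save_histogram implementation, and all numbers are made up.

import matplotlib

matplotlib.use("Agg")  # render to file only, no display required
import matplotlib.pyplot as plt
from tabulate import tabulate

# Toy values standing in for the evaluation results.
per_class = [("table", 0.873), ("picture", 0.791), ("text", 0.942)]
per_image_map = [0.91, 0.84, 0.77, 0.95, 0.63, 0.88]

# Per-class table, in the same shape as DatasetLayoutEvaluation.to_table().
headers = ["label", "Class mAP[0.5:0.95]"]
table = [
    [label, f"{100.0 * value:.2f}"]
    for label, value in sorted(per_class, key=lambda x: x[1], reverse=True)
]
print(tabulate(table, headers=headers, tablefmt="github"))

# Histogram of per-image mAP, roughly what the saved PNG is expected to show.
plt.figure()
plt.hist(per_image_map, bins=10, range=(0.0, 1.0))
plt.xlabel("mAP[0.5:0.95]")
plt.ylabel("number of pages")
plt.title("Per-image layout mAP")
plt.savefig("evaluation_layout_map_histogram.png")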
87 changes: 55 additions & 32 deletions docling_eval/evaluators/layout_evaluator.py
@@ -17,39 +17,50 @@

from docling_eval.benchmarks.constants import BenchMarkColumns

from docling_eval.evaluators.utils import DatasetStatistics, compute_stats

class LayoutEvaluation(BaseModel):

class ClassLayoutEvaluation(BaseModel):
name: str
label: str
value: float

class ImageLayoutEvaluation(BaseModel):
name: str
value: float

class DatasetLayoutEvaluation(BaseModel):
true_labels: Dict[str, int]
pred_labels: Dict[str, int]

intersecting_labels: List[str]

evaluations: List[LayoutEvaluation]
evaluations_per_class: List[ClassLayoutEvaluation]

evaluations_per_image: List[ImageLayoutEvaluation]

mAP_stats: DatasetStatistics

def to_table(self) -> Tuple[List[List[str]], List[str]]:

headers = ["label", "Class mAP[0.5:0.95]"]

self.evaluations = sorted(self.evaluations, key=lambda x: x.value, reverse=True)
self.evaluations_per_class = sorted(self.evaluations_per_class, key=lambda x: x.value, reverse=True)

table = []
for i in range(len(self.evaluations)):
for i in range(len(self.evaluations_per_class)):
table.append(
[
f"{self.evaluations[i].label}",
f"{100.0*self.evaluations[i].value:.2f}",
f"{self.evaluations_per_class[i].label}",
f"{100.0*self.evaluations_per_class[i].value:.2f}",
]
)

return table, headers




class LayoutEvaluator:

def __init__(self) -> None:
@@ -88,6 +99,7 @@ def __call__(self, ds_path: Path, split: str = "test") -> DatasetLayoutEvaluation
)
logging.info(f"Intersection labels: {intersection_labels}")

doc_ids = []
ground_truths = []
predictions = []

@@ -110,14 +122,21 @@ def __call__(self, ds_path: Path, split: str = "test") -> DatasetLayoutEvaluation
)

if len(gts) == len(preds):
for i in range(len(gts)):
doc_ids.append(data[BenchMarkColumns.DOC_ID] + f"-page-{i}")

ground_truths.extend(gts)
predictions.extend(preds)
else:
logging.error("Ignoring predictions for document")

assert len(ground_truths) == len(
assert len(doc_ids) == len(
ground_truths
), "doc_ids==len(ground_truths)"

assert len(doc_ids) == len(
predictions
), "len(ground_truths)==len(predictions)"
), "doc_ids==len(predictions)"

# Initialize Mean Average Precision metric
metric = MeanAveragePrecision(iou_type="bbox", class_metrics=True)
Expand All @@ -136,38 +155,42 @@ def __call__(self, ds_path: Path, split: str = "test") -> DatasetLayoutEvaluatio
if "map_per_class" in result:
for label_idx, class_map in enumerate(result["map_per_class"]):
evaluations.append(
LayoutEvaluation(
ClassLayoutEvaluation(
name="Class mAP[0.5:0.95]",
label=intersection_labels[label_idx].value,
value=class_map,
)
)

"""
# Print results
print("Results:")
for key, value in result.items():
try:
print(f"{key}: {value:.3f}")
except:
print(f"{key}: {value}")
# Overall mAP
print(f"Overall mAP[0.5:0.95]: {result['map'].item():.3f}")
print("\nPer-Class mAP[0.5:0.95]:")
if "map_per_class" in result:
for label_idx, class_map in enumerate(result["map_per_class"]):
# label_name = self.label_names.get(label_idx, f"Class {label_idx}") # Use label name or default
print(
f" => {label_idx} {intersection_labels[label_idx].value}: {class_map:.3f}"
)
"""

# Compute mAP for each image individually
map_values = []

evaluations_per_image: List[ImageLayoutEvaluation] = []
for doc_id, pred, gt in zip(doc_ids, predictions, ground_truths):
# Reset the metric for the next image
metric.reset()

# Update with single image
metric.update([pred], [gt])

# Compute metrics
result = metric.compute()

# Extract mAP for this image
map_value = float(result["map"].item())

map_values.append(map_value)
evaluations_per_image.append(ImageLayoutEvaluation(name=doc_id, value=map_value))

return DatasetLayoutEvaluation(
evaluations=evaluations,
evaluations_per_class=evaluations,
evaluations_per_image=evaluations_per_image,

mAP_stats = compute_stats(map_values),

true_labels=true_labels,
pred_labels=pred_labels,

intersecting_labels=[_.value for _ in intersection_labels],
)
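
The loop above resets the torchmetrics MeanAveragePrecision metric and feeds it one page at a time to obtain a per-image score. Below is a self-contained sketch of that pattern (not part of this commit), with made-up boxes and labels:

import torch
from torchmetrics.detection.mean_ap import MeanAveragePrecision

# One toy page: a single ground-truth box and one slightly offset prediction (xyxy).
gt = {
    "boxes": torch.tensor([[10.0, 10.0, 100.0, 100.0]]),
    "labels": torch.tensor([0]),
}
pred = {
    "boxes": torch.tensor([[12.0, 11.0, 98.0, 99.0]]),
    "scores": torch.tensor([0.9]),
    "labels": torch.tensor([0]),
}

metric = MeanAveragePrecision(iou_type="bbox", class_metrics=True)

# Per-image evaluation: reset, update with exactly one image, then compute.
metric.reset()
metric.update([pred], [gt])
result = metric.compute()
print(f"page mAP[0.5:0.95]: {result['map'].item():.3f}")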

@@ -271,7 +294,7 @@ def _evaluate_layouts_in_documents(
ground_truths = []
predictions = []

# logging.info("\n\n ============================================ \n\n")
# logging.info(f"\n\n ================= {true_doc.name}, {pred_doc.name} ===================== \n\n")

for page_no, items in true_pages_to_objects.items():

(Diffs for the remaining two changed files are not shown.)
