
Commit

feat: MarkdownTextEvaluator: Introduce text evaluation based on markdown export of DoclingDocument.

Use the BLEU metric to score the predicted markdown against the ground-truth markdown.

Signed-off-by: Nikos Livathinos <[email protected]>
nikos-livathinos committed Jan 14, 2025
1 parent 96a7d88 commit 020594b
Showing 4 changed files with 126 additions and 1 deletion.
1 change: 1 addition & 0 deletions docling_eval/benchmarks/constants.py
@@ -24,6 +24,7 @@ class EvaluationModality(str, Enum):
    TABLEFORMER = "tableformer"
    CODEFORMER = "codeformer"
    READING_ORDER = "reading_order"
    MARKDOWN_TEXT = "markdown_text"


class BenchMarkNames(str, Enum):
22 changes: 22 additions & 0 deletions docling_eval/cli/main.py
@@ -24,6 +24,10 @@
    DatasetLayoutEvaluation,
    LayoutEvaluator,
)
from docling_eval.evaluators.markdown_text_evaluator import (
    DatasetMarkdownEvaluation,
    MarkdownTextEvaluator,
)
from docling_eval.evaluators.readingorder_evaluator import (
    DatasetReadingOrderEvaluation,
    ReadingOrderEvaluator,
@@ -140,6 +144,13 @@ def evaluate(
                readingorder_evaluation.model_dump(), fd, indent=2, sort_keys=True
            )

    elif modality == EvaluationModality.MARKDOWN_TEXT:
        md_evaluator = MarkdownTextEvaluator()
        md_evaluation = md_evaluator(idir, split="test")

        with open(save_fn, "w") as fd:
            json.dump(md_evaluation.model_dump(), fd, indent=2, sort_keys=True)

    elif modality == EvaluationModality.CODEFORMER:
        pass

@@ -207,6 +218,17 @@ def visualise(
            + tabulate(data, headers=headers, tablefmt="github")
        )

    elif modality == EvaluationModality.MARKDOWN_TEXT:
        with open(filename, "r") as fd:
            markdown_evaluation = DatasetMarkdownEvaluation.parse_file(filename)

        data, headers = markdown_evaluation.bleu_stats.to_table("BLEU")

        logging.info(
            "Markdown text (BLEU): \n\n"
            + tabulate(data, headers=headers, tablefmt="github")
        )

    elif modality == EvaluationModality.CODEFORMER:
        pass

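For quick inspection outside the CLI, the saved JSON can be loaded back through the same pydantic model. A minimal sketch, not part of this commit; the filename is a placeholder, and DatasetMarkdownEvaluation comes from the evaluator module added below:

from docling_eval.evaluators.markdown_text_evaluator import DatasetMarkdownEvaluation

# Placeholder path: wherever the `evaluate` command above wrote its JSON output.
evaluation = DatasetMarkdownEvaluation.parse_file("evaluation_markdown_text.json")

# Show the five documents with the lowest BLEU scores for manual review.
worst = sorted(evaluation.evaluations, key=lambda e: e.bleu)[:5]
for page in worst:
    print(page.doc_id, round(page.bleu, 3))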
79 changes: 79 additions & 0 deletions docling_eval/evaluators/markdown_text_evaluator.py
@@ -0,0 +1,79 @@
import logging
from pathlib import Path
from typing import Dict, List, Tuple

from datasets import load_dataset
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import DoclingDocument
from nltk.tokenize import word_tokenize
from pydantic import BaseModel
from tqdm import tqdm  # type: ignore

from docling_eval.benchmarks.constants import BenchMarkColumns  # type: ignore
from docling_eval.utils.bleu import compute_bleu_score
from docling_eval.utils.stats import DatasetStatistics, compute_stats

_log = logging.getLogger(__name__)


class PageMarkdownEvaluation(BaseModel):
    doc_id: str

    true_md: str
    pred_md: str
    bleu: float


class DatasetMarkdownEvaluation(BaseModel):
    evaluations: List[PageMarkdownEvaluation]
    bleu_stats: DatasetStatistics


class MarkdownTextEvaluator:
    def __init__(self):
        pass

    def __call__(self, ds_path: Path, split: str = "test") -> DatasetMarkdownEvaluation:
        parquet_files = str(ds_path / split / "*.parquet")
        ds = load_dataset("parquet", data_files={split: parquet_files})
        _log.info(f"overview of dataset: {ds}")
        if ds is not None:
            ds_selection = ds[split]

        evaluations: list[PageMarkdownEvaluation] = []
        bleus = []

        broken_inputs = 0
        for i, data in tqdm(
            enumerate(ds_selection),
            desc="Markdown text evaluations",
            ncols=120,
            total=len(ds_selection),
        ):
            doc_id = data[BenchMarkColumns.DOC_ID]
            true_doc_dict = data[BenchMarkColumns.GROUNDTRUTH]
            true_doc: DoclingDocument = DoclingDocument.model_validate_json(
                true_doc_dict
            )
            pred_doc_dict = data[BenchMarkColumns.PREDICTION]
            pred_doc: DoclingDocument = DoclingDocument.model_validate_json(
                pred_doc_dict
            )

            # Export both documents to markdown and tokenize the text
            true_md = true_doc.export_to_markdown(image_mode=ImageRefMode.PLACEHOLDER)
            true_tokens = word_tokenize(true_md)
            pred_md = pred_doc.export_to_markdown(image_mode=ImageRefMode.PLACEHOLDER)
            pred_tokens = word_tokenize(pred_md)

            # Score the predicted markdown against the ground truth
            bleu = compute_bleu_score(true_tokens, pred_tokens)
            bleus.append(bleu)
            md_evaluation = PageMarkdownEvaluation(
                doc_id=doc_id, true_md=true_md, pred_md=pred_md, bleu=bleu
            )
            evaluations.append(md_evaluation)
        bleu_stats = compute_stats(bleus)
        ds_md_evaluations = DatasetMarkdownEvaluation(
            evaluations=evaluations, bleu_stats=bleu_stats
        )
        return ds_md_evaluations
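
A minimal usage sketch of the new evaluator outside the CLI (not part of this commit). The dataset path is a placeholder and is expected to contain <split>/*.parquet files, as read above; note that the word_tokenize call used by the evaluator requires the NLTK punkt tokenizer data to be available.

from pathlib import Path

from tabulate import tabulate

from docling_eval.evaluators.markdown_text_evaluator import MarkdownTextEvaluator

# Placeholder: root directory of a benchmark dataset with a "test" split.
ds_path = Path("./benchmarks/my-dataset")

evaluator = MarkdownTextEvaluator()
evaluation = evaluator(ds_path, split="test")

# Same aggregate view the `visualise` command prints.
data, headers = evaluation.bleu_stats.to_table("BLEU")
print(tabulate(data, headers=headers, tablefmt="github"))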
25 changes: 24 additions & 1 deletion docling_eval/utils/bleu.py
@@ -1,7 +1,7 @@
from nltk.translate.bleu_score import corpus_bleu


-def compute_bleu_score(
+def compute_bleu_scores(
    targets: list[list[str]], predictions: list[list[str]]
) -> tuple[list[float], float]:
r"""
@@ -28,3 +28,26 @@ def compute_bleu_score(
        for tg, pred in zip(targets, predictions)
    ]
    return bleu_scores, sum(bleu_scores) / len(bleu_scores)


def compute_bleu_score(target: list[str], prediction: list[str]) -> float:
    r"""
    Compute the BLEU score for the given target and prediction texts.

    Parameters
    ----------
    target : list[str]
        The ground truth sequence as a list of tokens.
    prediction : list[str]
        The predicted sequence as a list of tokens.

    Returns
    -------
    bleu_score : float
        The BLEU score of the prediction against the target.
    """
    weights = (0.25, 0.25, 0.25, 0.25)

    # reference: ground truth (in BLEU there can be many references per hypothesis)
    # hypothesis: prediction
    bleu = corpus_bleu([[target]], [prediction], weights=weights)
    return bleu
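
An illustrative example of the two helpers on pre-tokenized text (not part of this commit; the sentences are made up, while in the evaluator the tokens come from word_tokenize applied to the markdown exports):

from docling_eval.utils.bleu import compute_bleu_score, compute_bleu_scores

reference = "The quick brown fox jumps over the lazy dog .".split()
hypothesis = "The quick brown fox jumped over the lazy dog .".split()

# Per-document score, as used by MarkdownTextEvaluator.
score = compute_bleu_score(reference, hypothesis)

# Corpus-level variant: per-document scores plus their mean.
scores, mean_score = compute_bleu_scores([reference], [hypothesis])
print(f"BLEU: {score:.3f}, mean over corpus: {mean_score:.3f}")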
