
Commit

feat: MarkdownTextEvaluator: Introduce text evaluation based on markdown export of DoclingDocument.

Use the BLEU metric to score the predicted markdown against the ground-truth markdown.

Signed-off-by: Nikos Livathinos <[email protected]>
nikos-livathinos committed Jan 14, 2025
1 parent 96a7d88 commit 020594b
Showing 4 changed files with 126 additions and 1 deletion.
1 change: 1 addition & 0 deletions docling_eval/benchmarks/constants.py
@@ -24,6 +24,7 @@ class EvaluationModality(str, Enum):
    TABLEFORMER = "tableformer"
    CODEFORMER = "codeformer"
    READING_ORDER = "reading_order"
    MARKDOWN_TEXT = "markdown_text"


class BenchMarkNames(str, Enum):
22 changes: 22 additions & 0 deletions docling_eval/cli/main.py
@@ -24,6 +24,10 @@
    DatasetLayoutEvaluation,
    LayoutEvaluator,
)
from docling_eval.evaluators.markdown_text_evaluator import (
    DatasetMarkdownEvaluation,
    MarkdownTextEvaluator,
)
from docling_eval.evaluators.readingorder_evaluator import (
    DatasetReadingOrderEvaluation,
    ReadingOrderEvaluator,
@@ -140,6 +144,13 @@ def evaluate(
                readingorder_evaluation.model_dump(), fd, indent=2, sort_keys=True
            )

    elif modality == EvaluationModality.MARKDOWN_TEXT:
        md_evaluator = MarkdownTextEvaluator()
        md_evaluation = md_evaluator(idir, split="test")

        with open(save_fn, "w") as fd:
            json.dump(md_evaluation.model_dump(), fd, indent=2, sort_keys=True)

    elif modality == EvaluationModality.CODEFORMER:
        pass

@@ -207,6 +218,17 @@ def visualise(
            + tabulate(data, headers=headers, tablefmt="github")
        )

    elif modality == EvaluationModality.MARKDOWN_TEXT:
        with open(filename, "r") as fd:
            markdown_evaluation = DatasetMarkdownEvaluation.parse_file(filename)

        data, headers = markdown_evaluation.bleu_stats.to_table("BLEU")

        logging.info(
            "Markdown text (BLEU): \n\n"
            + tabulate(data, headers=headers, tablefmt="github")
        )

    elif modality == EvaluationModality.CODEFORMER:
        pass

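For quick inspection outside the CLI, the saved JSON can be loaded back through the same pydantic model. A minimal sketch, not part of this commit; the filename is a placeholder, and DatasetMarkdownEvaluation comes from the evaluator module added below:

from docling_eval.evaluators.markdown_text_evaluator import DatasetMarkdownEvaluation

# Placeholder path: wherever the `evaluate` command above wrote its JSON output.
evaluation = DatasetMarkdownEvaluation.parse_file("evaluation_markdown_text.json")

# Show the five documents with the lowest BLEU scores for manual review.
worst = sorted(evaluation.evaluations, key=lambda e: e.bleu)[:5]
for page in worst:
    print(page.doc_id, round(page.bleu, 3))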
79 changes: 79 additions & 0 deletions docling_eval/evaluators/markdown_text_evaluator.py
@@ -0,0 +1,79 @@
import logging
from pathlib import Path
from typing import Dict, List, Tuple

from datasets import load_dataset
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import DoclingDocument
from nltk.tokenize import word_tokenize
from pydantic import BaseModel
from tqdm import tqdm  # type: ignore

from docling_eval.benchmarks.constants import BenchMarkColumns  # type: ignore
from docling_eval.utils.bleu import compute_bleu_score
from docling_eval.utils.stats import DatasetStatistics, compute_stats

_log = logging.getLogger(__name__)


class PageMarkdownEvaluation(BaseModel):
    doc_id: str

    true_md: str
    pred_md: str
    bleu: float


class DatasetMarkdownEvaluation(BaseModel):
    evaluations: List[PageMarkdownEvaluation]
    bleu_stats: DatasetStatistics


class MarkdownTextEvaluator:
    def __init__(self):
        pass

    def __call__(self, ds_path: Path, split: str = "test") -> DatasetMarkdownEvaluation:
        parquet_files = str(ds_path / split / "*.parquet")
        ds = load_dataset("parquet", data_files={split: parquet_files})
        _log.info(f"overview of dataset: {ds}")
        if ds is not None:
            ds_selection = ds[split]

        evaluations: list[PageMarkdownEvaluation] = []
        bleus = []

        broken_inputs = 0
        for i, data in tqdm(
            enumerate(ds_selection),
            desc="Markdown text evaluations",
            ncols=120,
            total=len(ds_selection),
        ):
            doc_id = data[BenchMarkColumns.DOC_ID]
            true_doc_dict = data[BenchMarkColumns.GROUNDTRUTH]
            true_doc: DoclingDocument = DoclingDocument.model_validate_json(
                true_doc_dict
            )
            pred_doc_dict = data[BenchMarkColumns.PREDICTION]
            pred_doc: DoclingDocument = DoclingDocument.model_validate_json(
                pred_doc_dict
            )

            # Export both documents to markdown and tokenize the text
            true_md = true_doc.export_to_markdown(image_mode=ImageRefMode.PLACEHOLDER)
            true_tokens = word_tokenize(true_md)
            pred_md = pred_doc.export_to_markdown(image_mode=ImageRefMode.PLACEHOLDER)
            pred_tokens = word_tokenize(pred_md)

            # Score the predicted markdown against the ground truth
            bleu = compute_bleu_score(true_tokens, pred_tokens)
            bleus.append(bleu)
            md_evaluation = PageMarkdownEvaluation(
                doc_id=doc_id, true_md=true_md, pred_md=pred_md, bleu=bleu
            )
            evaluations.append(md_evaluation)
        bleu_stats = compute_stats(bleus)
        ds_md_evaluations = DatasetMarkdownEvaluation(
            evaluations=evaluations, bleu_stats=bleu_stats
        )
        return ds_md_evaluations
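
A minimal usage sketch of the new evaluator outside the CLI (not part of this commit). The dataset path is a placeholder and is expected to contain <split>/*.parquet files, as read above; note that the word_tokenize call used by the evaluator requires the NLTK punkt tokenizer data to be available.

from pathlib import Path

from tabulate import tabulate

from docling_eval.evaluators.markdown_text_evaluator import MarkdownTextEvaluator

# Placeholder: root directory of a benchmark dataset with a "test" split.
ds_path = Path("./benchmarks/my-dataset")

evaluator = MarkdownTextEvaluator()
evaluation = evaluator(ds_path, split="test")

# Same aggregate view the `visualise` command prints.
data, headers = evaluation.bleu_stats.to_table("BLEU")
print(tabulate(data, headers=headers, tablefmt="github"))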
25 changes: 24 additions & 1 deletion docling_eval/utils/bleu.py
@@ -1,7 +1,7 @@
from nltk.translate.bleu_score import corpus_bleu


-def compute_bleu_score(
+def compute_bleu_scores(
    targets: list[list[str]], predictions: list[list[str]]
) -> tuple[list[float], float]:
r"""
@@ -28,3 +28,26 @@ def compute_bleu_score(
        for tg, pred in zip(targets, predictions)
    ]
    return bleu_scores, sum(bleu_scores) / len(bleu_scores)


def compute_bleu_score(target: list[str], prediction: list[str]) -> float:
    r"""
    Compute the BLEU score for the given target and prediction texts.

    Parameters
    ----------
    target : list[str]
        The ground truth sequence as a list of tokens.
    prediction : list[str]
        The predicted sequence as a list of tokens.

    Returns
    -------
    bleu_score : float
        The BLEU score of the prediction against the target.
    """
    weights = (0.25, 0.25, 0.25, 0.25)

    # reference: ground truth (in BLEU there can be many references per hypothesis)
    # hypothesis: prediction
    bleu = corpus_bleu([[target]], [prediction], weights=weights)
    return bleu
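
An illustrative example of the two helpers on pre-tokenized text (not part of this commit; the sentences are made up, while in the evaluator the tokens come from word_tokenize applied to the markdown exports):

from docling_eval.utils.bleu import compute_bleu_score, compute_bleu_scores

reference = "The quick brown fox jumps over the lazy dog .".split()
hypothesis = "The quick brown fox jumped over the lazy dog .".split()

# Per-document score, as used by MarkdownTextEvaluator.
score = compute_bleu_score(reference, hypothesis)

# Corpus-level variant: per-document scores plus their mean.
scores, mean_score = compute_bleu_scores([reference], [hypothesis])
print(f"BLEU: {score:.3f}, mean over corpus: {mean_score:.3f}")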
