From 246627f4f6aef1121dd4211cc223f356a133c60e Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 7 Oct 2024 09:38:47 +0200 Subject: [PATCH] fix: make doc metadata keys pure strings (#38) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- .../simple_metadata_extractor.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py b/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py index e2a1d46..82ac42b 100644 --- a/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +++ b/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py @@ -6,20 +6,18 @@ """Simple metadata extractor module.""" -from enum import Enum from typing import Any from docling_core.transforms.metadata_extractor import BaseMetadataExtractor from docling_core.types import Document as DLDocument +_DL_DOC_HASH = "dl_doc_hash" +_ORIGIN = "origin" + class SimpleMetadataExtractor(BaseMetadataExtractor): """Simple metadata extractor class.""" - class _Keys(str, Enum): - DL_DOC_HASH = "dl_doc_hash" - ORIGIN = "origin" - include_origin: bool = False def get_metadata( @@ -35,10 +33,10 @@ def get_metadata( dict[str, Any]: the extracted metadata """ meta: dict[str, Any] = { - self._Keys.DL_DOC_HASH: doc.file_info.document_hash, + _DL_DOC_HASH: doc.file_info.document_hash, } if self.include_origin: - meta[self._Keys.ORIGIN] = origin + meta[_ORIGIN] = origin return meta def get_excluded_embed_metadata_keys(self) -> list[str]: @@ -47,9 +45,9 @@ def get_excluded_embed_metadata_keys(self) -> list[str]: Returns: list[str]: the metadata to exclude """ - excl_keys: list[str] = [self._Keys.DL_DOC_HASH] + excl_keys: list[str] = [_DL_DOC_HASH] if self.include_origin: - excl_keys.append(self._Keys.ORIGIN) + excl_keys.append(_ORIGIN) return excl_keys def get_excluded_llm_metadata_keys(self) -> list[str]: