Skip to content

Commit

Permalink
fix: make doc metadata keys pure strings (#38)
Browse files (browse the repository at this point in the history)
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas authored Oct 7, 2024
1 parent b5592ad commit 246627f
Showing 1 changed file with 7 additions and 9 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,20 +6,18 @@
"""Simple metadata extractor module."""


from enum import Enum
from typing import Any

from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
from docling_core.types import Document as DLDocument

_DL_DOC_HASH = "dl_doc_hash"
_ORIGIN = "origin"


class SimpleMetadataExtractor(BaseMetadataExtractor):
"""Simple metadata extractor class."""

class _Keys(str, Enum):
DL_DOC_HASH = "dl_doc_hash"
ORIGIN = "origin"

include_origin: bool = False

def get_metadata(
Expand All @@ -35,10 +33,10 @@ def get_metadata(
dict[str, Any]: the extracted metadata
"""
meta: dict[str, Any] = {
self._Keys.DL_DOC_HASH: doc.file_info.document_hash,
_DL_DOC_HASH: doc.file_info.document_hash,
}
if self.include_origin:
meta[self._Keys.ORIGIN] = origin
meta[_ORIGIN] = origin
return meta

def get_excluded_embed_metadata_keys(self) -> list[str]:
Expand All @@ -47,9 +45,9 @@ def get_excluded_embed_metadata_keys(self) -> list[str]:
Returns:
list[str]: the metadata to exclude
"""
excl_keys: list[str] = [self._Keys.DL_DOC_HASH]
excl_keys: list[str] = [_DL_DOC_HASH]
if self.include_origin:
excl_keys.append(self._Keys.ORIGIN)
excl_keys.append(_ORIGIN)
return excl_keys

def get_excluded_llm_metadata_keys(self) -> list[str]:
Expand Down

0 comments on commit 246627f

Please sign in to comment.