Skip to content

Commit

Permalink
OcrdFile.local_filename returns str, some typing
Browse files Browse the repository at this point in the history
supersedes #1167
  • Loading branch information
kba committed Feb 6, 2024
1 parent b94b185 commit 83111e0
Show file tree
Hide file tree
Showing 13 changed files with 101 additions and 70 deletions.
5 changes: 3 additions & 2 deletions src/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import sys
import tarfile
import io
from ocrd.workspace import Workspace

from ocrd_utils import (
VERSION as OCRD_VERSION,
Expand Down Expand Up @@ -49,7 +50,7 @@ class Processor():

def __init__(
self,
workspace,
workspace : Workspace,
ocrd_tool=None,
parameter=None,
# TODO OCR-D/core#274
Expand Down Expand Up @@ -163,7 +164,7 @@ def verify(self):
"""
return True

def process(self):
def process(self) -> None:
"""
Process the :py:attr:`workspace`
from the given :py:attr:`input_file_grp`
Expand Down
2 changes: 1 addition & 1 deletion src/ocrd/processor/builtin/dummy_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class DummyProcessor(Processor):
Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
"""

def process(self):
def process(self) -> None:
LOG = getLogger('ocrd.dummy')
assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)
Expand Down
21 changes: 17 additions & 4 deletions src/ocrd/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from re import sub
from tempfile import NamedTemporaryFile
from contextlib import contextmanager
from typing import Optional, Union

from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor
from PIL import Image
Expand All @@ -13,6 +14,7 @@
import requests

from ocrd_models import OcrdMets, OcrdFile
from ocrd_models.ocrd_file import ClientSideOcrdFile
from ocrd_models.ocrd_page import parse, BorderType, to_xml
from ocrd_modelfactory import exif_from_filename, page_from_file
from ocrd_utils import (
Expand Down Expand Up @@ -71,7 +73,16 @@ class Workspace():
baseurl (string) : Base URL to prefix to relative URL.
"""

def __init__(self, resolver, directory, mets=None, mets_basename=DEFAULT_METS_BASENAME, automatic_backup=False, baseurl=None, mets_server_url=None):
def __init__(
self,
resolver,
directory,
mets : Optional[Union[OcrdMets, ClientSideOcrdMets]] = None,
mets_basename=DEFAULT_METS_BASENAME,
automatic_backup=False,
baseurl=None,
mets_server_url=None
):
self.resolver = resolver
self.directory = directory
self.mets_target = str(Path(directory, mets_basename))
Expand Down Expand Up @@ -328,14 +339,16 @@ def rename_file_group(self, old, new):
local_filename_replacements = {}
log.info("Moving files")
for mets_file in self.mets.find_files(fileGrp=old, local_only=True):
new_local_filename = old_local_filename = str(mets_file.local_filename)
new_local_filename = old_local_filename = mets_file.local_filename
assert new_local_filename
assert old_local_filename
# Directory part
new_local_filename = sub(r'^%s/' % old, r'%s/' % new, new_local_filename)
# File part
new_local_filename = sub(r'/%s' % old, r'/%s' % new, new_local_filename)
local_filename_replacements[str(mets_file.local_filename)] = new_local_filename
# move file from ``old`` to ``new``
mets_file.local_filename.rename(new_local_filename)
Path(old_local_filename).rename(new_local_filename)
# change the url of ``mets:file``
mets_file.local_filename = new_local_filename
# change the file ID and update structMap
Expand Down Expand Up @@ -375,7 +388,7 @@ def rename_file_group(self, old, new):

@deprecated_alias(pageId="page_id")
@deprecated_alias(ID="file_id")
def add_file(self, file_grp, content=None, **kwargs):
def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSideOcrdFile]:
"""
Add a file to the :py:class:`ocrd_models.ocrd_mets.OcrdMets` of the workspace.
Expand Down
3 changes: 2 additions & 1 deletion src/ocrd_modelfactory/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
from datetime import datetime
from pathlib import Path
from typing import Tuple, Union
from yaml import safe_load, safe_dump

from PIL import Image
Expand Down Expand Up @@ -78,7 +79,7 @@ def page_from_image(input_file, with_tree=False):
revmap = dict(((node, element) for element, node in mapping.items()))
return pcgts, etree, mapping, revmap

def page_from_file(input_file, with_tree=False):
def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]:
"""
Create :py:class:`~ocrd_models.ocrd_page.OcrdPage`
from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path
Expand Down
73 changes: 43 additions & 30 deletions src/ocrd_models/ocrd_file.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
"""
API to ``mets:file``
"""
from os.path import splitext, basename
from pathlib import Path
from typing import Any, List, Optional, Union

from ocrd_utils import deprecation_warning

from .ocrd_xml_base import ET
from .constants import NAMESPACES as NS, TAG_METS_FLOCAT, TAG_METS_FILE
from .ocrd_xml_base import ET # type: ignore
from .constants import NAMESPACES as NS, TAG_METS_FLOCAT

class OcrdFile():
"""
Expand All @@ -22,9 +22,8 @@ def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=Non
mets (OcrdMets): Containing :py:class:`ocrd_models.ocrd_mets.OcrdMets`.
mimetype (string): ``@MIMETYPE`` of this ``mets:file``
pageId (string): ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file``
local_filename (string): Local filename
url (string): original ``@xlink:href`` of this ``mets:file``
local_filename (Path): ``@xlink:href`` pointing to the locally cached version of the file in the workspace
local_filename (string): ``@xlink:href`` pointing to the locally cached version of the file in the workspace
ID (string): ``@ID`` of this ``mets:file``
loctype (string): DEPRECATED do not use
"""
Expand All @@ -39,7 +38,7 @@ def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=Non
self.pageId = pageId

if local_filename:
self.local_filename = Path(local_filename)
self.local_filename = local_filename
if url:
self.url = url

Expand Down Expand Up @@ -70,38 +69,38 @@ def __eq__(self, other):
# self.fileGrp == other.fileGrp

@property
def basename(self):
def basename(self) -> str:
"""
Get the ``.name`` of the local file
"""
if not self.local_filename:
return
return self.local_filename.name
return ''
return Path(self.local_filename).name

@property
def extension(self):
def extension(self) -> str:
if not self.local_filename:
return
return ''.join(self.local_filename.suffixes)
return ''
return ''.join(Path(self.local_filename).suffixes)

@property
def basename_without_extension(self):
def basename_without_extension(self) -> str:
"""
Get the ``os.path.basename`` of the local file, if any, with extension removed.
"""
if not self.local_filename:
return
return self.local_filename.name[:-len(self.extension)]
return ''
return Path(self.local_filename).name[:-len(self.extension)]

@property
def ID(self):
def ID(self) -> str:
"""
Get the ``@ID`` of the ``mets:file``.
"""
return self._el.get('ID')

@ID.setter
def ID(self, ID):
def ID(self, ID : Optional[str]) -> None:
"""
Set the ``@ID`` of the ``mets:file`` to :py:attr:`ID`.
"""
Expand All @@ -116,7 +115,7 @@ def ID(self, ID):
self.pageId = pageId

@property
def pageId(self):
def pageId(self) -> str:
"""
Get the ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` (physical page manifestation).
"""
Expand All @@ -125,7 +124,7 @@ def pageId(self):
return self.mets.get_physical_page_for_file(self)

@pageId.setter
def pageId(self, pageId):
def pageId(self, pageId : Optional[str]) -> None:
"""
Set the ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` (physical page manifestation) to :py:attr:`pageId`.
"""
Expand All @@ -136,21 +135,21 @@ def pageId(self, pageId):
self.mets.set_physical_page_for_file(pageId, self)

@property
def loctypes(self):
def loctypes(self) -> List[str]:
"""
Get the ``@LOCTYPE``s of the ``mets:file``.
"""
return [x.get('LOCTYPE') for x in self._el.findall('mets:FLocat', NS)]

@property
def mimetype(self):
def mimetype(self) -> str:
"""
Get the ``@MIMETYPE`` of the ``mets:file``.
"""
return self._el.get('MIMETYPE')

@mimetype.setter
def mimetype(self, mimetype):
def mimetype(self, mimetype : Optional[str]) -> None:
"""
Set the ``@MIMETYPE`` of the ``mets:file`` to :py:attr:`mimetype`.
"""
Expand All @@ -159,7 +158,7 @@ def mimetype(self, mimetype):
self._el.set('MIMETYPE', mimetype)

@property
def fileGrp(self):
def fileGrp(self) -> str:
"""
The ``@USE`` of the containing ``mets:fileGrp``
"""
Expand All @@ -169,7 +168,7 @@ def fileGrp(self):
raise ValueError("OcrdFile not related to METS")

@property
def url(self):
def url(self) -> str:
"""
Get the remote/original URL ``@xlink:href`` of this ``mets:file``.
"""
Expand All @@ -179,7 +178,7 @@ def url(self):
return ''

@url.setter
def url(self, url):
def url(self, url : Optional[str]) -> None:
"""
Set the remote/original URL ``@xlink:href`` of this ``mets:file`` to :py:attr:`url`.
"""
Expand All @@ -194,16 +193,17 @@ def url(self, url):
el_FLocat.set("LOCTYPE", "URL")

@property
def local_filename(self):
def local_filename(self) -> Optional[str]:
"""
Get the local/cached ``@xlink:href`` of this ``mets:file``.
"""
el_FLocat = self._el.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', NS)
if el_FLocat is not None:
return Path(el_FLocat.get("{%s}href" % NS["xlink"]))
return el_FLocat.get("{%s}href" % NS["xlink"])
return None

@local_filename.setter
def local_filename(self, fname):
def local_filename(self, fname : Optional[Union[Path, str]]):
"""
Set the local/cached ``@xlink:href`` of this ``mets:file`` to :py:attr:`local_filename`.
"""
Expand All @@ -212,9 +212,11 @@ def local_filename(self, fname):
if el_FLocat is not None:
self._el.remove(el_FLocat)
return
else:
fname = str(fname)
if el_FLocat is None:
el_FLocat = ET.SubElement(self._el, TAG_METS_FLOCAT)
el_FLocat.set("{%s}href" % NS["xlink"], str(fname))
el_FLocat.set("{%s}href" % NS["xlink"], fname)
el_FLocat.set("LOCTYPE", "OTHER")
el_FLocat.set("OTHERLOCTYPE", "FILE")

Expand All @@ -226,7 +228,18 @@ class ClientSideOcrdFile:
this represents the response of the :py:class:`ocrd.mets_server.OcrdMetsServer`.
"""

def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filename=None, mets=None, url=None, ID=None, fileGrp=None):
def __init__(
self,
el,
mimetype: str = '',
pageId: str = '',
loctype: str ='OTHER',
local_filename: Optional[str] = None,
mets : Any = None,
url: str = '',
ID: str = '',
fileGrp: str = ''
):
"""
Args:
el (): ignored
Expand Down
13 changes: 7 additions & 6 deletions src/ocrd_models/ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from datetime import datetime
import re
import typing
from lxml import etree as ET
from typing_extensions import Optional
from lxml import etree as ET # type: ignore
from copy import deepcopy
from warnings import warn

Expand Down Expand Up @@ -34,7 +35,7 @@
METS_XML_EMPTY,
)

from .ocrd_xml_base import OcrdXmlDocument, ET
from .ocrd_xml_base import OcrdXmlDocument
from .ocrd_file import OcrdFile
from .ocrd_agent import OcrdAgent

Expand Down Expand Up @@ -134,9 +135,9 @@ def _clear_caches(self):
Deallocates the caches
"""

self._file_cache = None
self._page_cache = None
self._fptr_cache = None
self._file_cache = {}
self._page_cache = {}
self._fptr_cache = {}

def refresh_caches(self):
if self._cache_flag:
Expand Down Expand Up @@ -243,7 +244,7 @@ def find_files(
pageId=None,
mimetype=None,
url=None,
local_filename=None,
local_filename : Optional[str] = None,
local_only=False,
include_fileGrp=None,
exclude_fileGrp=None,
Expand Down
2 changes: 1 addition & 1 deletion src/ocrd_models/ocrd_xml_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Base class for XML documents loaded from either content or filename.
"""
from os.path import exists
from lxml import etree as ET
from lxml import etree as ET # type: ignore

from .constants import NAMESPACES
from .utils import xmllint_format
Expand Down
Loading

0 comments on commit 83111e0

Please sign in to comment.