Merge branch 'keep-remote-links'
kba committed Sep 11, 2023
2 parents 4885180 + 0f26809 commit f058d3e
Showing 27 changed files with 380 additions and 343 deletions.
3 changes: 2 additions & 1 deletion Makefile
@@ -196,7 +196,8 @@ assets: repo/assets
.PHONY: test
# Run all unit tests
test: assets
$(PYTHON) -m pytest --continue-on-collection-errors --durations=10\
$(PYTHON) \
-m pytest $(PYTEST_ARGS) --durations=10\
--ignore=$(TESTDIR)/test_logging.py \
--ignore=$(TESTDIR)/test_logging_conf.py \
--ignore-glob="$(TESTDIR)/**/*bench*.py" \
8 changes: 4 additions & 4 deletions ocrd/ocrd/cli/bashlib.py
@@ -104,7 +104,7 @@ def bashlib_input_files(**kwargs):
working_dir = kwargs.pop('working_dir')
if is_local_filename(mets) and not isfile(get_local_filename(mets)):
msg = "File does not exist: %s" % mets
raise Exception(msg)
raise FileNotFoundError(msg)
resolver = Resolver()
workspace = resolver.workspace_from_url(mets, working_dir)
processor = Processor(workspace,
@@ -113,11 +113,11 @@ def bashlib_input_files(**kwargs):
input_file_grp=kwargs['input_file_grp'],
output_file_grp=kwargs['output_file_grp'])
for input_files in processor.zip_input_files(mimetype=None, on_error='abort'):
for field in ['url', 'ID', 'mimetype', 'pageId']:
for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']:
# make this bash-friendly (show initialization for associative array)
if len(input_files) > 1:
# single quotes allow us to preserve the list value inside the alist
print("[%s]='%s'" % (field, ' '.join(getattr(res, field) for res in input_files)), end=' ')
print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ')
else:
print("[%s]='%s'" % (field, getattr(input_files[0], field)), end=' ')
print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ')
print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp']))
13 changes: 9 additions & 4 deletions ocrd/ocrd/cli/workspace.py
@@ -402,7 +402,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
default=['url'],
default=['local_filename'],
multiple=True,
type=click.Choice([
'url',
@@ -418,9 +418,10 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
'local_filename',
]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
@click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, wait):
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, undo_download, wait):
"""
Find files.
Expand All @@ -443,13 +444,17 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down
mimetype=mimetype,
page_id=page_id,
):
ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field]
if download and not f.local_filename:
workspace.download_file(f)
modified_mets = True
if wait:
time.sleep(wait)
ret.append([f.ID if field == 'pageId' else getattr(f, field) or ''
for field in output_field])
if undo_download and f.local_filename:
ret_entry = [f'Removed local_filename {f.local_filename}']
f.local_filename = None
modified_mets = True
ret.append(ret_entry)
if modified_mets:
workspace.save_mets()
if 'pageId' in output_field:
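For illustration, roughly the same effect as the new --undo-download flag can be sketched through the Python API; the mets.xml path and the OCR-D-IMG file group below are assumptions, and this is not the exact code path of ocrd workspace find:

from ocrd import Resolver

workspace = Resolver().workspace_from_url('mets.xml')  # assumed existing workspace
modified_mets = False
for f in workspace.find_files(file_grp='OCR-D-IMG'):   # hypothetical file group
    if f.local_filename:
        print(f'Removed local_filename {f.local_filename}')
        f.local_filename = None   # drop the local copy reference, keep the remote URL
        modified_mets = True
if modified_mets:
    workspace.save_mets()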
57 changes: 31 additions & 26 deletions ocrd/ocrd/mets_server.py
@@ -5,6 +5,7 @@
from os import environ, _exit
from io import BytesIO
from typing import Any, Dict, Optional, Union, List, Tuple
from pathlib import Path
from urllib.parse import urlparse

from fastapi import FastAPI, Request, File, Form, Response
@@ -31,13 +32,13 @@ class OcrdFileModel(BaseModel):
file_grp : str = Field()
file_id : str = Field()
mimetype : str = Field()
page_id : Union[str, None] = Field()
url : Union[str, None] = Field()
local_filename : Union[str, None] = Field()
page_id : Optional[str] = Field()
url : Optional[str] = Field()
local_filename : Optional[str] = Field()

@staticmethod
def create(file_grp : str, file_id : str, page_id : Union[str, None], url : str, local_filename : str, mimetype : str):
return OcrdFileModel(file_grp=file_grp, file_id=file_id, page_id=page_id, mimetype=mimetype, url=url, local_filename=local_filename)
def create(file_grp : str, file_id : str, page_id : Optional[str], url : Optional[str], local_filename : Optional[Union[str, Path]], mimetype : str):
return OcrdFileModel(file_grp=file_grp, file_id=file_id, page_id=page_id, mimetype=mimetype, url=url, local_filename=str(local_filename))

class OcrdAgentModel(BaseModel):
name : str = Field()
@@ -57,9 +58,16 @@ class OcrdFileListModel(BaseModel):

@staticmethod
def create(files : List[OcrdFile]):
return OcrdFileListModel(
files=[OcrdFileModel.create(file_grp=f.fileGrp, file_id=f.ID, mimetype=f.mimetype, page_id=f.pageId, url=f.url, local_filename=f.local_filename) for f in files]
)
ret = OcrdFileListModel(
files=[OcrdFileModel.create(
file_grp=f.fileGrp,
file_id=f.ID,
mimetype=f.mimetype,
page_id=f.pageId,
url=f.url,
local_filename=f.local_filename
) for f in files])
return ret

class OcrdFileGroupListModel(BaseModel):
file_groups : List[str] = Field()
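Optional[str] is simply the conventional spelling of Union[str, None], and create() now also tolerates a pathlib.Path for local_filename by funnelling it through str(). A small self-contained sketch of that pattern; ExampleFileModel and make_model are made-up names, not the server's real model:

from pathlib import Path
from typing import Optional, Union

from pydantic import BaseModel

class ExampleFileModel(BaseModel):
    url: Optional[str] = None              # equivalent to Union[str, None]
    local_filename: Optional[str] = None

def make_model(url: Optional[str], local_filename: Optional[Union[str, Path]]) -> ExampleFileModel:
    # coerce Path (or str) to str before it reaches the model, leave None alone
    return ExampleFileModel(
        url=url,
        local_filename=str(local_filename) if local_filename is not None else None)

print(make_model("https://example.org/a.tif", Path("data/a.tif")))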
@@ -144,17 +152,14 @@ def file_groups(self):
@deprecated_alias(pageId="page_id")
@deprecated_alias(ID="file_id")
def add_file(self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs):
self.session.request(
'POST',
f'{self.url}/file',
data=OcrdFileModel.create(
file_id=file_id,
file_grp=file_grp,
page_id=page_id,
mimetype=mimetype,
url=url,
local_filename=local_filename).dict(),
)
data = OcrdFileModel.create(
file_id=file_id,
file_grp=file_grp,
page_id=page_id,
mimetype=mimetype,
url=url,
local_filename=local_filename)
r = self.session.request('POST', f'{self.url}/file', data=data.dict())
return ClientSideOcrdFile(
None,
ID=file_id,
@@ -208,10 +213,10 @@ async def exception_handler_invalid_regex(request: Request, exc: re.error):

@app.get("/file", response_model=OcrdFileListModel)
async def find_files(
file_grp : Union[str, None] = None,
file_id : Union[str, None] = None,
page_id : Union[str, None] = None,
mimetype : Union[str, None] = None,
file_grp : Optional[str] = None,
file_id : Optional[str] = None,
page_id : Optional[str] = None,
mimetype : Optional[str] = None,
):
"""
Find files in the mets
@@ -227,10 +232,10 @@ def save():
async def add_file(
file_grp : str = Form(),
file_id : str = Form(),
page_id : Union[str, None] = Form(),
page_id : Optional[str] = Form(),
mimetype : str = Form(),
url : Union[str, None] = Form(),
local_filename : Union[str, None] = Form(),
url : Optional[str] = Form(None),
local_filename : Optional[str] = Form(None),
):
"""
Add a file
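The switch from Form() to Form(None) for url and local_filename is what lets clients omit those form fields: Form() leaves a field required, while Form(None) gives it a None default. A stripped-down, hypothetical endpoint (not the METS server app itself) showing the pattern:

from typing import Optional

from fastapi import FastAPI, Form

# FastAPI's Form handling needs the python-multipart package installed
app = FastAPI()

@app.post("/file")
async def add_file(
    file_id: str = Form(),                       # required form field
    url: Optional[str] = Form(None),             # may be omitted, defaults to None
    local_filename: Optional[str] = Form(None),  # may be omitted, defaults to None
):
    # echo back what was received; omitted optional fields arrive as None
    return {"file_id": file_id, "url": url, "local_filename": local_filename}

A multipart POST that only sends file_id then passes validation and arrives with url and local_filename set to None instead of triggering a 422 response.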
55 changes: 30 additions & 25 deletions ocrd/ocrd/resolver.py
@@ -27,24 +27,29 @@ class Resolver():

def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None, retries=None, timeout=None):
"""
Download a file to a directory.
Download a URL ``url`` to a local file in ``directory``.
Early Shortcut: If `url` is a local file and that file is already in the directory, keep it there.
If ``url`` looks like a file path, check whether that file exists.
If it does exist and is within ``directory`` already, return early.
If it does exist but is outside of ``directory``, copy it.
If ``url`` does not appear to be a file path, try downloading via HTTP, retrying ``retries`` times with timeout ``timeout`` between calls.
If `basename` is not given but subdir is, assume user knows what she's doing and
use last URL segment as the basename.
If ``basename`` is not given but ``subdir`` is, set ``basename`` to the last path segment of ``url``.
If `basename` is not given and no subdir is given, use the alnum characters in the URL as the basename.
If the target file already exists within ``directory``, behavior depends on ``if_exists``:
- ``skip`` (default): do nothing and return early
- ``overwrite``: overwrite the existing file
- ``raise``: raise a ``FileExistsError``
Args:
directory (string): Directory to download files to
url (string): URL to download from
Keyword Args:
basename (string, None): basename part of the filename on disk.
basename (string, None): basename part of the filename on disk. Defaults to last path segment of ``url`` if unset.
if_exists (string, "skip"): What to do if target file already exists.
One of ``skip`` (default), ``overwrite`` or ``raise``
subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp``.
subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp[@USE]``.
retries (int, None): Number of retries to attempt on network failure.
timeout (tuple, None): Timeout in seconds for establishing a connection and reading next chunk of data.
@@ -55,56 +60,57 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip',
log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

if not url:
raise Exception("'url' must be a string")
raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok
if not directory:
raise Exception("'directory' must be a string") # actually Path would also work
raise ValueError(f"'directory' must be a non-empty string, not '{directory}'") # actually Path would also work

url = str(url)
directory = Path(directory)
directory.mkdir(parents=True, exist_ok=True)
directory = str(directory.resolve())

subdir_path = Path(subdir if subdir else '')
basename_path = Path(basename if basename else nth_url_segment(url))
ret = str(Path(subdir_path, basename_path))
ret = Path(subdir_path, basename_path)
dst_path = Path(directory, ret)

# log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url)
# print('url=%s', url)
# print('directory=%s', directory)
# print('subdir_path=%s', subdir_path)
# print('basename_path=%s', basename_path)
# print('ret=%s', ret)
# print('dst_path=%s', dst_path)
# log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url)
# print(f'>>> url={url}')
# print(f'>>> directory={directory}')
# print(f'>>> subdir_path={subdir_path}')
# print(f'>>> basename_path={basename_path}')
# print(f'>>> dst_path={dst_path}')
# print(f'>>> ret={ret}')

src_path = None
if is_local_filename(url):
try:
# XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
src_path = Path(get_local_filename(url)).resolve()
except FileNotFoundError as e:
log.error("Failed to resolve URL locally: %s --> '%s' which does not exist" % (url, src_path))
raise e
if not src_path.exists():
raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
raise FileNotFoundError(f"File path passed as 'url' to download_to_directory does not exist: '{url}'")
if src_path == dst_path:
log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url))
return ret
return str(ret)

# Respect 'if_exists' arg
if dst_path.exists():
if if_exists == 'skip':
return ret
return str(ret)
if if_exists == 'raise':
raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))
raise FileExistsError(f"File already exists and if_exists == 'raise': {dst_path}")

# Create dst_path parent dir
dst_path.parent.mkdir(parents=True, exist_ok=True)

# Copy files or download remote assets
if src_path:
# src_path set, so it is a file source, we can copy directly
log.debug("Copying file '%s' to '%s'", src_path, dst_path)
dst_path.write_bytes(src_path.read_bytes())
else:
# src_path not set, it's an http URL, try to download
log.debug("Downloading URL '%s' to '%s'", url, dst_path)
if not retries and config.is_set('OCRD_DOWNLOAD_RETRIES'):
retries = config.OCRD_DOWNLOAD_RETRIES
@@ -141,7 +147,7 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip',
contents = handle_oai_response(response)
dst_path.write_bytes(contents)

return ret
return str(ret)

def workspace_from_url(
self,
@@ -206,7 +212,6 @@ def workspace_from_url(

log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
mets_basename, mets_url, src_baseurl, dst_dir)

self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')

workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)
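For reference, a short usage sketch of download_to_directory following the behavior documented above; the src/page1.tif file and the OCR-D-IMG subdir are made-up, and the return value is the path relative to the target directory:

from pathlib import Path

from ocrd import Resolver

resolver = Resolver()

# stand-in source file so the local-file branch can be exercised without a network
Path('src').mkdir(exist_ok=True)
Path('src/page1.tif').write_bytes(b'not really a TIFF')

# a local path given as `url` is copied into ws/OCR-D-IMG/ and the returned
# value is the path relative to `directory`, e.g. 'OCR-D-IMG/page1.tif'
rel = resolver.download_to_directory('ws', 'src/page1.tif', subdir='OCR-D-IMG')
print(rel)

# calling it again hits the if_exists='skip' default and returns early;
# if_exists='raise' would raise FileExistsError, if_exists='overwrite' re-copies
rel = resolver.download_to_directory('ws', 'src/page1.tif', subdir='OCR-D-IMG')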
[Diffs for the remaining changed files are not shown.]
