Skip to content

Commit

Permalink
compute checksums for local file paths
Browse files Browse the repository at this point in the history
  • Loading branch information
pudo committed Dec 4, 2023
1 parent 483c0a6 commit efd77a9
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 19 deletions.
11 changes: 9 additions & 2 deletions yente/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from followthemoney.namespace import Namespace

from yente.logs import get_logger
from yente.data.util import get_url_local_path

log = get_logger(__name__)
BOOT_TIME = datetime_iso(datetime.utcnow())
Expand All @@ -23,13 +24,19 @@ def __init__(self, catalog: DataCatalog["Dataset"], data: Dict[str, Any]):
if name != norm_name:
raise ValueError("Invalid dataset name %r (try: %r)" % (name, norm_name))
super().__init__(catalog, data)
self.load = as_bool(data.get("load"), not self.is_collection)
self.entities_url = self._get_entities_url(data)

if self.version is None:
ts = data.get("last_export", BOOT_TIME)
if self.entities_url is not None:
path = get_url_local_path(self.entities_url)
if path is not None and path.exists():
mtime = path.stat().st_mtime
mdt = datetime.fromtimestamp(mtime)
ts = datetime_iso(mdt)
self.version = iso_to_version(ts) or "static"

self.load = as_bool(data.get("load"), not self.is_collection)
self.entities_url = self._get_entities_url(data)
namespace = as_bool(data.get("namespace"), False)
self.ns = Namespace(self.name) if namespace else None
self.index_version: Optional[str] = None
Expand Down
14 changes: 7 additions & 7 deletions yente/data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from yente import settings
from yente.logs import get_logger
from yente.data.util import http_session, resolve_url_type
from yente.data.util import http_session, get_url_local_path

BUFFER = 10 * 1024 * 1024

Expand All @@ -17,9 +17,9 @@
async def load_yaml_url(url: str) -> Any:
if url.lower().endswith(".json"):
return await load_json_url(url)
url_ = resolve_url_type(url)
if isinstance(url_, Path):
async with aiofiles.open(url_, "r") as fh:
path = get_url_local_path(url)
if path is not None:
async with aiofiles.open(path, "r") as fh:
data = await fh.read()
else:
async with http_session() as client:
Expand All @@ -29,9 +29,9 @@ async def load_yaml_url(url: str) -> Any:


async def load_json_url(url: str) -> Any:
url_ = resolve_url_type(url)
if isinstance(url_, Path):
async with aiofiles.open(url_, "rb") as fh:
path = get_url_local_path(url)
if path is not None:
async with aiofiles.open(path, "rb") as fh:
data = await fh.read()
else:
async with http_session() as client:
Expand Down
16 changes: 6 additions & 10 deletions yente/data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from prefixdate.precision import Precision
from contextlib import asynccontextmanager
from aiohttp import ClientSession, ClientTimeout, TCPConnector
from typing import AsyncGenerator, Dict, List, Union, Iterable, Optional, Set
from typing import AsyncGenerator, Dict, List, Iterable, Optional, Set
from followthemoney.types import registry
from normality.scripts import is_modern_alphabet
from fingerprints import remove_types, clean_name_light
Expand Down Expand Up @@ -113,17 +113,13 @@ def pick_names(names: List[str], limit: int = 3) -> List[str]:
return picked


def resolve_url_type(url: str) -> Union[Path, str]:
    """Check if a given path is local or remote and return a parsed form.

    NOTE(review): this text comes from a rendered diff in which this helper
    appears to be the pre-change version that ``get_url_local_path`` replaces;
    it is kept with unchanged logic so the remaining callers keep working.

    Returns:
        The URL string unchanged when its scheme is ``http`` or ``https``;
        otherwise the resolved ``Path`` when the URL's path component points
        at something that exists on disk.

    Raises:
        RuntimeError: if the URL is neither HTTP(S) nor an existing local path.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme.lower()
    if scheme in ("http", "https"):
        return url
    if parsed.path:
        path = Path(parsed.path).resolve()
        if path.exists():
            return path
    raise RuntimeError("Cannot open resource: %s" % url)


def get_url_local_path(url: str) -> Optional[Path]:
    """Return the local filesystem path that a URL refers to, if any.

    A URL counts as local when its scheme is ``file`` or when it has no
    scheme at all (a bare path), and its path component is non-empty. The
    path is resolved to an absolute path but is NOT checked for existence;
    callers decide how to treat a missing file.

    Args:
        url: A ``file://`` URL, a bare filesystem path, or a remote URL.

    Returns:
        The resolved ``Path`` for local URLs, or ``None`` for anything else
        (for example ``http``/``https`` URLs or an empty string).
    """
    # Imported locally so the block does not depend on the module's import
    # list, which is not visible alongside this function.
    from urllib.parse import unquote

    parsed = urlparse(url)
    scheme = parsed.scheme.lower()
    if parsed.path == "":
        return None
    if scheme == "file":
        # file:// URLs are percent-encoded (e.g. Path.as_uri() turns a space
        # into %20); decode so the result names the real file on disk.
        return Path(unquote(parsed.path)).resolve()
    if scheme == "":
        # A bare path is not URL-encoded, so a literal '%' must be preserved.
        return Path(parsed.path).resolve()
    return None


@asynccontextmanager
Expand Down

0 comments on commit efd77a9

Please sign in to comment.