From e502c462b338eb83988519eae75edfc324b50f18 Mon Sep 17 00:00:00 2001 From: Luis Antonio Obis Aparicio <35803280+lobis@users.noreply.github.com> Date: Tue, 14 Nov 2023 16:23:39 -0600 Subject: [PATCH 1/9] feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list --- pyproject.toml | 2 +- src/uproot/_util.py | 11 ++--------- src/uproot/source/fsspec.py | 7 +++---- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b5efe4031..d22888b33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "awkward>=2.4.6", "importlib-metadata;python_version<\"3.8\"", "numpy", + "fsspec", "packaging", "typing_extensions>=4.1.0; python_version < \"3.11\"" ] @@ -63,7 +64,6 @@ test = [ "zstandard", "minio", "aiohttp", - "fsspec", "fsspec-xrootd", "s3fs", "paramiko", diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 61c7f1f51..25c34d0bb 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -19,6 +19,7 @@ from collections.abc import Iterable from urllib.parse import unquote, urlparse +import fsspec import numpy import packaging.version @@ -290,15 +291,7 @@ def regularize_path(path): _windows_absolute_path_pattern_slash = re.compile(r"^[\\/][A-Za-z]:[\\/]") _remote_schemes = ["root", "s3", "http", "https"] -_schemes = ["file", *_remote_schemes] - -try: - # TODO: remove this try/except when fsspec becomes a required dependency - import fsspec - - _schemes = list({*_schemes, *fsspec.available_protocols()}) -except ImportError: - pass +_schemes = list({*_remote_schemes, *fsspec.available_protocols()}) _uri_scheme = re.compile("^(" + "|".join([re.escape(x) for x in _schemes]) + ")://") _uri_scheme_chain = re.compile( diff --git a/src/uproot/source/fsspec.py b/src/uproot/source/fsspec.py index 5edf2d138..0773d1c47 100644 --- a/src/uproot/source/fsspec.py +++ b/src/uproot/source/fsspec.py @@ -6,6 +6,9 @@ import concurrent.futures import queue +import fsspec +import fsspec.asyn + import uproot import uproot.source.chunk import uproot.source.futures @@ -24,8 +27,6 @@ class FSSpecSource(uproot.source.chunk.Source): """ def __init__(self, file_path: str, **options): - import fsspec.core - options = dict(uproot.reading.open.defaults, **options) storage_options = { k: v @@ -200,8 +201,6 @@ def closed(self) -> bool: class FSSpecLoopExecutor(uproot.source.futures.Executor): @property def loop(self) -> asyncio.AbstractEventLoop: - import fsspec.asyn - return fsspec.asyn.get_loop() def submit(self, coroutine) -> concurrent.futures.Future: From 8f37e6f467301c6a78c7c14320614437f0d93f44 Mon Sep 17 00:00:00 2001 From: Luis Antonio Obis Aparicio <35803280+lobis@users.noreply.github.com> Date: Wed, 15 Nov 2023 16:33:41 -0600 Subject: [PATCH 2/9] feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test --- src/uproot/__init__.py | 2 +- src/uproot/_dask.py | 5 - src/uproot/_util.py | 161 +++--------------- src/uproot/behaviors/TBranch.py | 10 -- src/uproot/reading.py | 71 +++----- ...st_0017_multi_basket_multi_branch_fetch.py | 1 + tests/test_0066_fix_http_fallback_freeze.py | 2 +- tests/test_0088_read_with_http.py | 4 + tests/test_0099_read_from_file_object.py | 12 +- ...est_0173_empty_and_multiprocessing_bugs.py | 7 +- ...est_0220_contiguous_byte_ranges_in_http.py | 3 +- tests/test_0302_pickle.py | 6 +- ..._dynamic_classes_cant_be_abc_subclasses.py | 3 +- 13 files changed, 70 insertions(+), 217 deletions(-) diff --git a/src/uproot/__init__.py b/src/uproot/__init__.py index 8777b4122..2d2de7786 100644 --- a/src/uproot/__init__.py +++ b/src/uproot/__init__.py @@ -72,7 +72,6 @@ isort:skip_file """ - from uproot.version import __version__ import uproot.const import uproot.extras @@ -92,6 +91,7 @@ from uproot.source.xrootd import MultithreadedXRootDSource from uproot.source.s3 import S3Source from uproot.source.object import ObjectSource +from uproot.source.fsspec import FSSpecSource from uproot.source.cursor import Cursor from uproot.source.futures import TrivialExecutor from uproot.source.futures import ThreadPoolExecutor diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index b1f033c80..38bb213c7 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -142,11 +142,6 @@ def dask( Options (type; default): * handler (:doc:`uproot.source.chunk.Source` class; None) - * file_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * xrootd_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * s3_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * http_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * object_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) * timeout (float for HTTP, int for XRootD; 30) * max_num_elements (None or int; None) * num_workers (int; 1) diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 25c34d0bb..28ac3665e 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -369,63 +369,30 @@ def file_path_to_source_class(file_path, options): Use a file path to get the :doc:`uproot.source.chunk.Source` class that would read it. Returns a tuple of (class, file_path) where the class is a subclass of :doc:`uproot.source.chunk.Source`. - - The "handler" option is the preferred way to specify a custom source class. - The "*_handler" options are for backwards compatibility and will override the "handler" option if set. """ + import uproot.source.chunk file_path = regularize_path(file_path) - out = options["handler"] - if out is not None: - if not (isinstance(out, type) and issubclass(out, uproot.source.chunk.Source)): - raise TypeError( - f"'handler' is not a class object inheriting from Source: {out!r}" - ) - # check if "object_handler" is set - if ( - options["object_handler"] is not None - or options["file_handler"] is not None - or options["xrootd_handler"] is not None - or options["s3_handler"] is not None - or options["http_handler"] is not None + source_cls = options["handler"] + if source_cls is not None: + if not ( + isinstance(source_cls, type) + and issubclass(source_cls, uproot.source.chunk.Source) ): - # These options will override the "handler" option for backwards compatibility - warnings.warn( - """In version 5.2.0, the '*_handler' argument ('http_handler`, 's3_handler', etc.) will be removed from 'uproot.open'. Use 'handler' instead.""", - stacklevel=1, + raise TypeError( + f"'handler' is not a class object inheriting from Source: {source_cls!r}" ) - else: - return out, file_path + return source_cls, file_path if ( not isinstance(file_path, str) and hasattr(file_path, "read") and hasattr(file_path, "seek") ): - out = options["object_handler"] - if out is None: - out = uproot.source.object.ObjectSource - else: - warnings.warn( - f"""In version 5.2.0, the 'object_handler' argument will be removed from 'uproot.open'. Use -uproot.open(..., handler={out!r}) -instead. - -To raise these warnings as errors (and get stack traces to find out where they're called), run -import warnings -warnings.filterwarnings("error", module="uproot.*") -after the first `import uproot` or use `@pytest.mark.filterwarnings("error:::uproot.*")` in pytest.""", - DeprecationWarning, - stacklevel=1, - ) - if not (isinstance(out, type) and issubclass(out, uproot.source.chunk.Source)): - raise TypeError( - f"'object_handler' is not a class object inheriting from Source: {out!r}" - ) - - return out, file_path + source_cls = uproot.source.object.ObjectSource + return source_cls, file_path windows_absolute_path = None if win and _windows_absolute_path_pattern.match(file_path) is not None: @@ -457,107 +424,27 @@ def file_path_to_source_class(file_path, options): else: file_path = windows_absolute_path - out = options["file_handler"] - if out is None: - out = uproot.source.file.MemmapSource - else: - warnings.warn( - f"""In version 5.2.0, the 'file_handler' argument will be removed from 'uproot.open'. Use - uproot.open(..., handler={out!r} - instead. - - To raise these warnings as errors (and get stack traces to find out where they're called), run - import warnings - warnings.filterwarnings("error", module="uproot.*") - after the first `import uproot` or use `@pytest.mark.filterwarnings("error:::uproot.*")` in pytest.""", - DeprecationWarning, - stacklevel=1, - ) + # uproot.source.file.MemmapSource + source_cls = uproot.source.fsspec.FSSpecSource - if not (isinstance(out, type) and issubclass(out, uproot.source.chunk.Source)): - raise TypeError( - "'file_handler' is not a class object inheriting from Source: " - + repr(out) - ) - return out, os.path.expanduser(file_path) + return source_cls, os.path.expanduser(file_path) elif scheme == "root": - out = options["xrootd_handler"] - if out is None: - out = uproot.source.xrootd.XRootDSource - else: - warnings.warn( - f"""In version 5.2.0, the 'xrootd_handler' argument will be removed from 'uproot.open'. Use - uproot.open(..., handler={out!r} - instead. - - To raise these warnings as errors (and get stack traces to find out where they're called), run - import warnings - warnings.filterwarnings("error", module="uproot.*") - after the first `import uproot` or use `@pytest.mark.filterwarnings("error:::uproot.*")` in pytest.""", - DeprecationWarning, - stacklevel=1, - ) - if not (isinstance(out, type) and issubclass(out, uproot.source.chunk.Source)): - raise TypeError( - "'xrootd_handler' is not a class object inheriting from Source: " - + repr(out) - ) - return out, file_path + # uproot.source.xrootd.XRootDSource + source_cls = uproot.source.fsspec.FSSpecSource + return source_cls, file_path elif scheme == "s3": - out = options["s3_handler"] - if out is None: - out = uproot.source.s3.S3Source - else: - warnings.warn( - f"""In version 5.2.0, the 's3_handler' argument will be removed from 'uproot.open'. Use -uproot.open(..., handler={out!r} -instead. - -To raise these warnings as errors (and get stack traces to find out where they're called), run -import warnings -warnings.filterwarnings("error", module="uproot.*") -after the first `import uproot` or use `@pytest.mark.filterwarnings("error:::uproot.*")` in pytest.""", - DeprecationWarning, - stacklevel=1, - ) - if not (isinstance(out, type) and issubclass(out, uproot.source.chunk.Source)): - raise TypeError( - "'s3' is not a class object inheriting from Source: " + repr(out) - ) - return out, file_path + # https://github.com/scikit-hep/uproot5/pull/1012 + source_cls = uproot.source.s3.S3Source + return source_cls, file_path elif scheme in ("http", "https"): - out = options["http_handler"] - if out is None: - out = uproot.source.http.HTTPSource - else: - warnings.warn( - f"""In version 5.2.0, the 'http_handler' argument will be removed from 'uproot.open'. Use -uproot.open(..., handler={out!r} -instead. - -To raise these warnings as errors (and get stack traces to find out where they're called), run -import warnings -warnings.filterwarnings("error", module="uproot.*") -after the first `import uproot` or use `@pytest.mark.filterwarnings("error:::uproot.*")` in pytest.""", - DeprecationWarning, - stacklevel=1, - ) - if not (isinstance(out, type) and issubclass(out, uproot.source.chunk.Source)): - raise TypeError( - "'http_handler' is not a class object inheriting from Source: " - + repr(out) - ) - return out, file_path - - else: - # try to use fsspec before raising an error - if scheme in _schemes: - return uproot.source.fsspec.FSSpecSource, file_path + # uproot.source.http.HTTPSource + source_cls = uproot.source.fsspec.FSSpecSource + return source_cls, file_path - raise ValueError(f"URI scheme not recognized: {file_path}") + return uproot.source.fsspec.FSSpecSource, file_path if isinstance(__builtins__, dict): diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index 7fb3e3c45..a82603176 100644 --- a/src/uproot/behaviors/TBranch.py +++ b/src/uproot/behaviors/TBranch.py @@ -154,11 +154,6 @@ def iterate( Options (type; default): * handler (:doc:`uproot.source.chunk.Source` class; None) - * file_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * xrootd_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * s3_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * http_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * object_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) * timeout (float for HTTP, int for XRootD; 30) * max_num_elements (None or int; None) * num_workers (int; 1) @@ -328,11 +323,6 @@ def concatenate( Options (type; default): * handler (:doc:`uproot.source.chunk.Source` class; None) - * file_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * xrootd_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * s3_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * http_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * object_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) * timeout (float for HTTP, int for XRootD; 30) * max_num_elements (None or int; None) * num_workers (int; 1) diff --git a/src/uproot/reading.py b/src/uproot/reading.py index 63aa88e6f..935c6e646 100644 --- a/src/uproot/reading.py +++ b/src/uproot/reading.py @@ -12,11 +12,11 @@ import struct import sys import uuid -import warnings from collections.abc import Mapping, MutableMapping import uproot import uproot.behaviors.TBranch +import uproot.source.fsspec from uproot._util import no_filter @@ -43,7 +43,7 @@ def open( ``"rel/file.root:tdirectory/ttree"``, ``Path("rel:/file.root")``, ``Path("/abs/path:stuff.root")`` object_cache (None, MutableMapping, or int): Cache of objects drawn - from ROOT directories (e.g histograms, TTrees, other directories); + from ROOT directories (e.g. histograms, TTrees, other directories); if None, do not use a cache; if an int, create a new cache of this size. array_cache (None, MutableMapping, or memory size): Cache of arrays @@ -78,11 +78,6 @@ def open( Options (type; default): * handler (:doc:`uproot.source.chunk.Source` class; None) - * file_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * xrootd_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * s3_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * http_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * object_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) * timeout (float for HTTP, int for XRootD; 30) * max_num_elements (None or int; None) * num_workers (int; 1) @@ -157,43 +152,22 @@ def open( return file.root_directory[object_path] +open.defaults = { + "handler": None, + "timeout": 30, + "max_num_elements": None, + "num_workers": 1, + "use_threads": sys.platform != "emscripten", + "num_fallback_workers": 10, + "begin_chunk_size": 403, # the smallest a ROOT file can be + "minimal_ttree_metadata": True, +} + + class _OpenDefaults(dict): - def __getitem__(self, where): - if where == "xrootd_handler" and where not in self: - # See https://github.com/scikit-hep/uproot5/issues/294 - if uproot.extras.older_xrootd("5.2.0"): - message = ( - f"XRootD {uproot.extras.xrootd_version()} is not fully supported; " - """either upgrade to 5.2.0+ or set - - open.defaults["xrootd_handler"] = uproot.MultithreadedXRootDSource -""" - ) - warnings.warn(message, FutureWarning, stacklevel=1) - - # The key should still be set, regardless of whether we see the warning. - self["xrootd_handler"] = uproot.source.xrootd.XRootDSource - - return dict.__getitem__(self, where) - - -open.defaults = _OpenDefaults( - { - "handler": None, # To be updated to fsspec source - "file_handler": None, # Deprecated - "s3_handler": None, # Deprecated - "http_handler": None, # Deprecated - "object_handler": None, # Deprecated - "xrootd_handler": None, # Deprecated - "timeout": 30, - "max_num_elements": None, - "num_workers": 1, - "use_threads": sys.platform != "emscripten", - "num_fallback_workers": 10, - "begin_chunk_size": 403, # the smallest a ROOT file can be - "minimal_ttree_metadata": True, - } -) + def __init__(self): + raise NotImplementedError # kept for backwards compatibility + must_be_attached = [ "TROOT", @@ -537,11 +511,6 @@ class ReadOnlyFile(CommonFileMethods): Options (type; default): * handler (:doc:`uproot.source.chunk.Source` class; None) - * file_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * xrootd_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * s3_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * http_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) - * object_handler (:doc:`uproot.source.chunk.Source` class; None) (Deprecated: Use `handler` instead. If set, this will take precedence over `handler`) * timeout (float for HTTP, int for XRootD; 30) * max_num_elements (None or int; None) * num_workers (int; 1) @@ -572,7 +541,7 @@ def __init__( self.decompression_executor = decompression_executor self.interpretation_executor = interpretation_executor - self._options = _OpenDefaults(open.defaults) + self._options = open.defaults.copy() self._options.update(options) for option in ["begin_chunk_size"]: self._options[option] = uproot._util.memory_size(self._options[option]) @@ -582,10 +551,10 @@ def __init__( self.hook_before_create_source() - Source, file_path = uproot._util.file_path_to_source_class( + source_cls, file_path = uproot._util.file_path_to_source_class( file_path, self._options ) - self._source = Source(file_path, **self._options) + self._source = source_cls(file_path, **self._options) self.hook_before_get_chunks() diff --git a/tests/test_0017_multi_basket_multi_branch_fetch.py b/tests/test_0017_multi_basket_multi_branch_fetch.py index b42f6db4b..1ed316f21 100644 --- a/tests/test_0017_multi_basket_multi_branch_fetch.py +++ b/tests/test_0017_multi_basket_multi_branch_fetch.py @@ -310,6 +310,7 @@ def test_cache(): skhep_testdata.data_path("uproot-sample-6.20.04-uncompressed.root"), object_cache=100, array_cache="100 MB", + handler=uproot.source.file.MemmapSource, ) as f: assert f.cache_key == "db4be408-93ad-11ea-9027-d201a8c0beef:/" assert f["sample"].cache_key == "db4be408-93ad-11ea-9027-d201a8c0beef:/sample;1" diff --git a/tests/test_0066_fix_http_fallback_freeze.py b/tests/test_0066_fix_http_fallback_freeze.py index 632af99bf..4b6ff26e2 100644 --- a/tests/test_0066_fix_http_fallback_freeze.py +++ b/tests/test_0066_fix_http_fallback_freeze.py @@ -1,6 +1,5 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE -import numpy import pytest import uproot @@ -8,6 +7,7 @@ @pytest.mark.network def test(): + pytest.importorskip("aiohttp") with uproot.open( {"http://scikit-hep.org/uproot3/examples/HZZ.root": "events"} ) as t: diff --git a/tests/test_0088_read_with_http.py b/tests/test_0088_read_with_http.py index 7ba760429..446200595 100644 --- a/tests/test_0088_read_with_http.py +++ b/tests/test_0088_read_with_http.py @@ -9,6 +9,8 @@ @pytest.mark.network def test_issue176(): + pytest.importorskip("aiohttp") + with uproot.open( "https://starterkit.web.cern.ch/starterkit/data/advanced-python-2019/dalitzdata.root" ) as f: @@ -18,6 +20,8 @@ def test_issue176(): @pytest.mark.network def test_issue176_again(): + pytest.importorskip("aiohttp") + with uproot.open( "https://starterkit.web.cern.ch/starterkit/data/advanced-python-2019/dalitzdata.root" ) as f: diff --git a/tests/test_0099_read_from_file_object.py b/tests/test_0099_read_from_file_object.py index f7a9faf02..730b2560b 100644 --- a/tests/test_0099_read_from_file_object.py +++ b/tests/test_0099_read_from_file_object.py @@ -4,11 +4,19 @@ import skhep_testdata import uproot +import uproot.source.fsspec +import uproot.source.object -def test(): +@pytest.mark.parametrize( + "handler", + [None, uproot.source.object.ObjectSource], +) +def test_read_from_file_like_object(handler): with open(skhep_testdata.data_path("uproot-Zmumu.root"), "rb") as f: - assert uproot.open({f: "events"})["px1"].array(library="np")[:10].tolist() == [ + assert uproot.open({f: "events"}, handler=handler)["px1"].array(library="np")[ + :10 + ].tolist() == [ -41.1952876442, 35.1180497674, 35.1180497674, diff --git a/tests/test_0173_empty_and_multiprocessing_bugs.py b/tests/test_0173_empty_and_multiprocessing_bugs.py index 8280cadf4..dd5a0efaf 100644 --- a/tests/test_0173_empty_and_multiprocessing_bugs.py +++ b/tests/test_0173_empty_and_multiprocessing_bugs.py @@ -1,7 +1,6 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE import multiprocessing -import sys import pytest import skhep_testdata @@ -17,8 +16,8 @@ def test_empty(): assert t["z"].array(library="np").tolist() == [] -def readone(filename): - with uproot.open(filename) as f: +def read_one(filename): + with uproot.open(filename, handler=uproot.source.file.MemmapSource) as f: f.decompression_executor = uproot.ThreadPoolExecutor() t = f["events"] b = t["px1"] @@ -28,7 +27,7 @@ def readone(filename): def test_multiprocessing(): with multiprocessing.Pool(1) as pool: out = pool.map( - readone, + read_one, [ skhep_testdata.data_path("uproot-Zmumu.root"), skhep_testdata.data_path("uproot-Zmumu-zlib.root"), diff --git a/tests/test_0220_contiguous_byte_ranges_in_http.py b/tests/test_0220_contiguous_byte_ranges_in_http.py index 3db77022d..26205206f 100644 --- a/tests/test_0220_contiguous_byte_ranges_in_http.py +++ b/tests/test_0220_contiguous_byte_ranges_in_http.py @@ -1,6 +1,5 @@ # BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE -import numpy import pytest import uproot @@ -8,6 +7,8 @@ @pytest.mark.network def test(): + pytest.importorskip("aiohttp") + with uproot.open( "https://starterkit.web.cern.ch/starterkit/data/advanced-python-2019/RD_distribution.root:tree" ) as f: diff --git a/tests/test_0302_pickle.py b/tests/test_0302_pickle.py index fbd436511..c33fb40f6 100644 --- a/tests/test_0302_pickle.py +++ b/tests/test_0302_pickle.py @@ -12,7 +12,7 @@ "handler", [ uproot.source.file.MemmapSource, - # uproot.source.fsspec.FSSpecSource, + uproot.source.fsspec.FSSpecSource, ], ) def test_pickle_roundtrip_local(handler): @@ -41,7 +41,7 @@ def test_pickle_roundtrip_local(handler): "handler", [ uproot.source.http.HTTPSource, - # uproot.source.fsspec.FSSpecSource, + uproot.source.fsspec.FSSpecSource, ], ) @pytest.mark.network @@ -73,7 +73,7 @@ def test_pickle_roundtrip_http(handler): "handler", [ uproot.source.xrootd.XRootDSource, - # uproot.source.fsspec.FSSpecSource, + uproot.source.fsspec.FSSpecSource, ], ) @pytest.mark.network diff --git a/tests/test_0520_dynamic_classes_cant_be_abc_subclasses.py b/tests/test_0520_dynamic_classes_cant_be_abc_subclasses.py index 0ac90a524..c50407149 100644 --- a/tests/test_0520_dynamic_classes_cant_be_abc_subclasses.py +++ b/tests/test_0520_dynamic_classes_cant_be_abc_subclasses.py @@ -3,9 +3,8 @@ import pickle import sys -import numpy as np +import uproot import pytest -import skhep_testdata @pytest.mark.skipif( From 7bf6e8af60ebd6c66edef130d6b2cb754bb30f79 Mon Sep 17 00:00:00 2001 From: Luis Antonio Obis Aparicio <35803280+lobis@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:16:55 -0600 Subject: [PATCH 3/9] feat: set `fsspec` (`s3fs`) as default handler for s3 paths (#1032) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * set version to 5.2.0rc1 (release candidate) * set s3fs as default for s3 * test different handlers * correct serialization of fsspec source --- src/uproot/_util.py | 4 ++-- src/uproot/version.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 28ac3665e..1a86c4207 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -435,8 +435,8 @@ def file_path_to_source_class(file_path, options): return source_cls, file_path elif scheme == "s3": - # https://github.com/scikit-hep/uproot5/pull/1012 - source_cls = uproot.source.s3.S3Source + # uproot.source.s3.S3Source + source_cls = uproot.source.fsspec.FSSpecSource return source_cls, file_path elif scheme in ("http", "https"): diff --git a/src/uproot/version.py b/src/uproot/version.py index a395bccc8..dfdb7f2ea 100644 --- a/src/uproot/version.py +++ b/src/uproot/version.py @@ -12,7 +12,7 @@ import re -__version__ = "5.1.2" +__version__ = "5.2.0rc1" version = __version__ version_info = tuple(re.split(r"[-\.]", __version__)) From 2d84cfe9fe6d6f79f1953f611506223571d68aa2 Mon Sep 17 00:00:00 2001 From: Luis Antonio Obis Aparicio <35803280+lobis@users.noreply.github.com> Date: Thu, 16 Nov 2023 12:51:04 -0600 Subject: [PATCH 4/9] feat: simplify object path split (#1028) * simplify object path split * add example from https://github.com/scikit-hep/uproot5/issues/975 * fix tests * add more test cases * test case update * remove scheme unused regex --- src/uproot/_util.py | 68 ++++++------------------- tests/test_0001_source_class.py | 8 +-- tests/test_0692_fsspec_reading.py | 2 +- tests/test_0976_path_object_split.py | 74 ++++++++++++++++++++++------ 4 files changed, 79 insertions(+), 73 deletions(-) diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 1a86c4207..2017c30ad 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -12,7 +12,6 @@ import itertools import numbers import os -import pathlib import platform import re import warnings @@ -290,14 +289,10 @@ def regularize_path(path): _windows_absolute_path_pattern = re.compile(r"^[A-Za-z]:[\\/]") _windows_absolute_path_pattern_slash = re.compile(r"^[\\/][A-Za-z]:[\\/]") +# These schemes may not appear in fsspec if the corresponding libraries are not installed (e.g. s3fs) _remote_schemes = ["root", "s3", "http", "https"] _schemes = list({*_remote_schemes, *fsspec.available_protocols()}) -_uri_scheme = re.compile("^(" + "|".join([re.escape(x) for x in _schemes]) + ")://") -_uri_scheme_chain = re.compile( - "^(" + "|".join([re.escape(x) for x in _schemes]) + ")::" -) - def file_object_path_split(urlpath: str) -> tuple[str, str | None]: """ @@ -313,54 +308,19 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]: """ urlpath: str = regularize_path(urlpath).strip() - path = urlpath - - def _split_path(path: str) -> list[str]: - parts = path.split(":") - if pathlib.PureWindowsPath(path).drive: - # Windows absolute path - assert len(parts) >= 2, f"could not split object from windows path {path}" - parts = [parts[0] + ":" + parts[1]] + parts[2:] - return parts - - if "://" not in path: - path = "file://" + path - - # replace the match of _uri_scheme_chain with "" until there is no match - while _uri_scheme_chain.match(path): - path = _uri_scheme_chain.sub("", path) - - if _uri_scheme.match(path): - # if not a local path, attempt to match a URI scheme - if path.startswith("file://"): - parsed_url_path = path[7:] - else: - parsed_url_path = urlparse(path).path - - if parsed_url_path.startswith("//"): - parsed_url_path = parsed_url_path[2:] - - parts = _split_path(parsed_url_path) - else: - # invalid scheme - scheme = path.split("://")[0] - raise ValueError( - f"Invalid URI scheme: '{scheme}://' in {path}. Available schemes: {', '.join(_schemes)}." - ) - - if len(parts) == 1: - obj = None - elif len(parts) == 2: - obj = parts[1] - # remove the object from the path (including the colon) - urlpath = urlpath[: -len(obj) - 1] - # clean badly placed slashes - obj = obj.strip().lstrip("/") - while "//" in obj: - obj = obj.replace("//", "/") - else: - raise ValueError(f"could not split object from path {path}") - + obj = None + + separator = "::" + parts = urlpath.split(separator) + object_regex = re.compile(r"(.+\.root):(.*$)") + for i, part in enumerate(reversed(parts)): + match = object_regex.match(part) + if match: + obj = re.sub(r"/+", "/", match.group(2).strip().lstrip("/")).rstrip("/") + parts[-i - 1] = match.group(1) + break + + urlpath = separator.join(parts) return urlpath, obj diff --git a/tests/test_0001_source_class.py b/tests/test_0001_source_class.py index 2ca6d98a7..be1d1186c 100644 --- a/tests/test_0001_source_class.py +++ b/tests/test_0001_source_class.py @@ -148,13 +148,13 @@ def test_colons_and_ports(): "https://example.com:443", None, ) - assert uproot._util.file_object_path_split("https://example.com:443/something") == ( - "https://example.com:443/something", + assert uproot._util.file_object_path_split("https://example.com:443/file.root") == ( + "https://example.com:443/file.root", None, ) assert uproot._util.file_object_path_split( - "https://example.com:443/something:else" - ) == ("https://example.com:443/something", "else") + "https://example.com:443/file.root:object" + ) == ("https://example.com:443/file.root", "object") @pytest.mark.parametrize("use_threads", [True, False], indirect=True) diff --git a/tests/test_0692_fsspec_reading.py b/tests/test_0692_fsspec_reading.py index fb99dd65c..052d916c6 100644 --- a/tests/test_0692_fsspec_reading.py +++ b/tests/test_0692_fsspec_reading.py @@ -208,7 +208,7 @@ def test_fsspec_zip(tmp_path): # open with fsspec with uproot.open( - f"zip://{filename}::file://{filename_zip}:Events/MET_pt" + f"zip://{filename}:Events/MET_pt::file://{filename_zip}" ) as branch: data = branch.array(library="np") assert len(data) == 40 diff --git a/tests/test_0976_path_object_split.py b/tests/test_0976_path_object_split.py index 71638a276..84043f8c7 100644 --- a/tests/test_0976_path_object_split.py +++ b/tests/test_0976_path_object_split.py @@ -64,24 +64,38 @@ ), ), ( - "ssh://user@host:22/path/to/file:object", + "ssh://user@host:22/path/to/file.root:/object//path", ( - "ssh://user@host:22/path/to/file", - "object", + "ssh://user@host:22/path/to/file.root", + "object/path", ), ), ( - "ssh://user@host:50230/path/to/file", + "ssh://user@host:22/path/to/file.root:/object//path:with:colon:in:path/something/", ( - "ssh://user@host:50230/path/to/file", + "ssh://user@host:22/path/to/file.root", + "object/path:with:colon:in:path/something", + ), + ), + ( + "ssh://user@host:50230/path/to/file.root", + ( + "ssh://user@host:50230/path/to/file.root", None, ), ), ( - "s3://bucket/path/to/file:object", + "s3://bucket/path/to/file.root:/dir////object", + ( + "s3://bucket/path/to/file.root", + "dir/object", + ), + ), + ( + "s3://bucket/path/to/file.root:", ( - "s3://bucket/path/to/file", - "object", + "s3://bucket/path/to/file.root", + "", ), ), ( @@ -98,27 +112,56 @@ None, ), ), + # https://github.com/scikit-hep/uproot5/issues/975 ( - "zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt", + "DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root:RNT:CollectionTree", + ( + "DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root", + "RNT:CollectionTree", + ), + ), + ( + "zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip", ( "zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip", "Events/MET_pt", ), ), ( - "simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip:Events/MET_pt", + "simplecache::zip://uproot-issue121.root:Events/MET_pt::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip", ( "simplecache::zip://uproot-issue121.root::file:///tmp/pytest-of-runner/pytest-0/test_fsspec_zip0/uproot-issue121.root.zip", "Events/MET_pt", ), ), ( - r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip:Events/MET_pt", + r"zip://uproot-issue121.root:Events/MET_pt::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip", ( r"zip://uproot-issue121.root::file://C:\Users\runneradmin\AppData\Local\Temp\pytest-of-runneradmin\pytest-0\test_fsspec_zip0\uproot-issue121.root.zip", "Events/MET_pt", ), ), + ( + "zip://uproot-issue121.root:Events/MET_pt::file:///some/weird/path:with:colons/file.root", + ( + "zip://uproot-issue121.root::file:///some/weird/path:with:colons/file.root", + "Events/MET_pt", + ), + ), + ( + "/some/weird/path:with:colons/file.root:Events/MET_pt", + ( + "/some/weird/path:with:colons/file.root", + "Events/MET_pt", + ), + ), + ( + "/some/weird/path:with:colons/file.root", + ( + "/some/weird/path:with:colons/file.root", + None, + ), + ), ], ) def test_url_split(input_value, expected_output): @@ -131,9 +174,12 @@ def test_url_split(input_value, expected_output): @pytest.mark.parametrize( "input_value", [ - "local/file.root://Events", + "local/file.root.zip://Events", + "local/file.roo://Events", + "local/file://Events", ], ) def test_url_split_invalid(input_value): - with pytest.raises(ValueError): - uproot._util.file_object_path_split(input_value) + path, obj = uproot._util.file_object_path_split(input_value) + assert obj is None + assert path == input_value From 0c57861c92828b3430bbf0ca21a99bd58262240d Mon Sep 17 00:00:00 2001 From: Luis Antonio Obis Aparicio <35803280+lobis@users.noreply.github.com> Date: Mon, 20 Nov 2023 14:32:58 -0600 Subject: [PATCH 5/9] feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034) * writing goes through fsspec * increase rc version * type hints and docs * add helper methods, create * throw more specific error * add additional test for `create` failure with scheme other than local * simplify source selection * remove windows specific code * raise exception if invalid combination of handler / input (file-like object and fsspec) * use softer check for file-like object * cover problematic case with additional slash (file:///c:/file.root) * test "file:" scheme (no slash) * test backslash --- src/uproot/_util.py | 139 ++++++++--------------- src/uproot/reading.py | 6 +- src/uproot/sink/file.py | 123 +++++++++++--------- src/uproot/version.py | 2 +- src/uproot/writing/writable.py | 73 ++++-------- tests/test_0325_fix_windows_file_uris.py | 91 --------------- tests/test_0692_fsspec_reading.py | 30 +++++ tests/test_0692_fsspec_writing.py | 46 ++++++++ tests/test_0976_path_object_split.py | 7 ++ 9 files changed, 225 insertions(+), 292 deletions(-) delete mode 100644 tests/test_0325_fix_windows_file_uris.py diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 2017c30ad..44f38e1ca 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -12,17 +12,20 @@ import itertools import numbers import os -import platform import re import warnings from collections.abc import Iterable -from urllib.parse import unquote, urlparse +from pathlib import Path +from typing import IO +from urllib.parse import urlparse import fsspec import numpy import packaging.version -win = platform.system().lower().startswith("win") +import uproot.source.chunk +import uproot.source.fsspec +import uproot.source.object def tobytes(array): @@ -36,7 +39,7 @@ def tobytes(array): return array.tostring() -def isint(x): +def isint(x) -> bool: """ Returns True if and only if ``x`` is an integer (including NumPy, not including bool). @@ -46,7 +49,7 @@ def isint(x): ) -def isnum(x): +def isnum(x) -> bool: """ Returns True if and only if ``x`` is a number (including NumPy, not including bool). @@ -56,7 +59,7 @@ def isnum(x): ) -def ensure_str(x): +def ensure_str(x) -> str: """ Ensures that ``x`` is a string (decoding with 'surrogateescape' if necessary). """ @@ -94,18 +97,17 @@ def is_file_like( obj, readable: bool = False, writable: bool = False, seekable: bool = False ) -> bool: return ( - callable(getattr(obj, "read", None)) - and callable(getattr(obj, "write", None)) - and callable(getattr(obj, "seek", None)) - and callable(getattr(obj, "tell", None)) - and callable(getattr(obj, "flush", None)) + all( + callable(getattr(obj, attr, None)) + for attr in ("read", "write", "seek", "tell", "flush") + ) and (not readable or not hasattr(obj, "readable") or obj.readable()) and (not writable or not hasattr(obj, "writable") or obj.writable()) and (not seekable or not hasattr(obj, "seekable") or obj.seekable()) ) -def parse_version(version): +def parse_version(version: str): """ Converts a semver string into a Version object that can be compared with ``<``, ``>=``, etc. @@ -116,7 +118,7 @@ def parse_version(version): return packaging.version.parse(version) -def from_module(obj, module_name): +def from_module(obj, module_name: str) -> bool: """ Returns True if ``obj`` is an instance of a class from a module given by name. @@ -155,7 +157,7 @@ def _regularize_filter_regex_flags(flags): return flagsbyte -def no_filter(x): +def no_filter(x) -> bool: """ A filter that accepts anything (always returns True). """ @@ -285,10 +287,6 @@ def regularize_path(path): return path -_windows_drive_letter_ending = re.compile(r".*\b[A-Za-z]$") -_windows_absolute_path_pattern = re.compile(r"^[A-Za-z]:[\\/]") -_windows_absolute_path_pattern_slash = re.compile(r"^[\\/][A-Za-z]:[\\/]") - # These schemes may not appear in fsspec if the corresponding libraries are not installed (e.g. s3fs) _remote_schemes = ["root", "s3", "http", "https"] _schemes = list({*_remote_schemes, *fsspec.available_protocols()}) @@ -324,87 +322,48 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]: return urlpath, obj -def file_path_to_source_class(file_path, options): +def file_path_to_source_class( + file_path_or_object: str | Path | IO, options: dict +) -> tuple[type[uproot.source.chunk.Source], str | IO]: """ Use a file path to get the :doc:`uproot.source.chunk.Source` class that would read it. Returns a tuple of (class, file_path) where the class is a subclass of :doc:`uproot.source.chunk.Source`. """ - import uproot.source.chunk - - file_path = regularize_path(file_path) + file_path_or_object: str | IO = regularize_path(file_path_or_object) source_cls = options["handler"] - if source_cls is not None: - if not ( - isinstance(source_cls, type) - and issubclass(source_cls, uproot.source.chunk.Source) + if source_cls is not None and not ( + isinstance(source_cls, type) + and issubclass(source_cls, uproot.source.chunk.Source) + ): + raise TypeError( + f"'handler' is not a class object inheriting from Source: {source_cls!r}" + ) + + # Infer the source class from the file path + if all( + callable(getattr(file_path_or_object, attr, None)) for attr in ("read", "seek") + ): + # need a very soft object check for ubuntu python3.8 pyroot ci tests, cannot use uproot._util.is_file_like + if ( + source_cls is not None + and source_cls is not uproot.source.object.ObjectSource ): raise TypeError( - f"'handler' is not a class object inheriting from Source: {source_cls!r}" + f"'handler' is not ObjectSource for a file-like object: {source_cls!r}" ) - return source_cls, file_path - - if ( - not isinstance(file_path, str) - and hasattr(file_path, "read") - and hasattr(file_path, "seek") - ): - source_cls = uproot.source.object.ObjectSource - return source_cls, file_path - - windows_absolute_path = None - if win and _windows_absolute_path_pattern.match(file_path) is not None: - windows_absolute_path = file_path - - parsed_url = urlparse(file_path) - if parsed_url.scheme.lower() == "file": - parsed_url_path = unquote(parsed_url.path) + return uproot.source.object.ObjectSource, file_path_or_object + elif isinstance(file_path_or_object, str): + source_cls = ( + uproot.source.fsspec.FSSpecSource if source_cls is None else source_cls + ) + return source_cls, file_path_or_object else: - parsed_url_path = parsed_url.path - - if win and windows_absolute_path is None: - if _windows_absolute_path_pattern.match(parsed_url_path) is not None: - windows_absolute_path = parsed_url_path - elif _windows_absolute_path_pattern_slash.match(parsed_url_path) is not None: - windows_absolute_path = parsed_url_path[1:] - - scheme = parsed_url.scheme.lower() - if ( - scheme == "file" - or len(parsed_url.scheme) == 0 - or windows_absolute_path is not None - ): - if windows_absolute_path is None: - if parsed_url.netloc.lower() == "localhost": - file_path = parsed_url_path - else: - file_path = parsed_url.netloc + parsed_url_path - else: - file_path = windows_absolute_path - - # uproot.source.file.MemmapSource - source_cls = uproot.source.fsspec.FSSpecSource - - return source_cls, os.path.expanduser(file_path) - - elif scheme == "root": - # uproot.source.xrootd.XRootDSource - source_cls = uproot.source.fsspec.FSSpecSource - return source_cls, file_path - - elif scheme == "s3": - # uproot.source.s3.S3Source - source_cls = uproot.source.fsspec.FSSpecSource - return source_cls, file_path - - elif scheme in ("http", "https"): - # uproot.source.http.HTTPSource - source_cls = uproot.source.fsspec.FSSpecSource - return source_cls, file_path - - return uproot.source.fsspec.FSSpecSource, file_path + raise TypeError( + f"file_path is not a string or file-like object: {file_path_or_object!r}" + ) if isinstance(__builtins__, dict): @@ -448,7 +407,7 @@ def _file_not_found(files, message=None): ) -def memory_size(data, error_message=None): +def memory_size(data, error_message=None) -> int: """ Regularizes strings like '## kB' and plain integer number of bytes to an integer number of bytes. @@ -739,7 +698,7 @@ def damerau_levenshtein(a, b, ratio=False): # Modified Damerau-Levenshtein distance. Adds a middling penalty # for capitalization. # https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance - M = [[0] * (len(b) + 1) for i in range(len(a) + 1)] + M = [[0] * (len(b) + 1) for _ in range(len(a) + 1)] for i in range(len(a) + 1): M[i][0] = i @@ -771,7 +730,7 @@ def damerau_levenshtein(a, b, ratio=False): # Transpose only M[i][j] = min(M[i][j], M[i - 2][j - 2] + 1) else: - # Traspose and capitalization + # Transpose and capitalization M[i][j] = min(M[i][j], M[i - 2][j - 2] + 1.5) if not ratio: diff --git a/src/uproot/reading.py b/src/uproot/reading.py index 935c6e646..e33b6fc3a 100644 --- a/src/uproot/reading.py +++ b/src/uproot/reading.py @@ -9,10 +9,14 @@ """ from __future__ import annotations +from __future__ import annotations + import struct import sys import uuid from collections.abc import Mapping, MutableMapping +from pathlib import Path +from typing import IO import uproot import uproot.behaviors.TBranch @@ -525,7 +529,7 @@ class ReadOnlyFile(CommonFileMethods): def __init__( self, - file_path, + file_path: str | Path | IO, *, object_cache=100, array_cache="100 MB", diff --git a/src/uproot/sink/file.py b/src/uproot/sink/file.py index 167c8da17..c631e7cc2 100644 --- a/src/uproot/sink/file.py +++ b/src/uproot/sink/file.py @@ -13,6 +13,9 @@ import numbers import os +from typing import IO + +import fsspec import uproot._util @@ -20,69 +23,84 @@ class FileSink: """ Args: - file_path (str): The filesystem path of the file to open. + urlpath_or_file_like (str, Path, or file-like object): If a string or Path, a + filesystem URL that specifies the file to open by fsspec. If a file-like object, it + must have ``read``, ``write``, ``seek``, ``tell``, and ``flush`` methods. + + An object that can write (and read) files on a local or remote filesystem. + It can be initialized from a file-like object (already opened) or a filesystem URL. + If initialized from a filesystem URL, fsspec is used to open the file. + In this case the file is opened in the first read or write operation. + """ - An object that can write (and read) files on a local filesystem, which either opens - a new file from a ``file_path`` in ``"r+b"`` mode or wraps a file-like object - with the :ref:`uproot.sink.file.FileSink.from_object` constructor. + def __init__(self, urlpath_or_file_like: str | IO, **storage_options): + self._open_file = None + self._file = None - With the ``file_path``-based constructor, the file is opened upon first read or - write. - """ + if uproot._util.is_file_like( + urlpath_or_file_like, readable=False, writable=False, seekable=False + ): + self._file = urlpath_or_file_like + + if not uproot._util.is_file_like( + self._file, readable=True, writable=True, seekable=True + ): + raise TypeError( + """writable file can only be created from a file path or an object that supports reading and writing""" + ) + else: + if not self._file_exists(urlpath_or_file_like, **storage_options): + self._truncate_file(urlpath_or_file_like, **storage_options) + + self._open_file = fsspec.open( + urlpath_or_file_like, mode="r+b", **storage_options + ) @classmethod - def from_object(cls, obj) -> FileSink: + def _file_exists(cls, urlpath: str, **storage_options) -> bool: """ Args: - obj (file-like object): An object with ``read``, ``write``, ``seek``, - ``tell``, and ``flush`` methods. + urlpath (str): A filesystem URL that specifies the file to check by fsspec. - Creates a :doc:`uproot.sink.file.FileSink` from a file-like object, such - as ``io.BytesIO``. The object must be readable, writable, and seekable - with ``"r+b"`` mode semantics. + Returns True if the file exists; False otherwise. """ - if uproot._util.is_file_like(obj, readable=True, writable=True, seekable=True): - self = cls(None) - self._file = obj - else: - raise TypeError( - """writable file can only be created from a file path or an object - - * that has 'read', 'write', 'seek', and 'tell' methods - * is 'readable() and writable() and seekable()'""" - ) - return self + fs, local_path = fsspec.core.url_to_fs(urlpath, **storage_options) + return fs.exists(local_path) @classmethod - def from_fsspec(cls, open_file) -> FileSink: - import fsspec + def _truncate_file(cls, urlpath: str, **storage_options) -> None: + """ + Args: + urlpath (str): A filesystem URL that specifies the file to truncate by fsspec. - if not isinstance(open_file, fsspec.core.OpenFile): - raise TypeError("""argument should be of type fsspec.core.OpenFile""") - self = cls(None) - self._fsspec_open_file = open_file - return self + Truncates the file to zero bytes. Creates parent directories if necessary. + """ + fs, local_path = fsspec.core.url_to_fs(urlpath, **storage_options) + parent_directory = fs.sep.join(local_path.split(fs.sep)[:-1]) + fs.mkdirs(parent_directory, exist_ok=True) + fs.touch(local_path, truncate=True) - def __init__(self, file_path: str | None): - self._file_path = file_path - self._file = None - self._fsspec_open_file = None + @property + def from_object(self) -> bool: + """ + True if constructed with a file-like object; False otherwise. + """ + return self._open_file is None @property def file_path(self) -> str | None: """ A path to the file, which is None if constructed with a file-like object. """ - return self._file_path + return self._open_file.path if self._open_file else None def _ensure(self): - if self._file: - return - - if self._fsspec_open_file: - self._file = self._fsspec_open_file.open() - else: - self._file = open(self._file_path, "r+b") + """ + Opens the file if it is not already open. + Sets the file's position to the beginning. + """ + if not self._file: + self._file = self._open_file.open() self._file.seek(0) @@ -95,14 +113,14 @@ def __setstate__(self, state): self.__dict__ = state self._file = None - def tell(self): + def tell(self) -> int: """ Calls the file or file-like object's ``tell`` method. """ self._ensure() return self._file.tell() - def flush(self): + def flush(self) -> None: """ Calls the file or file-like object's ``flush`` method. """ @@ -134,12 +152,9 @@ def __exit__(self, exception_type, exception_value, traceback): @property def in_path(self) -> str: - if self._file_path is None: - return "" - else: - return "\n\nin path: " + self._file_path + return f"\n\nin path: {self.file_path}" if self.file_path is not None else "" - def write(self, location, serialization): + def write(self, location: int, serialization) -> int: """ Args: location (int): Position in the file to write. @@ -150,23 +165,21 @@ def write(self, location, serialization): """ self._ensure() self._file.seek(location) - self._file.write(serialization) + return self._file.write(serialization) - def set_file_length(self, length): + def set_file_length(self, length: int): """ Sets the file's length to ``length``, truncating with zeros if necessary. Calls ``seek``, ``tell``, and possibly ``write``. """ self._ensure() - # self._file.truncate(length) - self._file.seek(0, os.SEEK_END) missing = length - self._file.tell() if missing > 0: self._file.write(b"\x00" * missing) - def read(self, location, num_bytes, insist=True) -> bytes: + def read(self, location: int, num_bytes: int, insist: bool = True) -> bytes: """ Args: location (int): Position in the file to read. diff --git a/src/uproot/version.py b/src/uproot/version.py index dfdb7f2ea..363472158 100644 --- a/src/uproot/version.py +++ b/src/uproot/version.py @@ -12,7 +12,7 @@ import re -__version__ = "5.2.0rc1" +__version__ = "5.2.0rc2" version = __version__ version_info = tuple(re.split(r"[-\.]", __version__)) diff --git a/src/uproot/writing/writable.py b/src/uproot/writing/writable.py index 870e50669..4d1d4b448 100644 --- a/src/uproot/writing/writable.py +++ b/src/uproot/writing/writable.py @@ -20,14 +20,12 @@ import datetime import itertools -import os import queue import sys import uuid from collections.abc import Mapping, MutableMapping from pathlib import Path from typing import IO -from urllib.parse import urlparse import uproot._util import uproot.compression @@ -40,7 +38,7 @@ from uproot._util import no_filter, no_rename -def create(file_path: str | IO, **options): +def create(file_path: str | Path | IO, **options): """ Args: file_path (str, ``pathlib.Path`` or file-like object): The filesystem path of the @@ -48,7 +46,7 @@ def create(file_path: str | IO, **options): options: See below. Opens a local file for writing. Like ROOT's ``"CREATE"`` option, this function - raises an error (``OSError``) if a file already exists at ``file_path``. + raises an error (``FileExistsError``) if a file already exists at ``file_path``. Returns a :doc:`uproot.writing.writable.WritableDirectory`. @@ -62,61 +60,23 @@ def create(file_path: str | IO, **options): the :doc:`uproot.writing.writable.WritableFile`. Default is ``uproot.ZLIB(1)``. See :doc:`uproot.writing.writable.WritableFile` for details on these options. + + Additional options are passed to as ``storage_options`` to the fsspec filesystem """ file_path = uproot._util.regularize_path(file_path) - if isinstance(file_path, str) and os.path.exists(file_path): - raise OSError( + storage_options = { + key: value for key, value in options.items() if key not in create.defaults + } + if isinstance(file_path, str) and uproot.sink.file.FileSink._file_exists( + file_path, **storage_options + ): + raise FileExistsError( "path exists and refusing to overwrite (use 'uproot.recreate' to " f"overwrite)\n\nfor path {file_path}" ) return recreate(file_path, **options) -def _sink_from_path( - file_path_or_object: str | Path | IO, **storage_options -) -> uproot.sink.file.FileSink: - if uproot._util.is_file_like(file_path_or_object): - return uproot.sink.file.FileSink.from_object(file_path_or_object) - - file_path = uproot._util.regularize_path(file_path_or_object) - file_path, obj = uproot._util.file_object_path_split(file_path) - if obj is not None: - raise ValueError(f"file path '{file_path}' cannot contain an object: {obj}") - - parsed_url = urlparse(file_path) - scheme = parsed_url.scheme - - if not scheme: - # no scheme, assume local file - if not os.path.exists(file_path): - # truncate the file - Path(file_path).parent.mkdir(parents=True, exist_ok=True) - with open(file_path, mode="wb"): - pass - return uproot.sink.file.FileSink(file_path) - - # use fsspec to open the file - try: - # TODO: remove try/except block when fsspec becomes a dependency - import fsspec - - # truncate the file if it doesn't exist (also create parent directories) - fs, local_path = fsspec.core.url_to_fs(file_path, **storage_options) - if not fs.exists(local_path): - parent_directory = fs.sep.join(local_path.split(fs.sep)[:-1]) - fs.mkdirs(parent_directory, exist_ok=True) - fs.touch(local_path, truncate=True) - - open_file = fsspec.open(file_path, mode="r+b", **storage_options) - return uproot.sink.file.FileSink.from_fsspec(open_file) - - except ImportError: - raise ImportError( - f"cannot open file path '{file_path}' with scheme '{scheme}' because " - "the fsspec package is not installed." - ) from None - - def recreate(file_path: str | Path | IO, **options): """ Args: @@ -139,13 +99,15 @@ def recreate(file_path: str | Path | IO, **options): the :doc:`uproot.writing.writable.WritableFile`. Default is ``uproot.ZLIB(1)``. See :doc:`uproot.writing.writable.WritableFile` for details on these options. + + Additional options are passed to as ``storage_options`` to the fsspec filesystem. """ + file_path = uproot._util.regularize_path(file_path) storage_options = { key: value for key, value in options.items() if key not in recreate.defaults } - sink = _sink_from_path(file_path, **storage_options) - + sink = uproot.sink.file.FileSink(file_path, **storage_options) compression = options.pop("compression", create.defaults["compression"]) initial_directory_bytes = options.pop( @@ -192,12 +154,15 @@ def update(file_path: str | Path | IO, **options): * uuid_function (callable; ``uuid.uuid1``) See :doc:`uproot.writing.writable.WritableFile` for details on these options. + + Additional options are passed to as ``storage_options`` to the fsspec filesystem """ + file_path = uproot._util.regularize_path(file_path) storage_options = { key: value for key, value in options.items() if key not in update.defaults } - sink = _sink_from_path(file_path, **storage_options) + sink = uproot.sink.file.FileSink(file_path, **storage_options) initial_directory_bytes = options.pop( "initial_directory_bytes", create.defaults["initial_directory_bytes"] diff --git a/tests/test_0325_fix_windows_file_uris.py b/tests/test_0325_fix_windows_file_uris.py deleted file mode 100644 index 39ddfeac2..000000000 --- a/tests/test_0325_fix_windows_file_uris.py +++ /dev/null @@ -1,91 +0,0 @@ -# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE - -import numpy as np -import pytest - -import uproot._util -import uproot.reading - - -@pytest.mark.skipif( - not uproot._util.win, reason="Drive letters only parsed on Windows." -) -def test_windows_drive_letters(): - assert ( - uproot._util.file_path_to_source_class( - "file:///g:/mydir/file.root", uproot.reading.open.defaults - )[1] - == "g:/mydir/file.root" - ) - - assert ( - uproot._util.file_path_to_source_class( - "file:/g:/mydir/file.root", uproot.reading.open.defaults - )[1] - == "g:/mydir/file.root" - ) - - assert ( - uproot._util.file_path_to_source_class( - "file:g:/mydir/file.root", uproot.reading.open.defaults - )[1] - == "g:/mydir/file.root" - ) - - assert ( - uproot._util.file_path_to_source_class( - "/g:/mydir/file.root", uproot.reading.open.defaults - )[1] - == "g:/mydir/file.root" - ) - - assert ( - uproot._util.file_path_to_source_class( - r"\g:/mydir/file.root", uproot.reading.open.defaults - )[1] - == "g:/mydir/file.root" - ) - - assert ( - uproot._util.file_path_to_source_class( - r"g:\mydir\file.root", uproot.reading.open.defaults - )[1] - == r"g:\mydir\file.root" - ) - - assert ( - uproot._util.file_path_to_source_class( - r"\g:\mydir\file.root", uproot.reading.open.defaults - )[1] - == r"g:\mydir\file.root" - ) - - -def test_escaped_uri_codes(): - # If they're file:// paths, yes we should unquote the % signs. - assert ( - uproot._util.file_path_to_source_class( - "file:///my%20file.root", uproot.reading.open.defaults - )[1] - == "/my file.root" - ) - assert ( - uproot._util.file_path_to_source_class( - "file:///my%E2%80%92file.root", uproot.reading.open.defaults - )[1] - == "/my\u2012file.root" - ) - - # Otherwise, no we should not. - assert ( - uproot._util.file_path_to_source_class( - "/my%20file.root", uproot.reading.open.defaults - )[1] - == "/my%20file.root" - ) - assert ( - uproot._util.file_path_to_source_class( - "/my%E2%80%92file.root", uproot.reading.open.defaults - )[1] - == "/my%E2%80%92file.root" - ) diff --git a/tests/test_0692_fsspec_reading.py b/tests/test_0692_fsspec_reading.py index 052d916c6..b24921d80 100644 --- a/tests/test_0692_fsspec_reading.py +++ b/tests/test_0692_fsspec_reading.py @@ -7,6 +7,7 @@ import uproot.source.xrootd import uproot.source.s3 +from typing import BinaryIO import skhep_testdata import queue import fsspec @@ -15,6 +16,35 @@ import sys +@pytest.mark.parametrize( + "urlpath, source_class", + [ + ("file.root", uproot.source.fsspec.FSSpecSource), + ("s3://path/file.root", uproot.source.fsspec.FSSpecSource), + (r"C:\path\file.root", uproot.source.fsspec.FSSpecSource), + (r"file://C:\path\file.root", uproot.source.fsspec.FSSpecSource), + ("root://file.root", uproot.source.fsspec.FSSpecSource), + (BinaryIO(), uproot.source.object.ObjectSource), + ], +) +def test_default_source(urlpath, source_class): + assert uproot._util.file_path_to_source_class( + urlpath, options=uproot.reading.open.defaults + ) == (source_class, urlpath) + + +@pytest.mark.parametrize( + "to_open, handler", + [ + ("file.root", "invalid_handler"), + (BinaryIO(), uproot.source.fsspec.FSSpecSource), + ], +) +def test_invalid_handler(to_open, handler): + with pytest.raises(TypeError): + uproot.open(to_open, handler=handler) + + def test_open_fsspec_http(server): pytest.importorskip("aiohttp") diff --git a/tests/test_0692_fsspec_writing.py b/tests/test_0692_fsspec_writing.py index 5cb73b1ef..3fac30c2f 100644 --- a/tests/test_0692_fsspec_writing.py +++ b/tests/test_0692_fsspec_writing.py @@ -31,6 +31,52 @@ def test_fsspec_writing_local(tmp_path, scheme): assert f["tree"]["x"].array().tolist() == [1, 2, 3] +# https://github.com/scikit-hep/uproot5/issues/325 +@pytest.mark.parametrize( + "scheme", + [ + "", + "file:", # this is not a valid schema, but we should support it + "file://", + "simplecache::file://", + ], +) +@pytest.mark.parametrize( + "filename", ["file.root", "file%2Eroot", "my%E2%80%92file.root", "my%20file.root"] +) +@pytest.mark.parametrize( + "slash_prefix", + ["", "/", "\\"], +) +def test_fsspec_writing_local_uri(tmp_path, scheme, slash_prefix, filename): + uri = scheme + slash_prefix + os.path.join(tmp_path, "some", "path", filename) + print(uri) + with uproot.create(uri) as f: + f["tree"] = {"x": np.array([1, 2, 3])} + with uproot.open(uri) as f: + assert f["tree"]["x"].array().tolist() == [1, 2, 3] + + +@pytest.mark.parametrize( + "scheme", + [ + "", + "file://", + "simplecache::file://", + # "memory://", # uncomment when https://github.com/fsspec/filesystem_spec/pull/1426 is available in pypi + "simplecache::memory://", + ], +) +def test_fsspec_writing_create(tmp_path, scheme): + uri = scheme + os.path.join(tmp_path, "some", "path", "file.root") + with uproot.create(uri) as f: + f["tree"] = {"x": np.array([1, 2, 3])} + + with pytest.raises(FileExistsError): + with uproot.create(uri): + pass + + def test_issue_1029(tmp_path): # https://github.com/scikit-hep/uproot5/issues/1029 urlpath = os.path.join(tmp_path, "some", "path", "file.root") diff --git a/tests/test_0976_path_object_split.py b/tests/test_0976_path_object_split.py index 84043f8c7..be6100e01 100644 --- a/tests/test_0976_path_object_split.py +++ b/tests/test_0976_path_object_split.py @@ -162,6 +162,13 @@ None, ), ), + ( + r"C:\tmp\test\dir\my%20file.root:Dir/Test", + ( + r"C:\tmp\test\dir\my%20file.root", + "Dir/Test", + ), + ), ], ) def test_url_split(input_value, expected_output): From afeeee7cffa1e2f80528ed0fa66126180cdc4d18 Mon Sep 17 00:00:00 2001 From: Luis Antonio Obis Aparicio <35803280+lobis@users.noreply.github.com> Date: Mon, 27 Nov 2023 16:57:28 -0600 Subject: [PATCH 6/9] test: improve path object split tests (#1039) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * feat: set `fsspec` (`s3fs`) as default handler for s3 paths (#1032) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * set version to 5.2.0rc1 (release candidate) * set s3fs as default for s3 * test different handlers * correct serialization of fsspec source * feat: simplify object path split (#1028) * simplify object path split * add example from https://github.com/scikit-hep/uproot5/issues/975 * fix tests * add more test cases * test case update * remove scheme unused regex * feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034) * writing goes through fsspec * increase rc version * type hints and docs * add helper methods, create * throw more specific error * add additional test for `create` failure with scheme other than local * simplify source selection * remove windows specific code * raise exception if invalid combination of handler / input (file-like object and fsspec) * use softer check for file-like object * cover problematic case with additional slash (file:///c:/file.root) * test "file:" scheme (no slash) * test backslash * add new test case * split big test in two * retry on socket error * xrootd iterator * iterate over different files * iterate over tree * pytest fixture for test directory * pytest fixture for test directory * add annotation to open argument * remove repeated test --- src/uproot/reading.py | 4 +- src/uproot/version.py | 2 +- tests/conftest.py | 6 ++ ..._dynamic_classes_cant_be_abc_subclasses.py | 5 +- tests/test_0692_fsspec_reading.py | 65 +++++++++++++ tests/test_0976_path_object_split.py | 91 +++++++++++-------- 6 files changed, 131 insertions(+), 42 deletions(-) diff --git a/src/uproot/reading.py b/src/uproot/reading.py index e33b6fc3a..d046116a8 100644 --- a/src/uproot/reading.py +++ b/src/uproot/reading.py @@ -9,8 +9,6 @@ """ from __future__ import annotations -from __future__ import annotations - import struct import sys import uuid @@ -25,7 +23,7 @@ def open( - path, + path: str | Path | IO | dict[str | Path | IO, str], *, object_cache=100, array_cache="100 MB", diff --git a/src/uproot/version.py b/src/uproot/version.py index 363472158..7dc2ff112 100644 --- a/src/uproot/version.py +++ b/src/uproot/version.py @@ -12,7 +12,7 @@ import re -__version__ = "5.2.0rc2" +__version__ = "5.2.0rc3" version = __version__ version_info = tuple(re.split(r"[-\.]", __version__)) diff --git a/tests/conftest.py b/tests/conftest.py index 86683a71d..e3c4b74a4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,7 @@ import contextlib import skhep_testdata from functools import partial +import os # The base http server does not support range requests. Watch https://github.com/python/cpython/issues/86809 for updates from http.server import HTTPServer @@ -69,3 +70,8 @@ def serve_forever(httpd=server): def server(): with serve() as server_url: yield server_url + + +@pytest.fixture(scope="module") +def tests_directory() -> str: + return os.path.dirname(os.path.realpath(__file__)) diff --git a/tests/test_0520_dynamic_classes_cant_be_abc_subclasses.py b/tests/test_0520_dynamic_classes_cant_be_abc_subclasses.py index c50407149..14397829f 100644 --- a/tests/test_0520_dynamic_classes_cant_be_abc_subclasses.py +++ b/tests/test_0520_dynamic_classes_cant_be_abc_subclasses.py @@ -2,6 +2,7 @@ import pickle import sys +import os import uproot import pytest @@ -11,6 +12,6 @@ sys.version_info < (3, 7), reason="Dynamic types depend on module __getattr__, a Python 3.7+ feature.", ) -def test(): - with open("tests/samples/h_dynamic.pkl", "rb") as f: +def test_pickle(tests_directory): + with open(os.path.join(tests_directory, "samples/h_dynamic.pkl"), "rb") as f: assert len(list(pickle.load(f).axis(0))) == 100 diff --git a/tests/test_0692_fsspec_reading.py b/tests/test_0692_fsspec_reading.py index b24921d80..d2851c0d1 100644 --- a/tests/test_0692_fsspec_reading.py +++ b/tests/test_0692_fsspec_reading.py @@ -244,6 +244,71 @@ def test_fsspec_zip(tmp_path): assert len(data) == 40 +@pytest.mark.network +@pytest.mark.xrootd +@pytest.mark.parametrize( + "handler", + [ + uproot.source.fsspec.FSSpecSource, + uproot.source.xrootd.XRootDSource, + None, + ], +) +def test_open_fsspec_xrootd_iterate_files(handler): + pytest.importorskip("XRootD") + + iterator = uproot.iterate( + files=[ + { + "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_DoubleMuParked.root": "Events" + }, + { + "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_DoubleMuParked.root": "Events" + }, + ], + expressions=["run"], + step_size=100, + handler=handler, + ) + + for i, data in enumerate(iterator): + if i >= 5: + break + assert len(data) == 100 + assert all(data["run"] == 194778) + + +@pytest.mark.network +@pytest.mark.xrootd +@pytest.mark.parametrize( + "handler", + [ + uproot.source.fsspec.FSSpecSource, + uproot.source.xrootd.XRootDSource, + None, + ], +) +def test_open_fsspec_xrootd_iterate_tree(handler): + pytest.importorskip("XRootD") + + with uproot.open( + { + "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_DoubleMuParked.root": "Events" + }, + handler=handler, + ) as f: + iterator = f.iterate( + ["run"], + step_size=100, + ) + + for i, data in enumerate(iterator): + if i >= 5: + break + assert len(data) == 100 + assert all(data["run"] == 194778) + + # https://github.com/scikit-hep/uproot5/issues/1035 @pytest.mark.parametrize( "handler", diff --git a/tests/test_0976_path_object_split.py b/tests/test_0976_path_object_split.py index be6100e01..a5617fa18 100644 --- a/tests/test_0976_path_object_split.py +++ b/tests/test_0976_path_object_split.py @@ -14,13 +14,6 @@ "Events", ), ), - ( - "https://github.com/scikit-hep/scikit-hep-testdata/raw/v0.4.33/src/skhep_testdata/data/uproot-issue121.root", - ( - "https://github.com/scikit-hep/scikit-hep-testdata/raw/v0.4.33/src/skhep_testdata/data/uproot-issue121.root", - None, - ), - ), ( "github://scikit-hep:scikit-hep-testdata@v0.4.33/src/skhep_testdata/data/uproot-issue121.root:Dir/Events", ( @@ -28,13 +21,6 @@ "Dir/Events", ), ), - ( - "github://scikit-hep:scikit-hep-testdata@v0.4.33/src/skhep_testdata/data/uproot-issue121.root", - ( - "github://scikit-hep:scikit-hep-testdata@v0.4.33/src/skhep_testdata/data/uproot-issue121.root", - None, - ), - ), ( " http://localhost:8080/dir/test.root: Test ", ( @@ -56,13 +42,6 @@ "Dir/Test", ), ), - ( - r"C:\tmp\test\dir\file.root", - ( - r"C:\tmp\test\dir\file.root", - None, - ), - ), ( "ssh://user@host:22/path/to/file.root:/object//path", ( @@ -78,10 +57,31 @@ ), ), ( - "ssh://user@host:50230/path/to/file.root", + "s3://bucket/path/to/file.root:/dir////object", ( - "ssh://user@host:50230/path/to/file.root", - None, + "s3://bucket/path/to/file.root", + "dir/object", + ), + ), + ( + "s3://bucket/path/to/file.root:", + ( + "s3://bucket/path/to/file.root", + "", + ), + ), + ( + "ssh://user@host:22/path/to/file.root:/object//path", + ( + "ssh://user@host:22/path/to/file.root", + "object/path", + ), + ), + ( + "ssh://user@host:22/path/to/file.root:/object//path:with:colon:in:path/something/", + ( + "ssh://user@host:22/path/to/file.root", + "object/path:with:colon:in:path/something", ), ), ( @@ -105,11 +105,12 @@ "Events", ), ), + # https://github.com/scikit-hep/uproot5/issues/975 ( - "00376186-543E-E311-8D30-002618943857.root", + "DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root:RNT:CollectionTree", ( - "00376186-543E-E311-8D30-002618943857.root", - None, + "DAOD_PHYSLITE_2023-09-13T1230.art.rntuple.root", + "RNT:CollectionTree", ), ), # https://github.com/scikit-hep/uproot5/issues/975 @@ -155,13 +156,6 @@ "Events/MET_pt", ), ), - ( - "/some/weird/path:with:colons/file.root", - ( - "/some/weird/path:with:colons/file.root", - None, - ), - ), ( r"C:\tmp\test\dir\my%20file.root:Dir/Test", ( @@ -178,15 +172,40 @@ def test_url_split(input_value, expected_output): assert obj == obj_expected +@pytest.mark.parametrize( + "input_value", + [ + "/some/weird/path:with:colons/file.root", + "00376186-543E-E311-8D30-002618943857.root", + " file.root", + "dir/file with spaces.root", + "ssh://user@host:50230/path/to/file.root", + r"C:\tmp\test\dir\file.root", + "github://scikit-hep:scikit-hep-testdata@v0.4.33/src/skhep_testdata/data/uproot-issue121.root", + "https://github.com/scikit-hep/scikit-hep-testdata/raw/v0.4.33/src/skhep_testdata/data/uproot-issue121.root", + "root://xcache.af.uchicago.edu:1094//root://fax.mwt2.org:1094//pnfs/uchicago.edu/atlaslocalgroupdisk/rucio/data18_13TeV/df/a4/DAOD_PHYSLITE.34858087._000001.pool.root.1", + "root://xcacheserver:2222//root://originserver:1111/path/file", + "https://xcacheserver:1111//root[s]://originserver:12222/path/file", + "roots://xcacheserver:2312//https://originserver:3122/path/file", + "http://xcacheserver:8762//https://originserver:4212/path/file", + ], +) +def test_url_no_split(input_value): + url, obj = uproot._util.file_object_path_split(input_value) + assert obj is None + assert url == input_value.strip() + + @pytest.mark.parametrize( "input_value", [ "local/file.root.zip://Events", "local/file.roo://Events", "local/file://Events", + "http://xcacheserver:8762//https://originserver:4212/path/file.root.1:CollectionTree", ], ) def test_url_split_invalid(input_value): - path, obj = uproot._util.file_object_path_split(input_value) + url, obj = uproot._util.file_object_path_split(input_value) assert obj is None - assert path == input_value + assert url == input_value From 6330f65ef8b6331b1195f21194e270fdbbffed74 Mon Sep 17 00:00:00 2001 From: Luis Antonio Obis Aparicio <35803280+lobis@users.noreply.github.com> Date: Mon, 11 Dec 2023 23:36:48 +0100 Subject: [PATCH 7/9] test: add test for issue 1054 (newer fsspec failing to parse files with colons in name) (#1055) * add test for issue 1054 * additional test * make sure fsspec fix works * try new test in older fsspec version (need to test windows) * skip test in windows due to colons in name * add explicit object-path split with open * revert use fsspec fork in ci --- src/uproot/version.py | 2 +- tests/test_0692_fsspec_reading.py | 57 +++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/uproot/version.py b/src/uproot/version.py index 7dc2ff112..05897ca84 100644 --- a/src/uproot/version.py +++ b/src/uproot/version.py @@ -12,7 +12,7 @@ import re -__version__ = "5.2.0rc3" +__version__ = "5.2.0rc4" version = __version__ version_info = tuple(re.split(r"[-\.]", __version__)) diff --git a/tests/test_0692_fsspec_reading.py b/tests/test_0692_fsspec_reading.py index d2851c0d1..18dbaab1f 100644 --- a/tests/test_0692_fsspec_reading.py +++ b/tests/test_0692_fsspec_reading.py @@ -15,6 +15,8 @@ import os import sys +is_windows = sys.platform.startswith("win") + @pytest.mark.parametrize( "urlpath, source_class", @@ -166,6 +168,61 @@ def test_open_fsspec_xrootd(handler): assert (data == 194778).all() +@pytest.mark.parametrize( + "handler", + [ + uproot.source.file.MemmapSource, + uproot.source.file.MultithreadedFileSource, + uproot.source.fsspec.FSSpecSource, + None, + ], +) +@pytest.mark.skipif( + is_windows, reason="Windows does not support colons (':') in filenames" +) +def test_issue_1054_filename_colons(handler): + root_filename = "uproot-issue121.root" + local_path = str(skhep_testdata.data_path(root_filename)) + local_path_new = local_path[: -len(root_filename)] + "file:with:colons.root" + os.rename(local_path, local_path_new) + with uproot.open(local_path_new, handler=handler) as f: + data = f["Events/MET_pt"].array(library="np") + assert len(data) == 40 + + with uproot.open(local_path_new + ":Events", handler=handler) as tree: + data = tree["MET_pt"].array(library="np") + assert len(data) == 40 + + with uproot.open(local_path_new + ":Events/MET_pt", handler=handler) as branch: + data = branch.array(library="np") + assert len(data) == 40 + + +@pytest.mark.parametrize( + "handler", + [ + uproot.source.file.MemmapSource, + uproot.source.file.MultithreadedFileSource, + uproot.source.fsspec.FSSpecSource, + None, + ], +) +def test_issue_1054_object_path_split(handler): + root_filename = "uproot-issue121.root" + local_path = str(skhep_testdata.data_path(root_filename)) + with uproot.open(local_path, handler=handler) as f: + data = f["Events/MET_pt"].array(library="np") + assert len(data) == 40 + + with uproot.open(local_path + ":Events", handler=handler) as tree: + data = tree["MET_pt"].array(library="np") + assert len(data) == 40 + + with uproot.open(local_path + ":Events/MET_pt", handler=handler) as branch: + data = branch.array(library="np") + assert len(data) == 40 + + def test_fsspec_chunks(server): pytest.importorskip("aiohttp") From 5dc898f4a69eb9fcb53ab86f651bb300cd33e027 Mon Sep 17 00:00:00 2001 From: Luis Antonio Obis Aparicio <35803280+lobis@users.noreply.github.com> Date: Wed, 13 Dec 2023 19:41:23 +0100 Subject: [PATCH 8/9] feat: globbing with fsspec (#1061) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * feat: set `fsspec` (`s3fs`) as default handler for s3 paths (#1032) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * set version to 5.2.0rc1 (release candidate) * set s3fs as default for s3 * test different handlers * correct serialization of fsspec source * feat: simplify object path split (#1028) * simplify object path split * add example from https://github.com/scikit-hep/uproot5/issues/975 * fix tests * add more test cases * test case update * remove scheme unused regex * feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034) * writing goes through fsspec * increase rc version * type hints and docs * add helper methods, create * throw more specific error * add additional test for `create` failure with scheme other than local * simplify source selection * remove windows specific code * raise exception if invalid combination of handler / input (file-like object and fsspec) * use softer check for file-like object * cover problematic case with additional slash (file:///c:/file.root) * test "file:" scheme (no slash) * test backslash * test: improve path object split tests (#1039) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * feat: set `fsspec` (`s3fs`) as default handler for s3 paths (#1032) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * set version to 5.2.0rc1 (release candidate) * set s3fs as default for s3 * test different handlers * correct serialization of fsspec source * feat: simplify object path split (#1028) * simplify object path split * add example from https://github.com/scikit-hep/uproot5/issues/975 * fix tests * add more test cases * test case update * remove scheme unused regex * feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034) * writing goes through fsspec * increase rc version * type hints and docs * add helper methods, create * throw more specific error * add additional test for `create` failure with scheme other than local * simplify source selection * remove windows specific code * raise exception if invalid combination of handler / input (file-like object and fsspec) * use softer check for file-like object * cover problematic case with additional slash (file:///c:/file.root) * test "file:" scheme (no slash) * test backslash * add new test case * split big test in two * retry on socket error * xrootd iterator * iterate over different files * iterate over tree * pytest fixture for test directory * pytest fixture for test directory * add annotation to open argument * remove repeated test * test: add test for issue 1054 (newer fsspec failing to parse files with colons in name) (#1055) * add test for issue 1054 * additional test * make sure fsspec fix works * try new test in older fsspec version (need to test windows) * skip test in windows due to colons in name * add explicit object-path split with open * revert use fsspec fork in ci * use fsspec to expand glob * skip root from remote_schemas * test iterate over xrootd * test * add temporary install to ci * remove ci debug * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * feat: set `fsspec` (`s3fs`) as default handler for s3 paths (#1032) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * set version to 5.2.0rc1 (release candidate) * set s3fs as default for s3 * test different handlers * correct serialization of fsspec source * feat: simplify object path split (#1028) * simplify object path split * add example from https://github.com/scikit-hep/uproot5/issues/975 * fix tests * add more test cases * test case update * remove scheme unused regex * feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034) * writing goes through fsspec * increase rc version * type hints and docs * add helper methods, create * throw more specific error * add additional test for `create` failure with scheme other than local * simplify source selection * remove windows specific code * raise exception if invalid combination of handler / input (file-like object and fsspec) * use softer check for file-like object * cover problematic case with additional slash (file:///c:/file.root) * test "file:" scheme (no slash) * test backslash * test: add test for issue 1054 (newer fsspec failing to parse files with colons in name) (#1055) * add test for issue 1054 * additional test * make sure fsspec fix works * try new test in older fsspec version (need to test windows) * skip test in windows due to colons in name * add explicit object-path split with open * revert use fsspec fork in ci * test: improve path object split tests (#1039) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * feat: set `fsspec` (`s3fs`) as default handler for s3 paths (#1032) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * feat: set fsspec as default source (#1023) * feat: add fsspec as required dependency (#1021) * fsspec requirements * simplify fsspec import * use loop property * correctly create schemes list * remove deprecated handlers from docs * simplify source selection * return object source * pickle executor * rename test * test more handlers * option to check writeable file-like object * rename test * explicitly set handler * fix s3 source * rename test * Revert "fix s3 source" This reverts commit e76fdbb1e3011a2ad3bd9ea8cb4634b43caa6af6. * sesparate PR for s3 fix (https://github.com/scikit-hep/uproot5/pull/1024) * strip file:// * rename test * rename tests * add aiohttp skip * attempt to parse windows paths * test ci * Revert "test ci" This reverts commit 4c1c8a5bdc3cbbab540c125807b1139a51e9c455. * rename test * remove fsspec from test * remove *_handler options * update defaults * do not override default s3 * do not use fsspec for multiprocessing * rename test * fix not selecting object source * missing import * normalize doc * remove helper * never return None as source * remove unnecessary xrootd source default override since fsspec is default now * rename test * add empty class to pass old pickle test * set version to 5.2.0rc1 (release candidate) * set s3fs as default for s3 * test different handlers * correct serialization of fsspec source * feat: simplify object path split (#1028) * simplify object path split * add example from https://github.com/scikit-hep/uproot5/issues/975 * fix tests * add more test cases * test case update * remove scheme unused regex * feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034) * writing goes through fsspec * increase rc version * type hints and docs * add helper methods, create * throw more specific error * add additional test for `create` failure with scheme other than local * simplify source selection * remove windows specific code * raise exception if invalid combination of handler / input (file-like object and fsspec) * use softer check for file-like object * cover problematic case with additional slash (file:///c:/file.root) * test "file:" scheme (no slash) * test backslash * add new test case * split big test in two * retry on socket error * xrootd iterator * iterate over different files * iterate over tree * pytest fixture for test directory * pytest fixture for test directory * add annotation to open argument * remove repeated test * test: add test for issue 1054 (newer fsspec failing to parse files with colons in name) (#1055) * add test for issue 1054 * additional test * make sure fsspec fix works * try new test in older fsspec version (need to test windows) * skip test in windows due to colons in name * add explicit object-path split with open * revert use fsspec fork in ci * try to expand all glob strings if they have the protocol * making it work on windows * testing globbing for s3 * add failing test for http globbing * test more handlers, failing test for xrootd (missing files) * understanding error * add class method to extract fsspec options * call super constructor for fsspec source * pass options to regularize files util * python 3.12 aiohttp test in other PR * attempt to hide the ssl destructor error * retry on "expired" * style: pre-commit fixes --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/uproot/_dask.py | 2 +- src/uproot/_util.py | 36 +++++++---- src/uproot/behaviors/TBranch.py | 4 +- src/uproot/models/TTree.py | 1 - src/uproot/source/fsspec.py | 25 ++++---- tests/test_0692_fsspec_reading.py | 99 +++++++++++++++++++++++++++++++ 6 files changed, 137 insertions(+), 30 deletions(-) diff --git a/src/uproot/_dask.py b/src/uproot/_dask.py index 38bb213c7..fd66335c4 100644 --- a/src/uproot/_dask.py +++ b/src/uproot/_dask.py @@ -161,7 +161,7 @@ def dask( array from ``TTrees``. """ - files = uproot._util.regularize_files(files, steps_allowed=True) + files = uproot._util.regularize_files(files, steps_allowed=True, **options) is_3arg = [len(x) == 3 for x in files] if any(is_3arg): diff --git a/src/uproot/_util.py b/src/uproot/_util.py index 44f38e1ca..952232f35 100644 --- a/src/uproot/_util.py +++ b/src/uproot/_util.py @@ -287,11 +287,6 @@ def regularize_path(path): return path -# These schemes may not appear in fsspec if the corresponding libraries are not installed (e.g. s3fs) -_remote_schemes = ["root", "s3", "http", "https"] -_schemes = list({*_remote_schemes, *fsspec.available_protocols()}) - - def file_object_path_split(urlpath: str) -> tuple[str, str | None]: """ Split a path with a colon into a file path and an object-in-file path. @@ -815,7 +810,9 @@ def regularize_steps(steps): return out.tolist() -def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allowed): +def _regularize_files_inner( + files, parse_colon, counter, HasBranches, steps_allowed, **options +): files2 = regularize_path(files) maybe_steps = None @@ -830,12 +827,24 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo else: file_path, object_path = files, None + # This parses the windows drive letter as a scheme! parsed_url = urlparse(file_path) - - if parsed_url.scheme.lower() in _remote_schemes: - yield file_path, object_path, maybe_steps - + scheme = parsed_url.scheme + if "://" in file_path and scheme not in ("file", "local"): + # user specified a protocol, so we use fsspec to expand the glob and return the full paths + file_names_full = [ + file.full_name + for file in fsspec.open_files( + files, + **uproot.source.fsspec.FSSpecSource.extract_fsspec_options(options), + ) + ] + # https://github.com/fsspec/filesystem_spec/issues/1459 + # Not all protocols return the full_name attribute correctly (if they have url parameters) + for file_name_full in file_names_full: + yield file_name_full, object_path, maybe_steps else: + # no protocol, default to local file system expanded = os.path.expanduser(file_path) if _regularize_files_isglob.search(expanded) is None: yield file_path, object_path, maybe_steps @@ -885,6 +894,7 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo counter, HasBranches, steps_allowed, + **options, ): yield file_path, object_path, maybe_steps @@ -892,7 +902,7 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo for file in files: counter[0] += 1 for file_path, object_path, maybe_steps in _regularize_files_inner( - file, parse_colon, counter, HasBranches, steps_allowed + file, parse_colon, counter, HasBranches, steps_allowed, **options ): yield file_path, object_path, maybe_steps @@ -905,7 +915,7 @@ def _regularize_files_inner(files, parse_colon, counter, HasBranches, steps_allo ) -def regularize_files(files, steps_allowed): +def regularize_files(files, steps_allowed, **options): """ Common code for regularizing the possible file inputs accepted by uproot so they can be used by uproot internal functions. """ @@ -915,7 +925,7 @@ def regularize_files(files, steps_allowed): seen = set() counter = [0] for file_path, object_path, maybe_steps in _regularize_files_inner( - files, True, counter, HasBranches, steps_allowed + files, True, counter, HasBranches, steps_allowed, **options ): if isinstance(file_path, str): key = (counter[0], file_path, object_path) diff --git a/src/uproot/behaviors/TBranch.py b/src/uproot/behaviors/TBranch.py index a82603176..5c6325e48 100644 --- a/src/uproot/behaviors/TBranch.py +++ b/src/uproot/behaviors/TBranch.py @@ -174,7 +174,7 @@ def iterate( array from ``TTrees``. * :doc:`uproot._dask.dask`: returns an unevaluated Dask array from ``TTrees``. """ - files = uproot._util.regularize_files(files, steps_allowed=False) + files = uproot._util.regularize_files(files, steps_allowed=False, **options) decompression_executor, interpretation_executor = _regularize_executors( decompression_executor, interpretation_executor, None ) @@ -340,7 +340,7 @@ def concatenate( single concatenated array from ``TTrees``. * :doc:`uproot._dask.dask`: returns an unevaluated Dask array from ``TTrees``. """ - files = uproot._util.regularize_files(files, steps_allowed=False) + files = uproot._util.regularize_files(files, steps_allowed=False, **options) decompression_executor, interpretation_executor = _regularize_executors( decompression_executor, interpretation_executor, None ) diff --git a/src/uproot/models/TTree.py b/src/uproot/models/TTree.py index 90342cf6c..360561cdd 100644 --- a/src/uproot/models/TTree.py +++ b/src/uproot/models/TTree.py @@ -906,7 +906,6 @@ def read_members(self, chunk, cursor, context, file): uproot.classes["TTree"] = Model_TTree uproot.classes["ROOT::TIOFeatures"] = Model_ROOT_3a3a_TIOFeatures - fEntriesStruct = struct.Struct(">q") diff --git a/src/uproot/source/fsspec.py b/src/uproot/source/fsspec.py index 0773d1c47..ac36eee55 100644 --- a/src/uproot/source/fsspec.py +++ b/src/uproot/source/fsspec.py @@ -27,30 +27,29 @@ class FSSpecSource(uproot.source.chunk.Source): """ def __init__(self, file_path: str, **options): - options = dict(uproot.reading.open.defaults, **options) - storage_options = { - k: v - for k, v in options.items() - if k not in uproot.reading.open.defaults.keys() - } - - self._fs, self._file_path = fsspec.core.url_to_fs(file_path, **storage_options) + super().__init__() + self._fs, self._file_path = fsspec.core.url_to_fs( + file_path, **self.extract_fsspec_options(options) + ) # What should we do when there is a chain of filesystems? self._async_impl = self._fs.async_impl - self._executor = None self._file = None self._fh = None - self._num_requests = 0 - self._num_requested_chunks = 0 - self._num_requested_bytes = 0 - self._open() self.__enter__() + @classmethod + def extract_fsspec_options(cls, options: dict) -> dict: + uproot_default_options = dict(uproot.reading.open.defaults) + options = dict(uproot_default_options, **options) + return { + k: v for k, v in options.items() if k not in uproot_default_options.keys() + } + def _open(self): self._executor = FSSpecLoopExecutor() self._file = self._fs.open(self._file_path) diff --git a/tests/test_0692_fsspec_reading.py b/tests/test_0692_fsspec_reading.py index 18dbaab1f..2bad9136f 100644 --- a/tests/test_0692_fsspec_reading.py +++ b/tests/test_0692_fsspec_reading.py @@ -388,3 +388,102 @@ def test_issue_1035(handler): branch = tree["MuonSpectrometerTrackParticlesAuxDyn.truthParticleLink"] data = branch.array() assert len(data) == 40 + + +@pytest.mark.network +@pytest.mark.xrootd +@pytest.mark.parametrize( + "handler", + [ + uproot.source.fsspec.FSSpecSource, + None, + ], +) +def test_fsspec_globbing_xrootd(handler): + pytest.importorskip("XRootD") + pytest.importorskip("fsspec_xrootd") + iterator = uproot.iterate( + { + "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/Run2012B_*.root": "Events" + }, + ["PV_x"], + handler=handler, + ) + + arrays = [array for array in iterator] + # if more files are added that match the glob, this test needs to be updated + assert len(arrays) == 2 + + +@pytest.mark.network +@pytest.mark.xrootd +@pytest.mark.parametrize( + "handler", + [ + uproot.source.fsspec.FSSpecSource, + None, + ], +) +def test_fsspec_globbing_xrootd_no_files(handler): + pytest.importorskip("XRootD") + pytest.importorskip("fsspec_xrootd") + iterator = uproot.iterate( + { + "root://eospublic.cern.ch//eos/root-eos/cms_opendata_2012_nanoaod/*/ThisFileShouldNotExist.root": "Events" + }, + ["PV_x"], + handler=handler, + ) + with pytest.raises(FileNotFoundError): + arrays = [array for array in iterator] + + +@pytest.mark.parametrize( + "handler", + [ + uproot.source.fsspec.FSSpecSource, + None, + ], +) +def test_fsspec_globbing_s3(handler): + pytest.importorskip("s3fs") + if sys.version_info < (3, 11): + pytest.skip( + "https://github.com/scikit-hep/uproot5/pull/1012", + ) + + iterator = uproot.iterate( + {"s3://pivarski-princeton/pythia_ppZee_run17emb.*.root": "PicoDst"}, + ["Event/Event.mEventId"], + anon=True, + handler=handler, + ) + + # if more files are added that match the glob, this test needs to be updated + arrays = [array for array in iterator] + assert len(arrays) == 1 + for array in arrays: + assert len(array) == 8004 + + +@pytest.mark.parametrize( + "handler", + [ + uproot.source.fsspec.FSSpecSource, + None, + ], +) +def test_fsspec_globbing_http(handler): + pytest.importorskip("aiohttp") + + # Globbing does not work with http filesystems and will return an empty list of files + # We leave this test here to be notified when this feature is added + iterator = uproot.iterate( + { + "https://github.com/scikit-hep/scikit-hep-testdata/raw/main/src/skhep_testdata/data/uproot-issue*.root": "Events" + }, + ["MET_pt"], + handler=handler, + ) + with pytest.raises(FileNotFoundError): + arrays = [array for array in iterator] From 9d597dc6b355640185289655a3a798ce1118b0dc Mon Sep 17 00:00:00 2001 From: Luis Antonio Obis Aparicio <35803280+lobis@users.noreply.github.com> Date: Wed, 13 Dec 2023 22:23:17 +0100 Subject: [PATCH 9/9] fix: dask distributed fsspec issue (#1065) * add dask distributed to dev dependencies * work on test for issue 1063 * test multiple handlers * file handle pickling * clear file handle before pickling --- pyproject.toml | 2 +- src/uproot/source/fsspec.py | 1 + tests/test_1063_dask_distributed.py | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 tests/test_1063_dask_distributed.py diff --git a/pyproject.toml b/pyproject.toml index d22888b33..e0f2f9b57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ requires-python = ">=3.8" dev = [ "boost_histogram>=0.13", "dask-awkward>=2023.12.1", - "dask[array]", + "dask[array,distributed]", "hist>=1.2", "pandas", "awkward-pandas" diff --git a/src/uproot/source/fsspec.py b/src/uproot/source/fsspec.py index ac36eee55..a1061d606 100644 --- a/src/uproot/source/fsspec.py +++ b/src/uproot/source/fsspec.py @@ -61,6 +61,7 @@ def __repr__(self): return f"<{type(self).__name__} {path} at 0x{id(self):012x}>" def __getstate__(self): + self._fh = None state = dict(self.__dict__) state.pop("_executor") state.pop("_file") diff --git a/tests/test_1063_dask_distributed.py b/tests/test_1063_dask_distributed.py new file mode 100644 index 000000000..502a22107 --- /dev/null +++ b/tests/test_1063_dask_distributed.py @@ -0,0 +1,22 @@ +import pytest +import skhep_testdata + +import uproot +import uproot.source.file +import uproot.source.fsspec + +dask = pytest.importorskip("dask") +dask_awkward = pytest.importorskip("dask_awkward") +dask_distributed = pytest.importorskip("dask.distributed") + + +@pytest.mark.parametrize( + "handler", + [None, uproot.source.file.MemmapSource, uproot.source.fsspec.FSSpecSource], +) +def test_issue_1063(handler): + file_path = skhep_testdata.data_path("uproot-issue121.root") + + with dask_distributed.Client(): + events = uproot.dask({file_path: "Events"}, handler=handler) + dask.compute(events.Muon_pt)