From 26a94dfad2300f564091e039249a97007a621665 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 23 Oct 2024 23:50:04 -0600 Subject: [PATCH 01/98] wip toward zarr v2 reader --- virtualizarr/backend.py | 4 +- virtualizarr/readers/__init__.py | 4 +- virtualizarr/readers/zarrV2V3.py | 375 +++++++++++++++++++++++++++++++ virtualizarr/readers/zarr_v3.py | 154 ------------- 4 files changed, 379 insertions(+), 158 deletions(-) create mode 100644 virtualizarr/readers/zarrV2V3.py delete mode 100644 virtualizarr/readers/zarr_v3.py diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 32403d04..f715605b 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -17,14 +17,14 @@ KerchunkVirtualBackend, NetCDF3VirtualBackend, TIFFVirtualBackend, - ZarrV3VirtualBackend, + ZarrVirtualBackend, ) from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions # TODO add entrypoint to allow external libraries to add to this mapping VIRTUAL_BACKENDS = { "kerchunk": KerchunkVirtualBackend, - "zarr_v3": ZarrV3VirtualBackend, + "zarr_v3": ZarrVirtualBackend, "dmrpp": DMRPPVirtualBackend, # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends) "netcdf3": NetCDF3VirtualBackend, diff --git a/virtualizarr/readers/__init__.py b/virtualizarr/readers/__init__.py index 0f83ba39..ac4d66f9 100644 --- a/virtualizarr/readers/__init__.py +++ b/virtualizarr/readers/__init__.py @@ -4,7 +4,7 @@ from virtualizarr.readers.kerchunk import KerchunkVirtualBackend from virtualizarr.readers.netcdf3 import NetCDF3VirtualBackend from virtualizarr.readers.tiff import TIFFVirtualBackend -from virtualizarr.readers.zarr_v3 import ZarrV3VirtualBackend +from virtualizarr.readers.zarrV2V3 import ZarrVirtualBackend __all__ = [ "DMRPPVirtualBackend", @@ -13,5 +13,5 @@ "KerchunkVirtualBackend", "NetCDF3VirtualBackend", "TIFFVirtualBackend", - "ZarrV3VirtualBackend", + "ZarrVirtualBackend", ] diff --git a/virtualizarr/readers/zarrV2V3.py b/virtualizarr/readers/zarrV2V3.py new file mode 100644 index 00000000..854ec056 --- /dev/null +++ b/virtualizarr/readers/zarrV2V3.py @@ -0,0 +1,375 @@ +import json +from pathlib import Path +from typing import Iterable, Mapping, Optional + +import numcodecs +import numpy as np +from xarray import Dataset, Index, Variable + +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.common import VirtualBackend, separate_coords +from virtualizarr.zarr import ZArray +from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions + + + + +class ZarrVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + """ + Read a Zarr store containing chunk manifests and return an xarray Dataset containing virtualized arrays. + + """ + + + + ######### tmp testing! 
############ + reader_options={} + loadable_variables='time' + filepath = 'tmp_2_chunk.zarr' + + # check that Zarr is V3 + # 1a + from packaging import version + import zarr + + if version.parse(zarr.__version__).major < 3: + raise ImportError(f"Zarr V3 is required") + + # If drop_variables or loadable_variables is None, + # check_for_collisions will convert them to an empty list + drop_variables, loadable_variables = check_for_collisions( + drop_variables, + loadable_variables, + ) + + # can we avoid fsspec here? + # fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) + ######### tmp############ + + # store = zarr.storage.LocalStore(filepath) + + # zg = zarr.open_consolidated(filepath) + # 1b. + zg = zarr.open_group(filepath) + + # 2a. Use zarr-python to list the variables in the store + zarr_arrays = [val for val in zg.keys()] + + # 2b. and check that all loadable_variables are present + assert set(loadable_variables).issubset(set(zarr_arrays)), f'loadable_variables ({loadable_variables}) is not a subset of variables in existing Zarr store. This zarr contains: {zarr_arrays}' + + # virtual variables are available variables minus drop variables & loadable variables + virtual_variables = list(set(zarr_arrays) - set(loadable_variables) - set(drop_variables)) + + array_variable_list = [] + # 3. For each virtual variable: + for var in virtual_variables: + # 3a. Use zarr-python to get the attributes and the dimension names, + # and coordinate names (which come from the .zmetadata or zarr.json) + array_metadata = zg[var].metadata + # are `_ARRAY_DIMENSIONS` how xarray gets coords? + array_dims = array_metadata.attributes['_ARRAY_DIMENSIONS'] + array_metadata_dict = array_metadata.to_dict() + + array_encoding = { + 'chunks': array_metadata_dict['chunks'], + 'compressor': array_metadata_dict['compressor'], + 'dtype': array_metadata_dict['dtype'], + 'fill_value': array_metadata_dict['fill_value'], + 'order': array_metadata_dict['order'], + } + + + # 3b. + # Use zarr-python to also get the dtype and chunk grid info + everything else needed to create the virtualizarr.zarr.ZArray object (eventually we can skip this step and use a zarr-python array metadata class directly instead of virtualizarr.zarr.ZArray + array_zarray = ZArray(shape = array_metadata_dict['shape'], + chunks = array_metadata_dict['chunks'], + dtype = array_metadata_dict['dtype'], + fill_value = array_metadata_dict['fill_value'], + order = array_metadata_dict['order'], + compressor = array_metadata_dict['compressor'], + filters=array_metadata_dict['filters'], + zarr_format=array_metadata_dict['zarr_format'], + ) + # 3c. Use the knowledge of the store location, variable name, and the zarr format to deduce which directory / S3 prefix the chunks must live in. + # QUESTION: how to get chunk keys from zarr-python + # fsspec ex: + # array_mapper = fsspec.get_mapper(path / 'air') + # [val for val in mapper] -> ['.zarray', '.zattrs', '0.0.0'] + # zarr python: ? + # + # air.chunks -> (1, 1, 1) + + # ToDo Replace fsspec w/ Zarr python + # add in fsspec stuff for now + + ######################### + # GET KEYS FOR MANIFESTS - + # get size, path, offset etc in dict to build ChunkManifest + ######################### + + import fsspec + array_mapper = fsspec.get_mapper(filepath + '/' + var) + + # grab all chunk keys. skip metadata files - do we need this? + array_keys = [val for val in array_mapper if not val.startswith('.')] + + + + # 3d. 
List all the chunks in that directory using fsspec.ls(detail=True), as that should also return the nbytes of each chunk. Remember that chunks are allowed to be missing. + # 3e. The offset of each chunk is just 0 (ignoring sharding for now), and the length is the file size fsspec returned. The paths are just all the paths fsspec listed. + + # probably trying to do too much in one big dict/list comprehension + # uses fsspec.ls on the array to get a list of dicts of info including chunk size + # filters out metadata to get only chunks + # uses fsspec.utils._unstrip_protocol utility to clean up path + + # "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + # "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + + # array path to use for all chunks + array_path = fsspec.utils._unstrip_protocol(array_mapper.root,array_mapper.fs) + + array_chunk_sizes = {val['name'].split('/')[-1]: + {'path':array_path, + 'offset': 0, + 'length': val['size'] + } for val in array_mapper.fs.ls(array_mapper.root, detail=True) if not val['name'].endswith(('.zarray', '.zattrs', '.zgroup'))} + + # 3f. Parse the path and length information returned by fsspec into the structure that we can pass to ChunkManifest.__init__ + # Initialize array chunk manifest from dictionary + array_chunkmanifest = ChunkManifest(array_chunk_sizes) + + # 3g. Create a ManifestArray from our ChunkManifest and ZArray + array_manifest_array = ManifestArray(zarray=array_zarray, chunkmanifest=array_chunkmanifest) + ######################### + ######################### + + + # 3h. Wrap that ManifestArray in an xarray.Variable, using the dims and attrs we read before + array_variable = Variable( + dims=array_dims, data=array_manifest_array, attrs=array_metadata_dict, encoding=array_encoding + ) + + array_variable_list.append(array_variable) + # 4 Get the loadable_variables by just using xr.open_zarr on the same store (should use drop_variables to avoid handling the virtual variables that we already have). + if loadable_variables: + import xarray as xr + # we wanna drop 'drop_variables' but also virtual variables since we already **manifested** them. 
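+            # (both operands below are guaranteed to be lists, since
+            # check_for_collisions normalised any None values to [] above)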
+ ds = xr.open_zarr(filepath, drop_variables=list(set(drop_variables + virtual_variables))) + + + + + + # For each virtual variable: + if group: + raise NotImplementedError() + + if loadable_variables or decode_times: + raise NotImplementedError() + + if reader_options: + raise NotImplementedError() + + drop_vars: list[str] + if drop_variables is None: + drop_vars = [] + else: + drop_vars = list(drop_variables) + + ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json") + coord_names = ds_attrs.pop("coordinates", []) + + # TODO recursive glob to create a datatree + # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it + # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 + all_paths = storepath.glob("*/") + directory_paths = [p for p in all_paths if not p.is_file()] + + vars = {} + for array_dir in directory_paths: + var_name = array_dir.name + if var_name in drop_vars: + break + + zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") + manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) + + marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) + var = Variable(data=marr, dims=dim_names, attrs=attrs) + vars[var_name] = var + + if indexes is None: + raise NotImplementedError() + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation + + data_vars, coords = separate_coords(vars, indexes, coord_names) + + ds = Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) + + return ds + +# class ZarrV3VirtualBackend(VirtualBackend): +# @staticmethod +# def open_virtual_dataset( +# filepath: str, +# group: str | None = None, +# drop_variables: Iterable[str] | None = None, +# loadable_variables: Iterable[str] | None = None, +# decode_times: bool | None = None, +# indexes: Mapping[str, Index] | None = None, +# reader_options: Optional[dict] = None, +# ) -> Dataset: +# """ +# Read a Zarr v3 store containing chunk manifests and return an xarray Dataset containing virtualized arrays. + +# This is experimental - chunk manifests are not part of the Zarr v3 Spec. 
+# """ + + + +# storepath = Path(filepath) + +# if group: +# raise NotImplementedError() + +# if loadable_variables or decode_times: +# raise NotImplementedError() + +# if reader_options: +# raise NotImplementedError() + +# drop_vars: list[str] +# if drop_variables is None: +# drop_vars = [] +# else: +# drop_vars = list(drop_variables) + +# ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json") +# coord_names = ds_attrs.pop("coordinates", []) + +# # TODO recursive glob to create a datatree +# # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it +# # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 +# all_paths = storepath.glob("*/") +# directory_paths = [p for p in all_paths if not p.is_file()] + +# vars = {} +# for array_dir in directory_paths: +# var_name = array_dir.name +# if var_name in drop_vars: +# break + +# zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") +# manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) + +# marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) +# var = Variable(data=marr, dims=dim_names, attrs=attrs) +# vars[var_name] = var + +# if indexes is None: +# raise NotImplementedError() +# elif indexes != {}: +# # TODO allow manual specification of index objects +# raise NotImplementedError() +# else: +# indexes = dict(**indexes) # for type hinting: to allow mutation + +# data_vars, coords = separate_coords(vars, indexes, coord_names) + +# ds = Dataset( +# data_vars, +# coords=coords, +# # indexes={}, # TODO should be added in a later version of xarray +# attrs=ds_attrs, +# ) + +# return ds + + +def attrs_from_zarr_group_json(filepath: Path) -> dict: + with open(filepath) as metadata_file: + attrs = json.load(metadata_file) + return attrs["attributes"] + + +def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: + with open(filepath) as metadata_file: + metadata = json.load(metadata_file) + + if { + "name": "chunk-manifest-json", + "configuration": { + "manifest": "./manifest.json", + }, + } not in metadata.get("storage_transformers", []): + raise ValueError( + "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." + ) + + attrs = metadata.pop("attributes") + dim_names = metadata.pop("dimension_names") + + chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) + shape = tuple(metadata["shape"]) + zarr_format = metadata["zarr_format"] + + if metadata["fill_value"] is None: + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) + else: + fill_value = metadata["fill_value"] + + all_codecs = [ + codec + for codec in metadata["codecs"] + if codec["name"] not in ("transpose", "bytes") + ] + compressor, *filters = [ + _configurable_to_num_codec_config(_filter) for _filter in all_codecs + ] + zarray = ZArray( + chunks=chunk_shape, + compressor=compressor, + dtype=np.dtype(metadata["data_type"]), + fill_value=fill_value, + filters=filters or None, + order="C", + shape=shape, + zarr_format=zarr_format, + ) + + return zarray, dim_names, attrs + + +def _configurable_to_num_codec_config(configurable: dict) -> dict: + """ + Convert a zarr v3 configurable into a numcodecs codec. 
+ """ + configurable_copy = configurable.copy() + codec_id = configurable_copy.pop("name") + if codec_id.startswith("numcodecs."): + codec_id = codec_id[len("numcodecs.") :] + configuration = configurable_copy.pop("configuration") + return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py deleted file mode 100644 index 4a867ffb..00000000 --- a/virtualizarr/readers/zarr_v3.py +++ /dev/null @@ -1,154 +0,0 @@ -import json -from pathlib import Path -from typing import Iterable, Mapping, Optional - -import numcodecs -import numpy as np -from xarray import Dataset, Index, Variable - -from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.readers.common import VirtualBackend, separate_coords -from virtualizarr.zarr import ZArray - - -class ZarrV3VirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - reader_options: Optional[dict] = None, - ) -> Dataset: - """ - Read a Zarr v3 store containing chunk manifests and return an xarray Dataset containing virtualized arrays. - - This is experimental - chunk manifests are not part of the Zarr v3 Spec. - """ - storepath = Path(filepath) - - if group: - raise NotImplementedError() - - if loadable_variables or decode_times: - raise NotImplementedError() - - if reader_options: - raise NotImplementedError() - - drop_vars: list[str] - if drop_variables is None: - drop_vars = [] - else: - drop_vars = list(drop_variables) - - ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json") - coord_names = ds_attrs.pop("coordinates", []) - - # TODO recursive glob to create a datatree - # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it - # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 - all_paths = storepath.glob("*/") - directory_paths = [p for p in all_paths if not p.is_file()] - - vars = {} - for array_dir in directory_paths: - var_name = array_dir.name - if var_name in drop_vars: - break - - zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") - manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) - - marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) - var = Variable(data=marr, dims=dim_names, attrs=attrs) - vars[var_name] = var - - if indexes is None: - raise NotImplementedError() - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - data_vars, coords = separate_coords(vars, indexes, coord_names) - - ds = Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return ds - - -def attrs_from_zarr_group_json(filepath: Path) -> dict: - with open(filepath) as metadata_file: - attrs = json.load(metadata_file) - return attrs["attributes"] - - -def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: - with open(filepath) as metadata_file: - metadata = json.load(metadata_file) - - if { - "name": "chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json", - }, - } not in metadata.get("storage_transformers", 
[]): - raise ValueError( - "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." - ) - - attrs = metadata.pop("attributes") - dim_names = metadata.pop("dimension_names") - - chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) - shape = tuple(metadata["shape"]) - zarr_format = metadata["zarr_format"] - - if metadata["fill_value"] is None: - raise ValueError( - "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" - ) - else: - fill_value = metadata["fill_value"] - - all_codecs = [ - codec - for codec in metadata["codecs"] - if codec["name"] not in ("transpose", "bytes") - ] - compressor, *filters = [ - _configurable_to_num_codec_config(_filter) for _filter in all_codecs - ] - zarray = ZArray( - chunks=chunk_shape, - compressor=compressor, - dtype=np.dtype(metadata["data_type"]), - fill_value=fill_value, - filters=filters or None, - order="C", - shape=shape, - zarr_format=zarr_format, - ) - - return zarray, dim_names, attrs - - -def _configurable_to_num_codec_config(configurable: dict) -> dict: - """ - Convert a zarr v3 configurable into a numcodecs codec. - """ - configurable_copy = configurable.copy() - codec_id = configurable_copy.pop("name") - if codec_id.startswith("numcodecs."): - codec_id = codec_id[len("numcodecs.") :] - configuration = configurable_copy.pop("configuration") - return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() From cfb7b8d2091e16b5c704194259fb327a7868e56c Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 24 Oct 2024 10:46:09 -0600 Subject: [PATCH 02/98] removed _ARRAY_DIMENSIONS and trimmed down attrs --- virtualizarr/readers/zarrV2V3.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virtualizarr/readers/zarrV2V3.py b/virtualizarr/readers/zarrV2V3.py index 854ec056..cc31aba6 100644 --- a/virtualizarr/readers/zarrV2V3.py +++ b/virtualizarr/readers/zarrV2V3.py @@ -77,9 +77,9 @@ def open_virtual_dataset( # 3a. Use zarr-python to get the attributes and the dimension names, # and coordinate names (which come from the .zmetadata or zarr.json) array_metadata = zg[var].metadata - # are `_ARRAY_DIMENSIONS` how xarray gets coords? - array_dims = array_metadata.attributes['_ARRAY_DIMENSIONS'] + array_metadata_dict = array_metadata.to_dict() + array_dims = array_metadata_dict['attributes'].pop("_ARRAY_DIMENSIONS") array_encoding = { 'chunks': array_metadata_dict['chunks'], @@ -158,7 +158,7 @@ def open_virtual_dataset( # 3h. 
Wrap that ManifestArray in an xarray.Variable, using the dims and attrs we read before
         array_variable = Variable(
-            dims=array_dims, data=array_manifest_array, attrs=array_metadata_dict, encoding=array_encoding
+            dims=array_dims, data=array_manifest_array, attrs=array_metadata_dict['attributes'], encoding=array_encoding
         )
 
         array_variable_list.append(array_variable)

From 2f26f03f51f58d29f36753dc3712b4e6d156c74c Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 24 Oct 2024 15:48:50 -0600
Subject: [PATCH 03/98] WIP for zarr reader

---
 conftest.py                            |  18 ++
 virtualizarr/backend.py                |   7 +-
 virtualizarr/readers/__init__.py       |   6 +-
 virtualizarr/readers/zarrV2V3.py       | 375 ------------------------
 virtualizarr/tests/__init__.py         |   1 +
 virtualizarr/tests/test_integration.py |  50 ++++
 6 files changed, 78 insertions(+), 379 deletions(-)
 delete mode 100644 virtualizarr/readers/zarrV2V3.py

diff --git a/conftest.py b/conftest.py
index 810fd833..ed224c76 100644
--- a/conftest.py
+++ b/conftest.py
@@ -35,6 +35,24 @@ def netcdf4_file(tmpdir):
     return filepath
 
 
+@pytest.fixture()
+def zarr_v2_store(tmpdir):
+    # Set up an example xarray dataset
+    ds = xr.tutorial.open_dataset("air_temperature", chunks={})
+    # grab a subset and make sure there are multiple chunks present (2): Frozen({'time': (5, 5), 'lat': (9,), 'lon': (18,)})
+    chunked_subset = ds.isel(
+        time=slice(0, 10), lat=slice(0, 9), lon=slice(0, 18)
+    ).chunk({"time": 5})
+
+    # Save it to disk as Zarr (in a temporary directory)
+    filepath = f"{tmpdir}/air.zarr"
+
+    chunked_subset.to_zarr(filepath, zarr_format=2)
+    ds.close()
+
+    return filepath
+
+
 @pytest.fixture
 def netcdf4_virtual_dataset(netcdf4_file):
     from virtualizarr import open_virtual_dataset
diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index f715605b..764d5eca 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -17,6 +17,7 @@
     KerchunkVirtualBackend,
     NetCDF3VirtualBackend,
     TIFFVirtualBackend,
+    ZarrV3ChunkManifestVirtualBackend,  # If this is kept, we should incorporate it into ZarrVirtualBackend
     ZarrVirtualBackend,
 )
 from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions
@@ -24,7 +25,8 @@
 # TODO add entrypoint to allow external libraries to add to this mapping
 VIRTUAL_BACKENDS = {
     "kerchunk": KerchunkVirtualBackend,
-    "zarr_v3": ZarrVirtualBackend,
+    "zarr_v3": ZarrV3ChunkManifestVirtualBackend,
+    "zarr": ZarrVirtualBackend,
     "dmrpp": DMRPPVirtualBackend,
     # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)
     "netcdf3": NetCDF3VirtualBackend,
@@ -71,8 +73,7 @@ def automatically_determine_filetype(
 
     # TODO how do we handle kerchunk json / parquet here?
     if Path(filepath).suffix == ".zarr":
-        # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... 
- raise NotImplementedError() + return FileType.zarr # Read magic bytes from local or remote file fpath = _FsspecFSFromFilepath( diff --git a/virtualizarr/readers/__init__.py b/virtualizarr/readers/__init__.py index ac4d66f9..19d77dc7 100644 --- a/virtualizarr/readers/__init__.py +++ b/virtualizarr/readers/__init__.py @@ -4,7 +4,10 @@ from virtualizarr.readers.kerchunk import KerchunkVirtualBackend from virtualizarr.readers.netcdf3 import NetCDF3VirtualBackend from virtualizarr.readers.tiff import TIFFVirtualBackend -from virtualizarr.readers.zarrV2V3 import ZarrVirtualBackend +from virtualizarr.readers.zarr import ( + ZarrV3ChunkManifestVirtualBackend, + ZarrVirtualBackend, +) __all__ = [ "DMRPPVirtualBackend", @@ -14,4 +17,5 @@ "NetCDF3VirtualBackend", "TIFFVirtualBackend", "ZarrVirtualBackend", + "ZarrV3ChunkManifestVirtualBackend", ] diff --git a/virtualizarr/readers/zarrV2V3.py b/virtualizarr/readers/zarrV2V3.py deleted file mode 100644 index cc31aba6..00000000 --- a/virtualizarr/readers/zarrV2V3.py +++ /dev/null @@ -1,375 +0,0 @@ -import json -from pathlib import Path -from typing import Iterable, Mapping, Optional - -import numcodecs -import numpy as np -from xarray import Dataset, Index, Variable - -from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.readers.common import VirtualBackend, separate_coords -from virtualizarr.zarr import ZArray -from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions - - - - -class ZarrVirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - reader_options: Optional[dict] = None, - ) -> Dataset: - """ - Read a Zarr store containing chunk manifests and return an xarray Dataset containing virtualized arrays. - - """ - - - - ######### tmp testing! ############ - reader_options={} - loadable_variables='time' - filepath = 'tmp_2_chunk.zarr' - - # check that Zarr is V3 - # 1a - from packaging import version - import zarr - - if version.parse(zarr.__version__).major < 3: - raise ImportError(f"Zarr V3 is required") - - # If drop_variables or loadable_variables is None, - # check_for_collisions will convert them to an empty list - drop_variables, loadable_variables = check_for_collisions( - drop_variables, - loadable_variables, - ) - - # can we avoid fsspec here? - # fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) - ######### tmp############ - - # store = zarr.storage.LocalStore(filepath) - - # zg = zarr.open_consolidated(filepath) - # 1b. - zg = zarr.open_group(filepath) - - # 2a. Use zarr-python to list the variables in the store - zarr_arrays = [val for val in zg.keys()] - - # 2b. and check that all loadable_variables are present - assert set(loadable_variables).issubset(set(zarr_arrays)), f'loadable_variables ({loadable_variables}) is not a subset of variables in existing Zarr store. This zarr contains: {zarr_arrays}' - - # virtual variables are available variables minus drop variables & loadable variables - virtual_variables = list(set(zarr_arrays) - set(loadable_variables) - set(drop_variables)) - - array_variable_list = [] - # 3. For each virtual variable: - for var in virtual_variables: - # 3a. 
Use zarr-python to get the attributes and the dimension names, - # and coordinate names (which come from the .zmetadata or zarr.json) - array_metadata = zg[var].metadata - - array_metadata_dict = array_metadata.to_dict() - array_dims = array_metadata_dict['attributes'].pop("_ARRAY_DIMENSIONS") - - array_encoding = { - 'chunks': array_metadata_dict['chunks'], - 'compressor': array_metadata_dict['compressor'], - 'dtype': array_metadata_dict['dtype'], - 'fill_value': array_metadata_dict['fill_value'], - 'order': array_metadata_dict['order'], - } - - - # 3b. - # Use zarr-python to also get the dtype and chunk grid info + everything else needed to create the virtualizarr.zarr.ZArray object (eventually we can skip this step and use a zarr-python array metadata class directly instead of virtualizarr.zarr.ZArray - array_zarray = ZArray(shape = array_metadata_dict['shape'], - chunks = array_metadata_dict['chunks'], - dtype = array_metadata_dict['dtype'], - fill_value = array_metadata_dict['fill_value'], - order = array_metadata_dict['order'], - compressor = array_metadata_dict['compressor'], - filters=array_metadata_dict['filters'], - zarr_format=array_metadata_dict['zarr_format'], - ) - # 3c. Use the knowledge of the store location, variable name, and the zarr format to deduce which directory / S3 prefix the chunks must live in. - # QUESTION: how to get chunk keys from zarr-python - # fsspec ex: - # array_mapper = fsspec.get_mapper(path / 'air') - # [val for val in mapper] -> ['.zarray', '.zattrs', '0.0.0'] - # zarr python: ? - # - # air.chunks -> (1, 1, 1) - - # ToDo Replace fsspec w/ Zarr python - # add in fsspec stuff for now - - ######################### - # GET KEYS FOR MANIFESTS - - # get size, path, offset etc in dict to build ChunkManifest - ######################### - - import fsspec - array_mapper = fsspec.get_mapper(filepath + '/' + var) - - # grab all chunk keys. skip metadata files - do we need this? - array_keys = [val for val in array_mapper if not val.startswith('.')] - - - - # 3d. List all the chunks in that directory using fsspec.ls(detail=True), as that should also return the nbytes of each chunk. Remember that chunks are allowed to be missing. - # 3e. The offset of each chunk is just 0 (ignoring sharding for now), and the length is the file size fsspec returned. The paths are just all the paths fsspec listed. - - # probably trying to do too much in one big dict/list comprehension - # uses fsspec.ls on the array to get a list of dicts of info including chunk size - # filters out metadata to get only chunks - # uses fsspec.utils._unstrip_protocol utility to clean up path - - # "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, - # "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, - - # array path to use for all chunks - array_path = fsspec.utils._unstrip_protocol(array_mapper.root,array_mapper.fs) - - array_chunk_sizes = {val['name'].split('/')[-1]: - {'path':array_path, - 'offset': 0, - 'length': val['size'] - } for val in array_mapper.fs.ls(array_mapper.root, detail=True) if not val['name'].endswith(('.zarray', '.zattrs', '.zgroup'))} - - # 3f. Parse the path and length information returned by fsspec into the structure that we can pass to ChunkManifest.__init__ - # Initialize array chunk manifest from dictionary - array_chunkmanifest = ChunkManifest(array_chunk_sizes) - - # 3g. 
Create a ManifestArray from our ChunkManifest and ZArray - array_manifest_array = ManifestArray(zarray=array_zarray, chunkmanifest=array_chunkmanifest) - ######################### - ######################### - - - # 3h. Wrap that ManifestArray in an xarray.Variable, using the dims and attrs we read before - array_variable = Variable( - dims=array_dims, data=array_manifest_array, attrs=array_metadata_dict['attributes'], encoding=array_encoding - ) - - array_variable_list.append(array_variable) - # 4 Get the loadable_variables by just using xr.open_zarr on the same store (should use drop_variables to avoid handling the virtual variables that we already have). - if loadable_variables: - import xarray as xr - # we wanna drop 'drop_variables' but also virtual variables since we already **manifested** them. - ds = xr.open_zarr(filepath, drop_variables=list(set(drop_variables + virtual_variables))) - - - - - - # For each virtual variable: - if group: - raise NotImplementedError() - - if loadable_variables or decode_times: - raise NotImplementedError() - - if reader_options: - raise NotImplementedError() - - drop_vars: list[str] - if drop_variables is None: - drop_vars = [] - else: - drop_vars = list(drop_variables) - - ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json") - coord_names = ds_attrs.pop("coordinates", []) - - # TODO recursive glob to create a datatree - # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it - # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 - all_paths = storepath.glob("*/") - directory_paths = [p for p in all_paths if not p.is_file()] - - vars = {} - for array_dir in directory_paths: - var_name = array_dir.name - if var_name in drop_vars: - break - - zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") - manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) - - marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) - var = Variable(data=marr, dims=dim_names, attrs=attrs) - vars[var_name] = var - - if indexes is None: - raise NotImplementedError() - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - data_vars, coords = separate_coords(vars, indexes, coord_names) - - ds = Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return ds - -# class ZarrV3VirtualBackend(VirtualBackend): -# @staticmethod -# def open_virtual_dataset( -# filepath: str, -# group: str | None = None, -# drop_variables: Iterable[str] | None = None, -# loadable_variables: Iterable[str] | None = None, -# decode_times: bool | None = None, -# indexes: Mapping[str, Index] | None = None, -# reader_options: Optional[dict] = None, -# ) -> Dataset: -# """ -# Read a Zarr v3 store containing chunk manifests and return an xarray Dataset containing virtualized arrays. - -# This is experimental - chunk manifests are not part of the Zarr v3 Spec. 
-# """ - - - -# storepath = Path(filepath) - -# if group: -# raise NotImplementedError() - -# if loadable_variables or decode_times: -# raise NotImplementedError() - -# if reader_options: -# raise NotImplementedError() - -# drop_vars: list[str] -# if drop_variables is None: -# drop_vars = [] -# else: -# drop_vars = list(drop_variables) - -# ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json") -# coord_names = ds_attrs.pop("coordinates", []) - -# # TODO recursive glob to create a datatree -# # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it -# # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 -# all_paths = storepath.glob("*/") -# directory_paths = [p for p in all_paths if not p.is_file()] - -# vars = {} -# for array_dir in directory_paths: -# var_name = array_dir.name -# if var_name in drop_vars: -# break - -# zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") -# manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) - -# marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) -# var = Variable(data=marr, dims=dim_names, attrs=attrs) -# vars[var_name] = var - -# if indexes is None: -# raise NotImplementedError() -# elif indexes != {}: -# # TODO allow manual specification of index objects -# raise NotImplementedError() -# else: -# indexes = dict(**indexes) # for type hinting: to allow mutation - -# data_vars, coords = separate_coords(vars, indexes, coord_names) - -# ds = Dataset( -# data_vars, -# coords=coords, -# # indexes={}, # TODO should be added in a later version of xarray -# attrs=ds_attrs, -# ) - -# return ds - - -def attrs_from_zarr_group_json(filepath: Path) -> dict: - with open(filepath) as metadata_file: - attrs = json.load(metadata_file) - return attrs["attributes"] - - -def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: - with open(filepath) as metadata_file: - metadata = json.load(metadata_file) - - if { - "name": "chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json", - }, - } not in metadata.get("storage_transformers", []): - raise ValueError( - "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." - ) - - attrs = metadata.pop("attributes") - dim_names = metadata.pop("dimension_names") - - chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) - shape = tuple(metadata["shape"]) - zarr_format = metadata["zarr_format"] - - if metadata["fill_value"] is None: - raise ValueError( - "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" - ) - else: - fill_value = metadata["fill_value"] - - all_codecs = [ - codec - for codec in metadata["codecs"] - if codec["name"] not in ("transpose", "bytes") - ] - compressor, *filters = [ - _configurable_to_num_codec_config(_filter) for _filter in all_codecs - ] - zarray = ZArray( - chunks=chunk_shape, - compressor=compressor, - dtype=np.dtype(metadata["data_type"]), - fill_value=fill_value, - filters=filters or None, - order="C", - shape=shape, - zarr_format=zarr_format, - ) - - return zarray, dim_names, attrs - - -def _configurable_to_num_codec_config(configurable: dict) -> dict: - """ - Convert a zarr v3 configurable into a numcodecs codec. 
- """ - configurable_copy = configurable.copy() - codec_id = configurable_copy.pop("name") - if codec_id.startswith("numcodecs."): - codec_id = codec_id[len("numcodecs.") :] - configuration = configurable_copy.pop("configuration") - return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 70f613ce..9034332b 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -37,6 +37,7 @@ def _importorskip( has_s3fs, requires_s3fs = _importorskip("s3fs") has_scipy, requires_scipy = _importorskip("scipy") has_tifffile, requires_tifffile = _importorskip("tifffile") +has_zarrV3, requires_zarrV3 = _importorskip("zarr", minversion="3.0.0") def create_manifestarray( diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 09d0c0a8..0712ce07 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -1,3 +1,5 @@ +from dataclasses import dataclass + import numpy as np import pytest import xarray as xr @@ -87,6 +89,54 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] +# we should parameterize for: +# - group +# - drop_variables +# - loadable_variables +# - store readable over cloud storage? +# - zarr store version v2, v3 + + +# testing out pytest parameterization with dataclasses :shrug: -- we can revert to a more normal style +@dataclass +class ZarrV2Param: + loadable_variables: list[str] | None + drop_variables: list[str] | None + + +ZARR_V2_PARAMS = [ + ZarrV2Param(loadable_variables=None, drop_variables=None), + ZarrV2Param(loadable_variables=["time"], drop_variables=None), + ZarrV2Param(loadable_variables=None, drop_variables=["lat", "lon"]), + ZarrV2Param(loadable_variables=["lat", "lon"], drop_variables=["time"]), +] + +# @requires_zarrV3 # we should have this, but we need the decorator to understand beta versions? + + +@pytest.mark.parametrize( + "input_params", + [inputs for inputs in ZARR_V2_PARAMS], +) +def test_zarrV2_roundtrip(zarr_v2_store, input_params): + # ds = xr.open_zarr(zarr_v2_store) + + open_virtual_dataset( + zarr_v2_store, + loadable_variables=input_params.loadable_variables, + drop_variables=input_params.drop_variables, + indexes={}, + ) + # assert vds has: + # loadable vars are np arrays? + # drop vars are not present + # virtual vars are manifest arrays, not loaded arrays + + # Do we have a good way in XRT to compare virtual datasets to xarray datasets? assert_duckarray_allclose? 
+ # from xarray.testing import assert_duckarray_allclose + # xrt.assert_allclose(ds, vds) + + @requires_kerchunk @pytest.mark.parametrize("format", ["dict", "json", "parquet"]) class TestKerchunkRoundtrip: From eab87a62168f091b022b7e3dc4abb235a0e864ba Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 24 Oct 2024 16:00:52 -0600 Subject: [PATCH 04/98] adding in the key piece, the reader --- virtualizarr/readers/zarr.py | 372 +++++++++++++++++++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100644 virtualizarr/readers/zarr.py diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py new file mode 100644 index 00000000..e00ea728 --- /dev/null +++ b/virtualizarr/readers/zarr.py @@ -0,0 +1,372 @@ +import json +from pathlib import Path +from typing import Iterable, Mapping, Optional + +import numcodecs +import numpy as np +from xarray import Dataset, Index, Variable + +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.common import ( + VirtualBackend, + separate_coords, +) +from virtualizarr.utils import check_for_collisions +from virtualizarr.zarr import ZArray + + +class ZarrVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + """ + Create a virtual dataset from an existing Zarr store + """ + + # ToDo: + # group, decode_times and reader_options not used yet + # testing + # steps 5-8 incomplete + + ######### tmp for testing! ############ + # reader_options={} + # loadable_variables='time' + # drop_variables=[] + # indexes={} + # filepath = 'tmp_2_chunk.zarr' + + # check that Zarr is V3 + # 1a + import zarr + from packaging import version + + if version.parse(zarr.__version__).major < 3: + raise ImportError("Zarr V3 is required") + + # check_for_collisions will convert them to an empty list + drop_variables, loadable_variables = check_for_collisions( + drop_variables, + loadable_variables, + ) + + # can we avoid fsspec here if we are using zarr-python for all the reading? + # fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) + ######### tmp############ + + # store = zarr.storage.LocalStore(filepath) + # zg = zarr.open_consolidated(filepath) + + # 1b. + + zg = zarr.open_group(filepath, mode="r") + + # 2a. Use zarr-python to list the variables in the store + zarr_arrays = [val for val in zg.keys()] + + # 2b. and check that all loadable_variables are present + assert set( + loadable_variables + ).issubset( + set(zarr_arrays) + ), f"loadable_variables ({loadable_variables}) is not a subset of variables in existing Zarr store. This zarr contains: {zarr_arrays}" + + # virtual variables are available variables minus drop variables & loadable variables + virtual_variables = list( + set(zarr_arrays) - set(loadable_variables) - set(drop_variables) + ) + + array_variable_list = [] + all_array_dims = [] + + # 3. For each virtual variable: + for var in virtual_variables: + # 3a. 
Use zarr-python to get the attributes and the dimension names, + # and coordinate names (which come from the .zmetadata or zarr.json) + array_metadata = zg[var].metadata + + array_metadata_dict = array_metadata.to_dict() + + # extract _ARRAY_DIMENSIONS and remove it from attrs + array_dims = array_metadata_dict.get("attributes").pop("_ARRAY_DIMENSIONS") + + # should these have defaults? + array_encoding = { + "chunks": array_metadata_dict.get("chunks", None), + "compressor": array_metadata_dict.get("compressor", None), + "dtype": array_metadata_dict.get("dtype", None), + "fill_value": array_metadata_dict.get("fill_value", None), + "order": array_metadata_dict.get("order", None), + } + + # 3b. + # Use zarr-python to also get the dtype and chunk grid info + everything else needed to create the virtualizarr.zarr.ZArray object (eventually we can skip this step and use a zarr-python array metadata class directly instead of virtualizarr.zarr.ZArray + array_zarray = ZArray( + shape=array_metadata_dict.get("shape", None), + chunks=array_metadata_dict.get("chunks", None), + dtype=array_metadata_dict.get("dtype", None), + fill_value=array_metadata_dict.get("fill_value", None), + order=array_metadata_dict.get("order", None), + compressor=array_metadata_dict.get("compressor", None), + filters=array_metadata_dict.get("filters", None), + zarr_format=array_metadata_dict.get("zarr_format", None), + ) + + # 3c. Use the knowledge of the store location, variable name, and the zarr format to deduce which directory / S3 prefix the chunks must live in. + # QUESTION: how to get chunk keys from zarr-python + # fsspec ex: + # array_mapper = fsspec.get_mapper(path / 'air') + # [val for val in mapper] -> ['.zarray', '.zattrs', '0.0.0'] + # zarr python: ? + # + # air.chunks -> (1, 1, 1) + + # ToDo Replace fsspec w/ Zarr python for chunk size: https://github.com/zarr-developers/zarr-python/pull/2426 + # add in fsspec stuff for now + + ######################### + # GET KEYS FOR MANIFESTS - + # get size, path, offset etc in dict to build ChunkManifest + ######################### + + import fsspec + + array_mapper = fsspec.get_mapper(filepath + "/" + var) + + # grab all chunk keys. skip metadata files - do we need this for anything? + array_keys = [val for val in array_mapper if not val.startswith(".")] + + # 3d. List all the chunks in that directory using fsspec.ls(detail=True), as that should also return the nbytes of each chunk. Remember that chunks are allowed to be missing. + # 3e. The offset of each chunk is just 0 (ignoring sharding for now), and the length is the file size fsspec returned. The paths are just all the paths fsspec listed. + + # probably trying to do too much in one big dict/list comprehension + # uses fsspec.ls on the array to get a list of dicts of info including chunk size + # filters out metadata to get only chunks + # uses fsspec.utils._unstrip_protocol utility to clean up path + + # "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + # "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + + # array path to use for all chunks + array_path = fsspec.utils._unstrip_protocol( + array_mapper.root, array_mapper.fs + ) + + array_chunk_sizes = { + val["name"].split("/")[-1]: { + "path": array_path, + "offset": 0, + "length": val["size"], + } + for val in array_mapper.fs.ls(array_mapper.root, detail=True) + if not val["name"].endswith((".zarray", ".zattrs", ".zgroup")) + } + + # 3f. 
Parse the path and length information returned by fsspec into the structure that we can pass to ChunkManifest.__init__ + # Initialize array chunk manifest from dictionary + + array_chunkmanifest = ChunkManifest(array_chunk_sizes) + + # 3g. Create a ManifestArray from our ChunkManifest and ZArray + array_manifest_array = ManifestArray( + zarray=array_zarray, chunkmanifest=array_chunkmanifest + ) + ######################### + ######################### + + # 3h. Wrap that ManifestArray in an xarray.Variable, using the dims and attrs we read before + array_variable = Variable( + dims=array_dims, + data=array_manifest_array, + attrs=array_metadata_dict.get("attributes", {}), + encoding=array_encoding, + ) + + array_variable_list.append(array_variable) + + all_array_dims.extend([dim for dim in array_dims]) + + # do we need this for `separate_coords`? + # Extending list + flatten so we don't have nested lists + all_array_dims = list(set(all_array_dims)) + + # 4 Get the loadable_variables by just using xr.open_zarr on the same store (should use drop_variables to avoid handling the virtual variables that we already have). + if loadable_variables: + import xarray as xr + + # we wanna drop 'drop_variables' but also virtual variables since we already **manifested** them. + ds = xr.open_zarr( + filepath, drop_variables=list(set(drop_variables + virtual_variables)) + ) + + # 5 Use separate_coords to set the correct variables as coordinate variables (and avoid building indexes whilst doing it) + + # this fails - --> 154 for name, var in vars.items(): - AttributeError: 'list' object has no attribute 'items' + + # separate_coords( + # vars = list(set(loadable_variables + virtual_variables)), + # indexes= indexes, + # coord_names=all_array_dims, + # ) + + # 6 Merge all the variables into one xr.Dataset and return it. + + # ToDo + # return vds + + # 7 All the above should be wrapped in a virtualizarr.readers.zarr.open_virtual_dataset function, which then should be called as a method from a ZarrVirtualBackend(VirtualBackend) subclass. + + # Done + + # 8 Finally add that ZarrVirtualBackend to the list of readers in virtualizarr.backend.py + # Done + + +class ZarrV3ChunkManifestVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + """ + Read a Zarr v3 store containing chunk manifests and return an xarray Dataset containing virtualized arrays. + + This is experimental - chunk manifests are not part of the Zarr v3 Spec. 
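+
+        Expects one directory per array, each containing a `zarr.json` with the
+        array metadata and a `manifest.json` with the chunk manifest.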
+ """ + + storepath = Path(filepath) + + if group: + raise NotImplementedError() + + if loadable_variables or decode_times: + raise NotImplementedError() + + if reader_options: + raise NotImplementedError() + + drop_vars: list[str] + if drop_variables is None: + drop_vars = [] + else: + drop_vars = list(drop_variables) + + ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json") + coord_names = ds_attrs.pop("coordinates", []) + + # TODO recursive glob to create a datatree + # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it + # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 + all_paths = storepath.glob("*/") + directory_paths = [p for p in all_paths if not p.is_file()] + + vars = {} + for array_dir in directory_paths: + var_name = array_dir.name + if var_name in drop_vars: + break + + zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") + manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) + + marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) + var = Variable(data=marr, dims=dim_names, attrs=attrs) + vars[var_name] = var + + if indexes is None: + raise NotImplementedError() + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation + + data_vars, coords = separate_coords(vars, indexes, coord_names) + + ds = Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) + + return ds + + +def attrs_from_zarr_group_json(filepath: Path) -> dict: + with open(filepath) as metadata_file: + attrs = json.load(metadata_file) + return attrs["attributes"] + + +def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: + with open(filepath) as metadata_file: + metadata = json.load(metadata_file) + + if { + "name": "chunk-manifest-json", + "configuration": { + "manifest": "./manifest.json", + }, + } not in metadata.get("storage_transformers", []): + raise ValueError( + "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." + ) + + attrs = metadata.pop("attributes") + dim_names = metadata.pop("dimension_names") + + chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) + shape = tuple(metadata["shape"]) + zarr_format = metadata["zarr_format"] + + if metadata["fill_value"] is None: + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) + else: + fill_value = metadata["fill_value"] + + all_codecs = [ + codec + for codec in metadata["codecs"] + if codec["name"] not in ("transpose", "bytes") + ] + compressor, *filters = [ + _configurable_to_num_codec_config(_filter) for _filter in all_codecs + ] + zarray = ZArray( + chunks=chunk_shape, + compressor=compressor, + dtype=np.dtype(metadata["data_type"]), + fill_value=fill_value, + filters=filters or None, + order="C", + shape=shape, + zarr_format=zarr_format, + ) + + return zarray, dim_names, attrs + + +def _configurable_to_num_codec_config(configurable: dict) -> dict: + """ + Convert a zarr v3 configurable into a numcodecs codec. 
+ """ + configurable_copy = configurable.copy() + codec_id = configurable_copy.pop("name") + if codec_id.startswith("numcodecs."): + codec_id = codec_id[len("numcodecs.") :] + configuration = configurable_copy.pop("configuration") + return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() From 13db375c47bc612b1647c0f0dccdadf449e62d88 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 31 Oct 2024 14:58:40 -0600 Subject: [PATCH 05/98] virtual dataset is returned! Now to deal with fill_value --- virtualizarr/readers/common.py | 18 +++-- virtualizarr/readers/zarr.py | 105 ++++++++----------------- virtualizarr/tests/test_integration.py | 25 ++++-- virtualizarr/utils.py | 8 +- 4 files changed, 69 insertions(+), 87 deletions(-) diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 9be2b45f..ab617251 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -60,18 +60,27 @@ def open_loadable_vars_and_indexes( # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - fpath = _FsspecFSFromFilepath( - filepath=filepath, reader_options=reader_options - ).open_file() # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any. # We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through. + fpath = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) + + # Update the xarray open_dataset kwargs if Zarr + + if fpath.filepath.suffix == ".zarr": + engine = "zarr" + xr_input = fpath.filepath + + else: + engine = None + xr_input = fpath.open_file() ds = open_dataset( - cast(XArrayOpenT, fpath), + cast(XArrayOpenT, xr_input), drop_variables=drop_variables, group=group, decode_times=decode_times, + engine=engine, ) if indexes is None: @@ -87,7 +96,6 @@ def open_loadable_vars_and_indexes( raise NotImplementedError() else: indexes = dict(**indexes) # for type hinting: to allow mutation - # TODO we should drop these earlier by using drop_variables loadable_vars = { str(name): var diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index e00ea728..896545de 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -9,6 +9,8 @@ from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.readers.common import ( VirtualBackend, + construct_virtual_dataset, + open_loadable_vars_and_indexes, separate_coords, ) from virtualizarr.utils import check_for_collisions @@ -30,18 +32,6 @@ def open_virtual_dataset( Create a virtual dataset from an existing Zarr store """ - # ToDo: - # group, decode_times and reader_options not used yet - # testing - # steps 5-8 incomplete - - ######### tmp for testing! ############ - # reader_options={} - # loadable_variables='time' - # drop_variables=[] - # indexes={} - # filepath = 'tmp_2_chunk.zarr' - # check that Zarr is V3 # 1a import zarr @@ -57,11 +47,6 @@ def open_virtual_dataset( ) # can we avoid fsspec here if we are using zarr-python for all the reading? - # fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) - ######### tmp############ - - # store = zarr.storage.LocalStore(filepath) - # zg = zarr.open_consolidated(filepath) # 1b. 
@@ -78,15 +63,15 @@ def open_virtual_dataset( ), f"loadable_variables ({loadable_variables}) is not a subset of variables in existing Zarr store. This zarr contains: {zarr_arrays}" # virtual variables are available variables minus drop variables & loadable variables - virtual_variables = list( + virtual_vars = list( set(zarr_arrays) - set(loadable_variables) - set(drop_variables) ) - array_variable_list = [] + virtual_variable_mapping = {} all_array_dims = [] # 3. For each virtual variable: - for var in virtual_variables: + for var in virtual_vars: # 3a. Use zarr-python to get the attributes and the dimension names, # and coordinate names (which come from the .zmetadata or zarr.json) array_metadata = zg[var].metadata @@ -96,7 +81,8 @@ def open_virtual_dataset( # extract _ARRAY_DIMENSIONS and remove it from attrs array_dims = array_metadata_dict.get("attributes").pop("_ARRAY_DIMENSIONS") - # should these have defaults? + # should these have defaults defined and shared across readers? + # Should these have common validation for Zarr V3 codecs & such? array_encoding = { "chunks": array_metadata_dict.get("chunks", None), "compressor": array_metadata_dict.get("compressor", None), @@ -119,30 +105,14 @@ def open_virtual_dataset( ) # 3c. Use the knowledge of the store location, variable name, and the zarr format to deduce which directory / S3 prefix the chunks must live in. - # QUESTION: how to get chunk keys from zarr-python - # fsspec ex: - # array_mapper = fsspec.get_mapper(path / 'air') - # [val for val in mapper] -> ['.zarray', '.zattrs', '0.0.0'] - # zarr python: ? - # - # air.chunks -> (1, 1, 1) - # ToDo Replace fsspec w/ Zarr python for chunk size: https://github.com/zarr-developers/zarr-python/pull/2426 - # add in fsspec stuff for now - - ######################### - # GET KEYS FOR MANIFESTS - - # get size, path, offset etc in dict to build ChunkManifest - ######################### import fsspec array_mapper = fsspec.get_mapper(filepath + "/" + var) - # grab all chunk keys. skip metadata files - do we need this for anything? - array_keys = [val for val in array_mapper if not val.startswith(".")] - # 3d. List all the chunks in that directory using fsspec.ls(detail=True), as that should also return the nbytes of each chunk. Remember that chunks are allowed to be missing. + # 3e. The offset of each chunk is just 0 (ignoring sharding for now), and the length is the file size fsspec returned. The paths are just all the paths fsspec listed. # probably trying to do too much in one big dict/list comprehension @@ -150,9 +120,6 @@ def open_virtual_dataset( # filters out metadata to get only chunks # uses fsspec.utils._unstrip_protocol utility to clean up path - # "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, - # "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, - # array path to use for all chunks array_path = fsspec.utils._unstrip_protocol( array_mapper.root, array_mapper.fs @@ -177,8 +144,6 @@ def open_virtual_dataset( array_manifest_array = ManifestArray( zarray=array_zarray, chunkmanifest=array_chunkmanifest ) - ######################### - ######################### # 3h. Wrap that ManifestArray in an xarray.Variable, using the dims and attrs we read before array_variable = Variable( @@ -188,44 +153,36 @@ def open_virtual_dataset( encoding=array_encoding, ) - array_variable_list.append(array_variable) + virtual_variable_mapping[f"{var}"] = array_variable all_array_dims.extend([dim for dim in array_dims]) - # do we need this for `separate_coords`? 
- # Extending list + flatten so we don't have nested lists - all_array_dims = list(set(all_array_dims)) + coord_names = list(set(all_array_dims)) # 4 Get the loadable_variables by just using xr.open_zarr on the same store (should use drop_variables to avoid handling the virtual variables that we already have). - if loadable_variables: - import xarray as xr - - # we wanna drop 'drop_variables' but also virtual variables since we already **manifested** them. - ds = xr.open_zarr( - filepath, drop_variables=list(set(drop_variables + virtual_variables)) - ) - - # 5 Use separate_coords to set the correct variables as coordinate variables (and avoid building indexes whilst doing it) - - # this fails - --> 154 for name, var in vars.items(): - AttributeError: 'list' object has no attribute 'items' - - # separate_coords( - # vars = list(set(loadable_variables + virtual_variables)), - # indexes= indexes, - # coord_names=all_array_dims, - # ) + # We want to drop 'drop_variables' but also virtual variables since we already **manifested** them. + + non_loadable_variables = list(set(virtual_vars).union(set(drop_variables))) + + # pre made func for this?! Woohoo + loadable_vars, indexes = open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + drop_variables=non_loadable_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) # 6 Merge all the variables into one xr.Dataset and return it. - - # ToDo - # return vds - - # 7 All the above should be wrapped in a virtualizarr.readers.zarr.open_virtual_dataset function, which then should be called as a method from a ZarrVirtualBackend(VirtualBackend) subclass. - - # Done - - # 8 Finally add that ZarrVirtualBackend to the list of readers in virtualizarr.backend.py - # Done + return construct_virtual_dataset( + virtual_vars=virtual_variable_mapping, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=zg.attrs.asdict(), + ) class ZarrV3ChunkManifestVirtualBackend(VirtualBackend): diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 0712ce07..7b2c625a 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -79,6 +79,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( vds = open_virtual_dataset( netcdf4_file, loadable_variables=vars_to_inline, indexes={} ) + refs = vds.virtualize.to_kerchunk(format="dict") # TODO I would just compare the entire dicts but kerchunk returns inconsistent results - see https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202 @@ -95,8 +96,6 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( # - loadable_variables # - store readable over cloud storage? # - zarr store version v2, v3 - - # testing out pytest parameterization with dataclasses :shrug: -- we can revert to a more normal style @dataclass class ZarrV2Param: @@ -119,20 +118,34 @@ class ZarrV2Param: [inputs for inputs in ZARR_V2_PARAMS], ) def test_zarrV2_roundtrip(zarr_v2_store, input_params): - # ds = xr.open_zarr(zarr_v2_store) - - open_virtual_dataset( + ds = open_virtual_dataset( zarr_v2_store, loadable_variables=input_params.loadable_variables, drop_variables=input_params.drop_variables, indexes={}, ) + + # THIS FAILS! TypeError: np.float32(nan) is not JSON serializable + # Question: How do we handle this fill value: fill_value=np.float32(nan) + ds_refs = ds.virtualize.to_kerchunk(format="dict") + + # tmp fix if you want to override the fill vals! 
+ ds.lat.data.zarray.fill_value = float("nan") + ds.time.data.zarray.fill_value = float("nan") + ds.lon.data.zarray.fill_value = float("nan") + + # Use dataset_from_kerchunk_refs to reconstruct the dataset + roundtrip = dataset_from_kerchunk_refs(ds_refs) + + # Assert equal to original dataset + xrt.assert_equal(roundtrip, ds) + # assert vds has: # loadable vars are np arrays? # drop vars are not present # virtual vars are manifest arrays, not loaded arrays - # Do we have a good way in XRT to compare virtual datasets to xarray datasets? assert_duckarray_allclose? + # Do we have a good way in XRT to compare virtual datasets to xarray datasets? assert_duckarray_allclose? or just roundtrip it. # from xarray.testing import assert_duckarray_allclose # xrt.assert_allclose(ds, vds) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index c9260aa6..beaa60c8 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -49,13 +49,17 @@ def read_bytes(self, bytes: int) -> bytes: with self.open_file() as of: return of.read(bytes) + def get_mapper(self): + """Returns a mapper for use with Zarr""" + return self.fs.get_mapper(self.filepath) + def __post_init__(self) -> None: """Initialize the fsspec filesystem object""" import fsspec from upath import UPath - universal_filepath = UPath(self.filepath) - protocol = universal_filepath.protocol + self.filepath = UPath(self.filepath) + protocol = self.filepath.protocol self.reader_options = self.reader_options or {} storage_options = self.reader_options.get("storage_options", {}) # type: ignore From a047ff9e345edaaf294bc297352bfd8e6842025b Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 12 Nov 2024 11:31:01 -0700 Subject: [PATCH 06/98] Update virtualizarr/readers/zarr.py Co-authored-by: Tom Nicholas --- virtualizarr/readers/zarr.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 896545de..551a1afb 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -56,11 +56,11 @@ def open_virtual_dataset( zarr_arrays = [val for val in zg.keys()] # 2b. and check that all loadable_variables are present - assert set( - loadable_variables - ).issubset( - set(zarr_arrays) - ), f"loadable_variables ({loadable_variables}) is not a subset of variables in existing Zarr store. This zarr contains: {zarr_arrays}" + missing_vars = set(loadable_variables) - set(zarr_arrays) + if missing_vars: + raise ValueError( + f"Some loadable variables specified are not present in this zarr store: {missing_vars}" + ) # virtual variables are available variables minus drop variables & loadable variables virtual_vars = list( From f7c9a3f91b139d94d5bdf798b0796328b5629815 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 14 Nov 2024 21:42:04 -0700 Subject: [PATCH 07/98] replace fsspec ls with zarr.getsize --- virtualizarr/readers/zarr.py | 304 ++++++++++++++++++++--------------- 1 file changed, 171 insertions(+), 133 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 551a1afb..f260010a 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -4,6 +4,7 @@ import numcodecs import numpy as np +import zarr from xarray import Dataset, Index, Variable from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -46,142 +47,14 @@ def open_virtual_dataset( loadable_variables, ) - # can we avoid fsspec here if we are using zarr-python for all the reading? - - # 1b. 
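# (Context, not part of this patch: everything the reader needs up front comes
# from a read-only zarr-python group handle, as the line below shows. A minimal
# sketch with a hypothetical local store path:)
import zarr

zg = zarr.open_group("air.zarr", mode="r")
print(list(zg.keys()))      # array names in the store
print(zg.attrs.asdict())    # top-level attributes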
- - zg = zarr.open_group(filepath, mode="r") - - # 2a. Use zarr-python to list the variables in the store - zarr_arrays = [val for val in zg.keys()] - - # 2b. and check that all loadable_variables are present - missing_vars = set(loadable_variables) - set(zarr_arrays) - if missing_vars: - raise ValueError( - f"Some loadable variables specified are not present in this zarr store: {missing_vars}" - ) - - # virtual variables are available variables minus drop variables & loadable variables - virtual_vars = list( - set(zarr_arrays) - set(loadable_variables) - set(drop_variables) - ) - - virtual_variable_mapping = {} - all_array_dims = [] - - # 3. For each virtual variable: - for var in virtual_vars: - # 3a. Use zarr-python to get the attributes and the dimension names, - # and coordinate names (which come from the .zmetadata or zarr.json) - array_metadata = zg[var].metadata - - array_metadata_dict = array_metadata.to_dict() - - # extract _ARRAY_DIMENSIONS and remove it from attrs - array_dims = array_metadata_dict.get("attributes").pop("_ARRAY_DIMENSIONS") - - # should these have defaults defined and shared across readers? - # Should these have common validation for Zarr V3 codecs & such? - array_encoding = { - "chunks": array_metadata_dict.get("chunks", None), - "compressor": array_metadata_dict.get("compressor", None), - "dtype": array_metadata_dict.get("dtype", None), - "fill_value": array_metadata_dict.get("fill_value", None), - "order": array_metadata_dict.get("order", None), - } - - # 3b. - # Use zarr-python to also get the dtype and chunk grid info + everything else needed to create the virtualizarr.zarr.ZArray object (eventually we can skip this step and use a zarr-python array metadata class directly instead of virtualizarr.zarr.ZArray - array_zarray = ZArray( - shape=array_metadata_dict.get("shape", None), - chunks=array_metadata_dict.get("chunks", None), - dtype=array_metadata_dict.get("dtype", None), - fill_value=array_metadata_dict.get("fill_value", None), - order=array_metadata_dict.get("order", None), - compressor=array_metadata_dict.get("compressor", None), - filters=array_metadata_dict.get("filters", None), - zarr_format=array_metadata_dict.get("zarr_format", None), - ) - - # 3c. Use the knowledge of the store location, variable name, and the zarr format to deduce which directory / S3 prefix the chunks must live in. - # ToDo Replace fsspec w/ Zarr python for chunk size: https://github.com/zarr-developers/zarr-python/pull/2426 - - import fsspec - - array_mapper = fsspec.get_mapper(filepath + "/" + var) - - # 3d. List all the chunks in that directory using fsspec.ls(detail=True), as that should also return the nbytes of each chunk. Remember that chunks are allowed to be missing. - - # 3e. The offset of each chunk is just 0 (ignoring sharding for now), and the length is the file size fsspec returned. The paths are just all the paths fsspec listed. 
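# (Illustrative aside, not part of this patch: the structure being assembled
# here is the mapping that ChunkManifest accepts, one entry per chunk that
# actually exists on disk. The paths and byte lengths below are made up.)
from virtualizarr.manifests import ChunkManifest

example_manifest = ChunkManifest(
    {
        "0.0.0": {"path": "file:///tmp/air.zarr/air/0.0.0", "offset": 0, "length": 7448},
        "1.0.0": {"path": "file:///tmp/air.zarr/air/1.0.0", "offset": 0, "length": 7448},
    }
)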
- - # probably trying to do too much in one big dict/list comprehension - # uses fsspec.ls on the array to get a list of dicts of info including chunk size - # filters out metadata to get only chunks - # uses fsspec.utils._unstrip_protocol utility to clean up path - - # array path to use for all chunks - array_path = fsspec.utils._unstrip_protocol( - array_mapper.root, array_mapper.fs - ) - - array_chunk_sizes = { - val["name"].split("/")[-1]: { - "path": array_path, - "offset": 0, - "length": val["size"], - } - for val in array_mapper.fs.ls(array_mapper.root, detail=True) - if not val["name"].endswith((".zarray", ".zattrs", ".zgroup")) - } - - # 3f. Parse the path and length information returned by fsspec into the structure that we can pass to ChunkManifest.__init__ - # Initialize array chunk manifest from dictionary - - array_chunkmanifest = ChunkManifest(array_chunk_sizes) - - # 3g. Create a ManifestArray from our ChunkManifest and ZArray - array_manifest_array = ManifestArray( - zarray=array_zarray, chunkmanifest=array_chunkmanifest - ) - - # 3h. Wrap that ManifestArray in an xarray.Variable, using the dims and attrs we read before - array_variable = Variable( - dims=array_dims, - data=array_manifest_array, - attrs=array_metadata_dict.get("attributes", {}), - encoding=array_encoding, - ) - - virtual_variable_mapping[f"{var}"] = array_variable - - all_array_dims.extend([dim for dim in array_dims]) - - coord_names = list(set(all_array_dims)) - - # 4 Get the loadable_variables by just using xr.open_zarr on the same store (should use drop_variables to avoid handling the virtual variables that we already have). - # We want to drop 'drop_variables' but also virtual variables since we already **manifested** them. - - non_loadable_variables = list(set(virtual_vars).union(set(drop_variables))) - - # pre made func for this?! Woohoo - loadable_vars, indexes = open_loadable_vars_and_indexes( - filepath, - loadable_variables=loadable_variables, - reader_options=reader_options, - drop_variables=non_loadable_variables, - indexes=indexes, + return virtual_dataset_from_zarr_group( + filepath=filepath, group=group, + drop_variables=drop_variables, + loadable_variables=loadable_variables, decode_times=decode_times, - ) - - # 6 Merge all the variables into one xr.Dataset and return it. - return construct_virtual_dataset( - virtual_vars=virtual_variable_mapping, - loadable_vars=loadable_vars, indexes=indexes, - coord_names=coord_names, - attrs=zg.attrs.asdict(), + reader_options=reader_options, ) @@ -261,6 +134,171 @@ def open_virtual_dataset( return ds +def virtual_dataset_from_zarr_group( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, +) -> Dataset: + import zarr + + zg = zarr.open_group(filepath, mode="r") + + # 2a. Use zarr-python to list the arrays in the store + zarr_arrays = [val for val in zg.keys()] + + # 2b. 
and check that all loadable_variables are present + missing_vars = set(loadable_variables) - set(zarr_arrays) + if missing_vars: + raise ValueError( + f"Some loadable variables specified are not present in this zarr store: {missing_vars}" + ) + + # virtual variables are available variables minus drop variables & loadable variables + virtual_vars = list( + set(zarr_arrays) - set(loadable_variables) - set(drop_variables) + ) + + virtual_variable_mapping = { + f"{var}": construct_virtual_array( + zarr_group=zg, var_name=var, filepath=filepath + ) + for var in virtual_vars + } + + # list comp hell + coord_names = list( + set( + item + for tup in [ + virtual_variable_mapping[val].dims for val in virtual_variable_mapping + ] + for item in tup + ) + ) + + # 4 Get the loadable_variables by just using xr.open_zarr on the same store (should use drop_variables to avoid handling the virtual variables that we already have). + # We want to drop 'drop_variables' but also virtual variables since we already **manifested** them. + + non_loadable_variables = list(set(virtual_vars).union(set(drop_variables))) + + # pre made func for this?! Woohoo + loadable_vars, indexes = open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + drop_variables=non_loadable_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) + + # 6 Merge all the variables into one xr.Dataset and return it. + return construct_virtual_dataset( + virtual_vars=virtual_variable_mapping, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=zg.attrs.asdict(), + ) + + +def construct_chunk_key_mapping( + zarr_group: zarr.core.group.Group, array_name: str +) -> dict: + # ZARR VERSION + # how can we get this JUST for the array keys, not all + import asyncio + import pathlib + + async def get_chunk_size(chunk_key: pathlib.PosixPath) -> int: + # async get chunk size + return await zarr_group.store.getsize(chunk_key) + + async def get_chunk_paths() -> dict: + chunk_paths = {} + # Is there a way to list per array? + async for item in zarr_group.store.list(): + if not item.endswith( + (".zarray", ".zattrs", ".zgroup", ".zmetadata") + ) and item.startswith(array_name): + # dict key is created by splitting the value from store.list() by the array_name and trailing /....yuck.. + chunk_paths[item.split(array_name + "/")[-1]] = { + "path": ( + zarr_group.store.root / item + ).as_uri(), # should this be as_posix() or as_uri() + "offset": 0, + "length": await get_chunk_size(item), + } + return chunk_paths + + return asyncio.run(get_chunk_paths()) + + +def construct_virtual_array( + zarr_group, var_name, filepath +): # filepath can be removed once we remove fsspec bit + # 3a. Use zarr-python to get the attributes and the dimension names, + # and coordinate names (which come from the .zmetadata or zarr.json) + array_metadata = zarr_group[var_name].metadata + + array_metadata_dict = array_metadata.to_dict() + + # ARRAY_DIMENSIONS should be removed downstream in the icechunk writer + + if zarr_group[var_name].metadata.zarr_format == 3: + array_dims = zarr_group[var_name].metadata.dimension_names + + else: + # v2 stores + array_dims = array_metadata_dict.get("attributes").pop("_ARRAY_DIMENSIONS") + + # should these have defaults defined and shared across readers? + # Should these have common validation for Zarr V3 codecs & such? 
+ array_encoding = { + "chunks": array_metadata_dict.get("chunks", None), + "compressor": array_metadata_dict.get("compressor", None), + "dtype": array_metadata_dict.get("dtype", None), + "fill_value": array_metadata_dict.get("fill_value", None), + "order": array_metadata_dict.get("order", None), + } + + # 3b. + # Use zarr-python to also get the dtype and chunk grid info + everything else needed to create the virtualizarr.zarr.ZArray object (eventually we can skip this step and use a zarr-python array metadata class directly instead of virtualizarr.zarr.ZArray + array_zarray = ZArray( + shape=array_metadata_dict.get("shape", None), + chunks=array_metadata_dict.get("chunks", None), + dtype=array_metadata_dict.get("dtype", None), + fill_value=array_metadata_dict.get("fill_value", None), + order=array_metadata_dict.get("order", None), + compressor=array_metadata_dict.get("compressor", None), + filters=array_metadata_dict.get("filters", None), + zarr_format=array_metadata_dict.get("zarr_format", None), + ) + + array_chunk_sizes = construct_chunk_key_mapping(zarr_group, array_name=var_name) + + array_chunkmanifest = ChunkManifest(array_chunk_sizes) + + # 3g. Create a ManifestArray from our ChunkManifest and ZArray + array_manifest_array = ManifestArray( + zarray=array_zarray, chunkmanifest=array_chunkmanifest + ) + + # 3h. Wrap that ManifestArray in an xarray.Variable, using the dims and attrs we read before + array_variable = Variable( + dims=array_dims, + data=array_manifest_array, + attrs=array_metadata_dict.get("attributes", {}), + encoding=array_encoding, + ) + + return array_variable + + def attrs_from_zarr_group_json(filepath: Path) -> dict: with open(filepath) as metadata_file: attrs = json.load(metadata_file) From 20246065e81ea2a23ff37d8799c0296ea3871251 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 14 Nov 2024 21:47:16 -0700 Subject: [PATCH 08/98] lint --- virtualizarr/readers/zarr.py | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index f260010a..1affb432 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -163,9 +163,7 @@ def virtual_dataset_from_zarr_group( ) virtual_variable_mapping = { - f"{var}": construct_virtual_array( - zarr_group=zg, var_name=var, filepath=filepath - ) + f"{var}": construct_virtual_array(zarr_group=zg, var_name=var) for var in virtual_vars } @@ -206,21 +204,18 @@ def virtual_dataset_from_zarr_group( ) -def construct_chunk_key_mapping( - zarr_group: zarr.core.group.Group, array_name: str -) -> dict: - # ZARR VERSION - # how can we get this JUST for the array keys, not all +def construct_chunk_key_mapping(zarr_group: zarr.core.group, array_name: str) -> dict: import asyncio import pathlib async def get_chunk_size(chunk_key: pathlib.PosixPath) -> int: - # async get chunk size + # async get chunk size of a chunk key return await zarr_group.store.getsize(chunk_key) async def get_chunk_paths() -> dict: + # this type hint for dict is doing a lot of work. Should this be a dataclass or typed dict? chunk_paths = {} - # Is there a way to list per array? + # Is there a way to call `zarr_group.store.list()` per array? 
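        # (Hedged aside, not in this patch: zarr-python 3 stores also expose an
        # async `list_prefix` method, so a per-array listing could look like
        #     async for item in zarr_group.store.list_prefix(array_name + "/"):
        # assuming the keys it yields are addressed the same way as `list()`.)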
async for item in zarr_group.store.list(): if not item.endswith( (".zarray", ".zattrs", ".zgroup", ".zmetadata") @@ -238,22 +233,19 @@ async def get_chunk_paths() -> dict: return asyncio.run(get_chunk_paths()) -def construct_virtual_array( - zarr_group, var_name, filepath -): # filepath can be removed once we remove fsspec bit - # 3a. Use zarr-python to get the attributes and the dimension names, - # and coordinate names (which come from the .zmetadata or zarr.json) +def construct_virtual_array(zarr_group: zarr.core.Group, var_name: str): array_metadata = zarr_group[var_name].metadata array_metadata_dict = array_metadata.to_dict() - # ARRAY_DIMENSIONS should be removed downstream in the icechunk writer - if zarr_group[var_name].metadata.zarr_format == 3: array_dims = zarr_group[var_name].metadata.dimension_names else: # v2 stores + # ARRAY_DIMENSIONS should be removed downstream in the icechunk writer. + # Should we remove them here as well? + array_dims = array_metadata_dict.get("attributes").pop("_ARRAY_DIMENSIONS") # should these have defaults defined and shared across readers? @@ -266,8 +258,6 @@ def construct_virtual_array( "order": array_metadata_dict.get("order", None), } - # 3b. - # Use zarr-python to also get the dtype and chunk grid info + everything else needed to create the virtualizarr.zarr.ZArray object (eventually we can skip this step and use a zarr-python array metadata class directly instead of virtualizarr.zarr.ZArray array_zarray = ZArray( shape=array_metadata_dict.get("shape", None), chunks=array_metadata_dict.get("chunks", None), @@ -283,12 +273,10 @@ def construct_virtual_array( array_chunkmanifest = ChunkManifest(array_chunk_sizes) - # 3g. Create a ManifestArray from our ChunkManifest and ZArray array_manifest_array = ManifestArray( zarray=array_zarray, chunkmanifest=array_chunkmanifest ) - # 3h. 
Wrap that ManifestArray in an xarray.Variable, using the dims and attrs we read before array_variable = Variable( dims=array_dims, data=array_manifest_array, From 443435bd12b90d69db023f82be8a5c793d60dcac Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 15 Nov 2024 00:21:33 -0700 Subject: [PATCH 09/98] wip test_zarr --- conftest.py | 18 ------------------ virtualizarr/readers/zarr.py | 21 ++++++++++++--------- 2 files changed, 12 insertions(+), 27 deletions(-) diff --git a/conftest.py b/conftest.py index 9d067bde..55c07823 100644 --- a/conftest.py +++ b/conftest.py @@ -35,24 +35,6 @@ def netcdf4_file(tmpdir): return filepath -@pytest.fixture() -def zarr_v2_store(tmpdir): - # Set up example xarray dataset - ds = xr.tutorial.open_dataset("air_temperature", chunks={}) - # grabbing a piece and making sure there are multiple chunks present (2): Frozen({'time': (5, 5), 'lat': (9,), 'lon': (18,)}) - chunked_subset = ds.isel( - time=slice(0, 10), lat=slice(0, 9), lon=slice(0, 18) - ).chunk({"time": 5}) - - # Save it to disk as netCDF (in temporary directory) - filepath = f"{tmpdir}/air.zarr" - - chunked_subset.to_zarr(filepath, zarr_format=2) - ds.close() - - return filepath - - @pytest.fixture def netcdf4_file_with_2d_coords(tmpdir): ds = xr.tutorial.open_dataset("ROMS_example") diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 1affb432..f893552d 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -1,6 +1,6 @@ import json -from pathlib import Path -from typing import Iterable, Mapping, Optional +from pathlib import Path, PosixPath +from typing import TYPE_CHECKING, Iterable, Mapping, Optional import numcodecs import numpy as np @@ -17,6 +17,9 @@ from virtualizarr.utils import check_for_collisions from virtualizarr.zarr import ZArray +if TYPE_CHECKING: + from pathlib import PosixPath + class ZarrVirtualBackend(VirtualBackend): @staticmethod @@ -137,8 +140,8 @@ def open_virtual_dataset( def virtual_dataset_from_zarr_group( filepath: str, group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, + drop_variables: Iterable[str] | None = [], + loadable_variables: Iterable[str] | None = [], decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, reader_options: Optional[dict] = None, @@ -147,10 +150,8 @@ def virtual_dataset_from_zarr_group( zg = zarr.open_group(filepath, mode="r") - # 2a. Use zarr-python to list the arrays in the store zarr_arrays = [val for val in zg.keys()] - # 2b. 
and check that all loadable_variables are present missing_vars = set(loadable_variables) - set(zarr_arrays) if missing_vars: raise ValueError( @@ -161,7 +162,9 @@ def virtual_dataset_from_zarr_group( virtual_vars = list( set(zarr_arrays) - set(loadable_variables) - set(drop_variables) ) + import ipdb + ipdb.set_trace() virtual_variable_mapping = { f"{var}": construct_virtual_array(zarr_group=zg, var_name=var) for var in virtual_vars @@ -206,9 +209,8 @@ def virtual_dataset_from_zarr_group( def construct_chunk_key_mapping(zarr_group: zarr.core.group, array_name: str) -> dict: import asyncio - import pathlib - async def get_chunk_size(chunk_key: pathlib.PosixPath) -> int: + async def get_chunk_size(chunk_key: PosixPath) -> int: # async get chunk size of a chunk key return await zarr_group.store.getsize(chunk_key) @@ -233,7 +235,7 @@ async def get_chunk_paths() -> dict: return asyncio.run(get_chunk_paths()) -def construct_virtual_array(zarr_group: zarr.core.Group, var_name: str): +def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): array_metadata = zarr_group[var_name].metadata array_metadata_dict = array_metadata.to_dict() @@ -250,6 +252,7 @@ def construct_virtual_array(zarr_group: zarr.core.Group, var_name: str): # should these have defaults defined and shared across readers? # Should these have common validation for Zarr V3 codecs & such? + # Note! It seems like zarr v2 and v3 don't have the same array_encoding keys.. array_encoding = { "chunks": array_metadata_dict.get("chunks", None), "compressor": array_metadata_dict.get("compressor", None), From 50fd8b54b4723e561a8bfaa4c820ae203b5a8459 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 15 Nov 2024 00:23:28 -0700 Subject: [PATCH 10/98] removed pdb --- virtualizarr/readers/zarr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index f893552d..b31f2fc9 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -162,9 +162,7 @@ def virtual_dataset_from_zarr_group( virtual_vars = list( set(zarr_arrays) - set(loadable_variables) - set(drop_variables) ) - import ipdb - ipdb.set_trace() virtual_variable_mapping = { f"{var}": construct_virtual_array(zarr_group=zg, var_name=var) for var in virtual_vars From d93c9326208bca9d1425f3c0e0c4f160efb79877 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 18 Nov 2024 22:06:09 -0700 Subject: [PATCH 11/98] zarr import in type checking --- virtualizarr/readers/zarr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index b31f2fc9..a8892b8b 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -1,10 +1,11 @@ +from __future__ import annotations + import json from pathlib import Path, PosixPath from typing import TYPE_CHECKING, Iterable, Mapping, Optional import numcodecs import numpy as np -import zarr from xarray import Dataset, Index, Variable from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -20,6 +21,8 @@ if TYPE_CHECKING: from pathlib import PosixPath + import zarr + class ZarrVirtualBackend(VirtualBackend): @staticmethod From 39be1c51d11be6f8c9b727d3604c162260a62abd Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 18 Nov 2024 22:09:36 -0700 Subject: [PATCH 12/98] moved get_chunk_paths & get_chunk_size async funcs outside of construct_chunk_key_mapping func --- virtualizarr/readers/zarr.py | 48 +++++++++++++++++++----------------- 1 file changed, 25 
insertions(+), 23 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index a8892b8b..19e1e29f 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -208,32 +208,34 @@ def virtual_dataset_from_zarr_group( ) +async def get_chunk_size(zarr_group: zarr.core.group, chunk_key: PosixPath) -> int: + # async get chunk size of a chunk key + return await zarr_group.store.getsize(chunk_key) + + +async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str) -> dict: + # this type hint for dict is doing a lot of work. Should this be a dataclass or typed dict? + chunk_paths = {} + # Is there a way to call `zarr_group.store.list()` per array? + async for item in zarr_group.store.list(): + if not item.endswith( + (".zarray", ".zattrs", ".zgroup", ".zmetadata") + ) and item.startswith(array_name): + # dict key is created by splitting the value from store.list() by the array_name and trailing /....yuck.. + chunk_paths[item.split(array_name + "/")[-1]] = { + "path": ( + zarr_group.store.root / item + ).as_uri(), # should this be as_posix() or as_uri() + "offset": 0, + "length": await get_chunk_size(zarr_group, item), + } + return chunk_paths + + def construct_chunk_key_mapping(zarr_group: zarr.core.group, array_name: str) -> dict: import asyncio - async def get_chunk_size(chunk_key: PosixPath) -> int: - # async get chunk size of a chunk key - return await zarr_group.store.getsize(chunk_key) - - async def get_chunk_paths() -> dict: - # this type hint for dict is doing a lot of work. Should this be a dataclass or typed dict? - chunk_paths = {} - # Is there a way to call `zarr_group.store.list()` per array? - async for item in zarr_group.store.list(): - if not item.endswith( - (".zarray", ".zattrs", ".zgroup", ".zmetadata") - ) and item.startswith(array_name): - # dict key is created by splitting the value from store.list() by the array_name and trailing /....yuck.. - chunk_paths[item.split(array_name + "/")[-1]] = { - "path": ( - zarr_group.store.root / item - ).as_uri(), # should this be as_posix() or as_uri() - "offset": 0, - "length": await get_chunk_size(item), - } - return chunk_paths - - return asyncio.run(get_chunk_paths()) + return asyncio.run(get_chunk_paths(zarr_group, array_name)) def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): From e718240f669a615056065783da08567a3c5bed55 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 18 Nov 2024 22:13:27 -0700 Subject: [PATCH 13/98] added a few notes from PR review. --- virtualizarr/readers/zarr.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 19e1e29f..e83a5f4f 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -221,14 +221,17 @@ async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str) -> dict: if not item.endswith( (".zarray", ".zattrs", ".zgroup", ".zmetadata") ) and item.startswith(array_name): - # dict key is created by splitting the value from store.list() by the array_name and trailing /....yuck.. + # dict key is created by splitting the value from store.list() by the array_name and trailing /....yuck. 
+ # It would be great if we can ask Zarr for this: https://github.com/zarr-developers/VirtualiZarr/pull/271#discussion_r1844486393 chunk_paths[item.split(array_name + "/")[-1]] = { "path": ( zarr_group.store.root / item - ).as_uri(), # should this be as_posix() or as_uri() + ).as_uri(), # as_uri to comply with https://github.com/zarr-developers/VirtualiZarr/pull/243 "offset": 0, "length": await get_chunk_size(zarr_group, item), } + # This won't work for sharded stores: https://github.com/zarr-developers/VirtualiZarr/pull/271#discussion_r1844487578 + return chunk_paths @@ -247,10 +250,6 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): array_dims = zarr_group[var_name].metadata.dimension_names else: - # v2 stores - # ARRAY_DIMENSIONS should be removed downstream in the icechunk writer. - # Should we remove them here as well? - array_dims = array_metadata_dict.get("attributes").pop("_ARRAY_DIMENSIONS") # should these have defaults defined and shared across readers? From bbcd473c9b86e287f08d2c1dd27992d1f2eda989 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 18 Nov 2024 22:15:17 -0700 Subject: [PATCH 14/98] removed array encoding --- virtualizarr/readers/zarr.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index e83a5f4f..5265990a 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -252,17 +252,6 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): else: array_dims = array_metadata_dict.get("attributes").pop("_ARRAY_DIMENSIONS") - # should these have defaults defined and shared across readers? - # Should these have common validation for Zarr V3 codecs & such? - # Note! It seems like zarr v2 and v3 don't have the same array_encoding keys.. 
- array_encoding = { - "chunks": array_metadata_dict.get("chunks", None), - "compressor": array_metadata_dict.get("compressor", None), - "dtype": array_metadata_dict.get("dtype", None), - "fill_value": array_metadata_dict.get("fill_value", None), - "order": array_metadata_dict.get("order", None), - } - array_zarray = ZArray( shape=array_metadata_dict.get("shape", None), chunks=array_metadata_dict.get("chunks", None), @@ -286,7 +275,6 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): dims=array_dims, data=array_manifest_array, attrs=array_metadata_dict.get("attributes", {}), - encoding=array_encoding, ) return array_variable From ed9f2b445433ddabbe6cf4d239e3e9ed264b7e55 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 13:07:26 -0700 Subject: [PATCH 15/98] v2 passing, v3 skipped for now --- virtualizarr/readers/zarr.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 5265990a..b3e49bae 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -253,14 +253,14 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): array_dims = array_metadata_dict.get("attributes").pop("_ARRAY_DIMENSIONS") array_zarray = ZArray( - shape=array_metadata_dict.get("shape", None), - chunks=array_metadata_dict.get("chunks", None), - dtype=array_metadata_dict.get("dtype", None), - fill_value=array_metadata_dict.get("fill_value", None), - order=array_metadata_dict.get("order", None), - compressor=array_metadata_dict.get("compressor", None), - filters=array_metadata_dict.get("filters", None), - zarr_format=array_metadata_dict.get("zarr_format", None), + shape=array_metadata.shape, + chunks=array_metadata.chunks, + dtype=array_metadata.dtype, + fill_value=array_metadata.fill_value, + order=array_metadata.order, + compressor=array_metadata.compressor, + filters=array_metadata.filters, + zarr_format=array_metadata.zarr_format, ) array_chunk_sizes = construct_chunk_key_mapping(zarr_group, array_name=var_name) From db89da782c8e1a8cc2382942d16a25389d9e5c57 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 13:08:43 -0700 Subject: [PATCH 16/98] added missed staged files --- virtualizarr/tests/test_readers/conftest.py | 18 +++++ virtualizarr/tests/test_readers/test_zarr.py | 73 ++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 virtualizarr/tests/test_readers/conftest.py create mode 100644 virtualizarr/tests/test_readers/test_zarr.py diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py new file mode 100644 index 00000000..16ca2737 --- /dev/null +++ b/virtualizarr/tests/test_readers/conftest.py @@ -0,0 +1,18 @@ +import pytest +import xarray as xr + + +def _xarray_subset(): + ds = xr.tutorial.open_dataset("air_temperature", chunks={}) + return ds.isel(time=slice(0, 10), lat=slice(0, 9), lon=slice(0, 18)).chunk( + {"time": 5} + ) + + +@pytest.fixture(params=[2, 3]) +def zarr_store(tmpdir, request): + ds = _xarray_subset() + filepath = f"{tmpdir}/air.zarr" + ds.to_zarr(filepath, zarr_format=request.param) + ds.close() + return filepath diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py new file mode 100644 index 00000000..8b830740 --- /dev/null +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -0,0 +1,73 @@ +import pytest +import zarr + +from virtualizarr.readers.zarr import 
virtual_dataset_from_zarr_group + + +@pytest.mark.parametrize( + "zarr_store", + [ + pytest.param(2, id="Zarr V2"), + pytest.param( + 3, + id="Zarr V3", + marks=pytest.mark.skip( + reason="Need to translate metadata naming conventions/transforms" + ), + ), + ], + indirect=True, +) +def test_dataset_from_zarr(zarr_store): + zg = zarr.open_group(zarr_store) + vds = virtual_dataset_from_zarr_group(filepath=zarr_store, indexes={}) + + zg_metadata_dict = zg.metadata.to_dict() + + arrays = [val for val in zg.keys()] + + # loop through each array and check ZArray info + for array in arrays: + # shape match + assert ( + vds[array].data.zarray.shape + == zg_metadata_dict["consolidated_metadata"]["metadata"][array]["shape"] + ) + # match chunks + assert ( + vds[array].data.zarray.chunks + == zg_metadata_dict["consolidated_metadata"]["metadata"][array]["chunks"] + ) + + assert ( + vds[array].data.zarray.dtype + == zg_metadata_dict["consolidated_metadata"]["metadata"][array]["dtype"] + ) + + # Failure! fill value from zarr is None, None: ipdb> np.dtype(None): dtype('float64') is coerced in zarr.py L21 to 0.0. + # assert vds[array].data.zarray.fill_value == zg_metadata_dict['consolidated_metadata']['metadata'][array]['fill_value'] + + # match order + assert ( + vds[array].data.zarray.order + == zg_metadata_dict["consolidated_metadata"]["metadata"][array]["order"] + ) + # match compressor + assert ( + vds[array].data.zarray.compressor + == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ + "compressor" + ] + ) + # match filters + assert ( + vds[array].data.zarray.filters + == zg_metadata_dict["consolidated_metadata"]["metadata"][array]["filters"] + ) + # match format + assert ( + vds[array].data.zarray.zarr_format + == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ + "zarr_format" + ] + ) From 410b2a30f0cdccc93df398ecc31a17aa70c42568 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 13:16:49 -0700 Subject: [PATCH 17/98] missing return --- virtualizarr/tests/test_readers/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 13b4d25a..03c90bd2 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -28,6 +28,7 @@ def zarr_store(tmpdir, request): filepath = f"{tmpdir}/air.zarr" ds.to_zarr(filepath, zarr_format=request.param) ds.close() + return filepath @pytest.fixture From 8a69963d6b31fe51b09ee9a331ac416d2632bac8 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 13:21:11 -0700 Subject: [PATCH 18/98] add network --- virtualizarr/tests/test_readers/conftest.py | 2 +- virtualizarr/tests/test_readers/test_zarr.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 03c90bd2..ef5f9ca0 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -21,7 +21,7 @@ def _xarray_subset(): {"time": 5} ) - +@network @pytest.fixture(params=[2, 3]) def zarr_store(tmpdir, request): ds = _xarray_subset() diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 8b830740..054accb0 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -2,8 +2,10 @@ import zarr from virtualizarr.readers.zarr import virtual_dataset_from_zarr_group +from 
virtualizarr.tests import network +@network @pytest.mark.parametrize( "zarr_store", [ From 3fca8e6edda06168e7cc73161493630a79aa41fc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Nov 2024 20:21:24 +0000 Subject: [PATCH 19/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/tests/test_readers/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index ef5f9ca0..4cc27b0f 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -21,6 +21,7 @@ def _xarray_subset(): {"time": 5} ) + @network @pytest.fixture(params=[2, 3]) def zarr_store(tmpdir, request): From 34053b023c4c8b26479ef39027619d0eb82f8533 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 13:23:22 -0700 Subject: [PATCH 20/98] conftest fix --- virtualizarr/tests/test_readers/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 4cc27b0f..03c90bd2 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -22,7 +22,6 @@ def _xarray_subset(): ) -@network @pytest.fixture(params=[2, 3]) def zarr_store(tmpdir, request): ds = _xarray_subset() From 5c26b1f4462ec579bb413ad4c5629882f5d3f077 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 15:25:39 -0700 Subject: [PATCH 21/98] naming --- virtualizarr/tests/test_writers/test_zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_writers/test_zarr.py b/virtualizarr/tests/test_writers/test_zarr.py index 5afa87a3..19c4263b 100644 --- a/virtualizarr/tests/test_writers/test_zarr.py +++ b/virtualizarr/tests/test_writers/test_zarr.py @@ -8,7 +8,7 @@ from virtualizarr import open_virtual_dataset from virtualizarr.backend import FileType -from virtualizarr.readers.zarr_v3 import metadata_from_zarr_json +from virtualizarr.readers.zarr import metadata_from_zarr_json from virtualizarr.writers.zarr import dataset_to_zarr From fb784dc8dd0f6637a2bf7fe52a101f6435c3132d Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 15:27:57 -0700 Subject: [PATCH 22/98] comment out integration test for now --- virtualizarr/tests/test_integration.py | 95 ++++++++++---------------- 1 file changed, 35 insertions(+), 60 deletions(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 9f73ef95..e7a76a30 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -1,5 +1,3 @@ -from dataclasses import dataclass - import numpy as np import pytest import xarray as xr @@ -93,64 +91,41 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] -# we should parameterize for: -# - group -# - drop_variables -# - loadable_variables -# - store readable over cloud storage? 
-# - zarr store version v2, v3 -# testing out pytest parameterization with dataclasses :shrug: -- we can revert to a more normal style -@dataclass -class ZarrV2Param: - loadable_variables: list[str] | None - drop_variables: list[str] | None - - -ZARR_V2_PARAMS = [ - ZarrV2Param(loadable_variables=None, drop_variables=None), - ZarrV2Param(loadable_variables=["time"], drop_variables=None), - ZarrV2Param(loadable_variables=None, drop_variables=["lat", "lon"]), - ZarrV2Param(loadable_variables=["lat", "lon"], drop_variables=["time"]), -] - -# @requires_zarrV3 # we should have this, but we need the decorator to understand beta versions? - - -@pytest.mark.parametrize( - "input_params", - [inputs for inputs in ZARR_V2_PARAMS], -) -def test_zarrV2_roundtrip(zarr_v2_store, input_params): - ds = open_virtual_dataset( - zarr_v2_store, - loadable_variables=input_params.loadable_variables, - drop_variables=input_params.drop_variables, - indexes={}, - ) - - # THIS FAILS! TypeError: np.float32(nan) is not JSON serializable - # Question: How do we handle this fill value: fill_value=np.float32(nan) - ds_refs = ds.virtualize.to_kerchunk(format="dict") - - # tmp fix if you want to override the fill vals! - ds.lat.data.zarray.fill_value = float("nan") - ds.time.data.zarray.fill_value = float("nan") - ds.lon.data.zarray.fill_value = float("nan") - - # Use dataset_from_kerchunk_refs to reconstruct the dataset - roundtrip = dataset_from_kerchunk_refs(ds_refs) - - # Assert equal to original dataset - xrt.assert_equal(roundtrip, ds) - - # assert vds has: - # loadable vars are np arrays? - # drop vars are not present - # virtual vars are manifest arrays, not loaded arrays - - # Do we have a good way in XRT to compare virtual datasets to xarray datasets? assert_duckarray_allclose? or just roundtrip it. - # from xarray.testing import assert_duckarray_allclose - # xrt.assert_allclose(ds, vds) +# @pytest.mark.parametrize( +# "input_params", +# [inputs for inputs in ZARR_V2_PARAMS], +# ) +# def test_zarrV2_roundtrip(zarr_v2_store, input_params): +# ds = open_virtual_dataset( +# zarr_v2_store, +# loadable_variables=input_params.loadable_variables, +# drop_variables=input_params.drop_variables, +# indexes={}, +# ) + +# # THIS FAILS! TypeError: np.float32(nan) is not JSON serializable +# # Question: How do we handle this fill value: fill_value=np.float32(nan) +# ds_refs = ds.virtualize.to_kerchunk(format="dict") + +# # tmp fix if you want to override the fill vals! +# ds.lat.data.zarray.fill_value = float("nan") +# ds.time.data.zarray.fill_value = float("nan") +# ds.lon.data.zarray.fill_value = float("nan") + +# # Use dataset_from_kerchunk_refs to reconstruct the dataset +# roundtrip = dataset_from_kerchunk_refs(ds_refs) + +# # Assert equal to original dataset +# xrt.assert_equal(roundtrip, ds) + +# # assert vds has: +# # loadable vars are np arrays? +# # drop vars are not present +# # virtual vars are manifest arrays, not loaded arrays + +# # Do we have a good way in XRT to compare virtual datasets to xarray datasets? assert_duckarray_allclose? or just roundtrip it. 
+# # from xarray.testing import assert_duckarray_allclose +# # xrt.assert_allclose(ds, vds) @requires_kerchunk From 0444fd4e903a82df5c3473ce4f374c82b81b5723 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 18:06:01 -0700 Subject: [PATCH 23/98] refactored test_dataset_from_zarr ZArray tests --- virtualizarr/tests/test_readers/test_zarr.py | 55 +++++--------------- 1 file changed, 14 insertions(+), 41 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 054accb0..62268df1 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -27,49 +27,22 @@ def test_dataset_from_zarr(zarr_store): zg_metadata_dict = zg.metadata.to_dict() arrays = [val for val in zg.keys()] - + zarray_checks = [ + "shape", + "chunks", + "dtype", + "order", + "compressor", + "filters", + "zarr_format", + ] # "dtype" # loop through each array and check ZArray info for array in arrays: - # shape match - assert ( - vds[array].data.zarray.shape - == zg_metadata_dict["consolidated_metadata"]["metadata"][array]["shape"] - ) - # match chunks - assert ( - vds[array].data.zarray.chunks - == zg_metadata_dict["consolidated_metadata"]["metadata"][array]["chunks"] - ) - - assert ( - vds[array].data.zarray.dtype - == zg_metadata_dict["consolidated_metadata"]["metadata"][array]["dtype"] - ) + for attr in zarray_checks: + assert ( + getattr(vds[array].data.zarray, attr) + == zg_metadata_dict["consolidated_metadata"]["metadata"][array][attr] + ) # Failure! fill value from zarr is None, None: ipdb> np.dtype(None): dtype('float64') is coerced in zarr.py L21 to 0.0. # assert vds[array].data.zarray.fill_value == zg_metadata_dict['consolidated_metadata']['metadata'][array]['fill_value'] - - # match order - assert ( - vds[array].data.zarray.order - == zg_metadata_dict["consolidated_metadata"]["metadata"][array]["order"] - ) - # match compressor - assert ( - vds[array].data.zarray.compressor - == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ - "compressor" - ] - ) - # match filters - assert ( - vds[array].data.zarray.filters - == zg_metadata_dict["consolidated_metadata"]["metadata"][array]["filters"] - ) - # match format - assert ( - vds[array].data.zarray.zarr_format - == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ - "zarr_format" - ] - ) From 66fd45648e2df1b7e8ec9ed74218f631ea06f71a Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 18:33:42 -0700 Subject: [PATCH 24/98] adds zarr v3 req opt --- virtualizarr/tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 764609d3..06e50837 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -37,7 +37,7 @@ def _importorskip( has_s3fs, requires_s3fs = _importorskip("s3fs") has_scipy, requires_scipy = _importorskip("scipy") has_tifffile, requires_tifffile = _importorskip("tifffile") -has_zarrV3, requires_zarrV3 = _importorskip("zarr", minversion="3.0.0") +has_zarrV3, requires_zarrV3 = _importorskip("zarr", minversion="2.99.0") has_imagecodecs, requires_imagecodecs = _importorskip("imagecodecs") has_hdf5plugin, requires_hdf5plugin = _importorskip("hdf5plugin") From 13fce097635cec2522f1f064429b044aa35a24ac Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 18:35:24 -0700 Subject: [PATCH 25/98] zarr_v3 decorator --- virtualizarr/tests/test_readers/test_zarr.py | 3 ++- 1 
file changed, 2 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 62268df1..dc25227f 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -2,9 +2,10 @@ import zarr from virtualizarr.readers.zarr import virtual_dataset_from_zarr_group -from virtualizarr.tests import network +from virtualizarr.tests import network, requires_zarrV3 +@requires_zarrV3 @network @pytest.mark.parametrize( "zarr_store", From c36962df8de8702a5afd3144c4a74da3f5c9281f Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 19 Nov 2024 18:48:24 -0700 Subject: [PATCH 26/98] add more tests --- virtualizarr/tests/test_readers/test_zarr.py | 24 ++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index dc25227f..0de1ffe6 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -1,6 +1,7 @@ import pytest import zarr +from virtualizarr.manifests import ManifestArray from virtualizarr.readers.zarr import virtual_dataset_from_zarr_group from virtualizarr.tests import network, requires_zarrV3 @@ -15,7 +16,7 @@ 3, id="Zarr V3", marks=pytest.mark.skip( - reason="Need to translate metadata naming conventions/transforms" + reason="ToDo/WIP: Need to translate metadata naming conventions/transforms" ), ), ], @@ -26,7 +27,21 @@ def test_dataset_from_zarr(zarr_store): vds = virtual_dataset_from_zarr_group(filepath=zarr_store, indexes={}) zg_metadata_dict = zg.metadata.to_dict() + non_var_arrays = ["time", "lat", "lon"] + # check dims and coords are present + assert set(vds.coords) == set(non_var_arrays) + assert set(vds.dims) == set(non_var_arrays) + # check vars match + assert set(vds.keys()) == set(["air"]) + # arrays are ManifestArrays + for array in list(vds): + assert isinstance(vds[array].data, ManifestArray) + + # check top level attrs + assert zg.attrs.asdict() == vds.attrs + + # check ZArray values arrays = [val for val in zg.keys()] zarray_checks = [ "shape", @@ -37,6 +52,10 @@ def test_dataset_from_zarr(zarr_store): "filters", "zarr_format", ] # "dtype" + + # Failure! fill value from zarr is None, None: ipdb> np.dtype(None): dtype('float64') is coerced in zarr.py L21 to 0.0. + # assert vds[array].data.zarray.fill_value == zg_metadata_dict['consolidated_metadata']['metadata'][array]['fill_value'] + # loop through each array and check ZArray info for array in arrays: for attr in zarray_checks: @@ -44,6 +63,3 @@ def test_dataset_from_zarr(zarr_store): getattr(vds[array].data.zarray, attr) == zg_metadata_dict["consolidated_metadata"]["metadata"][array][attr] ) - - # Failure! fill value from zarr is None, None: ipdb> np.dtype(None): dtype('float64') is coerced in zarr.py L21 to 0.0. 
- # assert vds[array].data.zarray.fill_value == zg_metadata_dict['consolidated_metadata']['metadata'][array]['fill_value'] From 4be4906a0b2d863eabf368ef027373975e58cce4 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 21 Nov 2024 10:27:41 -0700 Subject: [PATCH 27/98] wip --- virtualizarr/readers/zarr.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index b3e49bae..305cbc38 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -213,16 +213,19 @@ async def get_chunk_size(zarr_group: zarr.core.group, chunk_key: PosixPath) -> i return await zarr_group.store.getsize(chunk_key) +async def chunk_exists(zarr_group: zarr.core.group, chunk_key: PosixPath) -> bool: + return await zarr_group.store.exists(chunk_key) + + async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str) -> dict: - # this type hint for dict is doing a lot of work. Should this be a dataclass or typed dict? chunk_paths = {} # Is there a way to call `zarr_group.store.list()` per array? async for item in zarr_group.store.list(): - if not item.endswith( - (".zarray", ".zattrs", ".zgroup", ".zmetadata") - ) and item.startswith(array_name): - # dict key is created by splitting the value from store.list() by the array_name and trailing /....yuck. - # It would be great if we can ask Zarr for this: https://github.com/zarr-developers/VirtualiZarr/pull/271#discussion_r1844486393 + if ( + not item.endswith((".zarray", ".zattrs", ".zgroup", ".zmetadata")) + and item.startswith(array_name) + and chunk_exists(zarr_group=zarr_group, chunk_key=item) + ): chunk_paths[item.split(array_name + "/")[-1]] = { "path": ( zarr_group.store.root / item From ca5ff322123867190f6b8864b5b065aeeafdd803 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 21 Nov 2024 10:34:50 -0700 Subject: [PATCH 28/98] adds missing await --- virtualizarr/readers/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 305cbc38..812f079f 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -224,7 +224,7 @@ async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str) -> dict: if ( not item.endswith((".zarray", ".zattrs", ".zgroup", ".zmetadata")) and item.startswith(array_name) - and chunk_exists(zarr_group=zarr_group, chunk_key=item) + and await chunk_exists(zarr_group=zarr_group, chunk_key=item) ): chunk_paths[item.split(array_name + "/")[-1]] = { "path": ( From 88cbecab36dab4bfee284dd441649d180c935a53 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 21 Nov 2024 11:10:17 -0700 Subject: [PATCH 29/98] more tests --- virtualizarr/tests/test_readers/test_zarr.py | 93 ++++++++++++-------- 1 file changed, 57 insertions(+), 36 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 0de1ffe6..3dc112b5 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -1,8 +1,9 @@ +import numpy as np import pytest import zarr +from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ManifestArray -from virtualizarr.readers.zarr import virtual_dataset_from_zarr_group from virtualizarr.tests import network, requires_zarrV3 @@ -22,44 +23,64 @@ ], indirect=True, ) -def test_dataset_from_zarr(zarr_store): - zg = zarr.open_group(zarr_store) - vds = 
virtual_dataset_from_zarr_group(filepath=zarr_store, indexes={}) +class TestOpenVirtualDatasetZarr: + def test_loadable_variables(self, zarr_store, loadable_variables=["time", "air"]): + # check that loadable variables works + vds = open_virtual_dataset( + filepath=zarr_store, loadable_variables=loadable_variables, indexes={} + ) + assert isinstance(vds["time"].data, np.ndarray) + assert isinstance(vds["air"].data, np.ndarray) - zg_metadata_dict = zg.metadata.to_dict() - non_var_arrays = ["time", "lat", "lon"] - # check dims and coords are present - assert set(vds.coords) == set(non_var_arrays) - assert set(vds.dims) == set(non_var_arrays) - # check vars match - assert set(vds.keys()) == set(["air"]) + def test_drop_variables(self, zarr_store, drop_variables=["air"]): + # check variable is dropped + vds = open_virtual_dataset( + filepath=zarr_store, drop_variables=drop_variables, indexes={} + ) + assert len(vds.data_vars) == 0 - # arrays are ManifestArrays - for array in list(vds): - assert isinstance(vds[array].data, ManifestArray) + def test_virtual_dataset_from_zarr_group(self, zarr_store): + # check that loadable variables works - # check top level attrs - assert zg.attrs.asdict() == vds.attrs + zg = zarr.open_group(zarr_store) + vds = open_virtual_dataset(filepath=zarr_store, indexes={}) - # check ZArray values - arrays = [val for val in zg.keys()] - zarray_checks = [ - "shape", - "chunks", - "dtype", - "order", - "compressor", - "filters", - "zarr_format", - ] # "dtype" + zg_metadata_dict = zg.metadata.to_dict() + non_var_arrays = ["time", "lat", "lon"] + # check dims and coords are present + assert set(vds.coords) == set(non_var_arrays) + assert set(vds.dims) == set(non_var_arrays) + # check vars match + assert set(vds.keys()) == set(["air"]) - # Failure! fill value from zarr is None, None: ipdb> np.dtype(None): dtype('float64') is coerced in zarr.py L21 to 0.0. - # assert vds[array].data.zarray.fill_value == zg_metadata_dict['consolidated_metadata']['metadata'][array]['fill_value'] + # arrays are ManifestArrays + for array in list(vds): + assert isinstance(vds[array].data, ManifestArray) - # loop through each array and check ZArray info - for array in arrays: - for attr in zarray_checks: - assert ( - getattr(vds[array].data.zarray, attr) - == zg_metadata_dict["consolidated_metadata"]["metadata"][array][attr] - ) + # check top level attrs + assert zg.attrs.asdict() == vds.attrs + + # check ZArray values + arrays = [val for val in zg.keys()] + zarray_checks = [ + "shape", + "chunks", + "dtype", + "order", + "compressor", + "filters", + "zarr_format", + ] # "dtype" + + # Failure! fill value from zarr is None, None: ipdb> np.dtype(None): dtype('float64') is coerced in zarr.py L21 to 0.0. 
+ # assert vds[array].data.zarray.fill_value == zg_metadata_dict['consolidated_metadata']['metadata'][array]['fill_value'] + + # loop through each array and check ZArray info + for array in arrays: + for attr in zarray_checks: + assert ( + getattr(vds[array].data.zarray, attr) + == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ + attr + ] + ) From 1fbdc9c4b12e16519fcded0d1882b741b7e205f9 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 21 Nov 2024 11:23:17 -0700 Subject: [PATCH 30/98] wip --- virtualizarr/tests/test_readers/test_zarr.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 3dc112b5..7f7a626e 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -70,12 +70,8 @@ def test_virtual_dataset_from_zarr_group(self, zarr_store): "compressor", "filters", "zarr_format", - ] # "dtype" - - # Failure! fill value from zarr is None, None: ipdb> np.dtype(None): dtype('float64') is coerced in zarr.py L21 to 0.0. - # assert vds[array].data.zarray.fill_value == zg_metadata_dict['consolidated_metadata']['metadata'][array]['fill_value'] - - # loop through each array and check ZArray info + "dtype", + ] for array in arrays: for attr in zarray_checks: assert ( From 370621f3b0d3287ff669daa22ae3b94ba775266e Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 21 Nov 2024 12:43:54 -0700 Subject: [PATCH 31/98] wip on v3 --- virtualizarr/readers/zarr.py | 65 ++++++++++++++------ virtualizarr/tests/test_readers/test_zarr.py | 6 +- 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 812f079f..82e8b68e 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -247,24 +247,12 @@ def construct_chunk_key_mapping(zarr_group: zarr.core.group, array_name: str) -> def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): array_metadata = zarr_group[var_name].metadata - array_metadata_dict = array_metadata.to_dict() - - if zarr_group[var_name].metadata.zarr_format == 3: - array_dims = zarr_group[var_name].metadata.dimension_names - + array_dims = array_metadata.dimension_names + attrs = array_metadata.attributes + if array_metadata.zarr_format == 3: + array_zarray = _parse_zarr_v3_metadata(metadata=array_metadata) else: - array_dims = array_metadata_dict.get("attributes").pop("_ARRAY_DIMENSIONS") - - array_zarray = ZArray( - shape=array_metadata.shape, - chunks=array_metadata.chunks, - dtype=array_metadata.dtype, - fill_value=array_metadata.fill_value, - order=array_metadata.order, - compressor=array_metadata.compressor, - filters=array_metadata.filters, - zarr_format=array_metadata.zarr_format, - ) + array_zarray = _parse_zarr_v2_metadata(metadata=array_metadata) array_chunk_sizes = construct_chunk_key_mapping(zarr_group, array_name=var_name) @@ -277,12 +265,52 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): array_variable = Variable( dims=array_dims, data=array_manifest_array, - attrs=array_metadata_dict.get("attributes", {}), + attrs=attrs, ) return array_variable +def _parse_zarr_v2_metadata(metadata: zarr.core.group.GroupMetadata) -> ZArray: + return ZArray( + shape=metadata.shape, + chunks=metadata.chunks, + dtype=metadata.dtype, + fill_value=metadata.fill_value, + order="C", + compressor=metadata.compressor, + filters=metadata.filters, + 
zarr_format=metadata.zarr_format, + ) + + +def _parse_zarr_v3_metadata(metadata: zarr.core.group.GroupMetadata) -> ZArray: + if metadata.fill_value is None: + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) + else: + fill_value = metadata.fill_value + all_codecs = [ + codec + for codec in metadata.to_dict()["codecs"] + if codec["configuration"]["endian"] not in ("transpose", "bytes") + ] + compressor, *filters = [ + _configurable_to_num_codec_config(_filter) for _filter in all_codecs + ] + return ZArray( + chunks=metadata.chunk_grid.chunk_shape, + compressor=compressor, + dtype=np.dtype(metadata.data_type), + fill_value=fill_value, + filters=filters or None, + order="C", + shape=metadata.shape, + zarr_format=metadata.zarr_format, + ) + + def attrs_from_zarr_group_json(filepath: Path) -> dict: with open(filepath) as metadata_file: attrs = json.load(metadata_file) @@ -348,4 +376,5 @@ def _configurable_to_num_codec_config(configurable: dict) -> dict: if codec_id.startswith("numcodecs."): codec_id = codec_id[len("numcodecs.") :] configuration = configurable_copy.pop("configuration") + return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 7f7a626e..c1986e4f 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -16,9 +16,9 @@ pytest.param( 3, id="Zarr V3", - marks=pytest.mark.skip( - reason="ToDo/WIP: Need to translate metadata naming conventions/transforms" - ), + # marks=pytest.mark.skip( + # reason="ToDo/WIP: Need to translate metadata naming conventions/transforms" + # ), ), ], indirect=True, From 9bb0653348c19d1703e254f27028f038bfcd3e83 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 21 Nov 2024 12:44:55 -0700 Subject: [PATCH 32/98] add note + xfail v3 --- virtualizarr/tests/test_readers/test_zarr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index c1986e4f..1bf9c0b8 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -16,9 +16,9 @@ pytest.param( 3, id="Zarr V3", - # marks=pytest.mark.skip( - # reason="ToDo/WIP: Need to translate metadata naming conventions/transforms" - # ), + marks=pytest.mark.xfail( + reason="Need to parse codecs into filters/compressors" + ), ), ], indirect=True, From 7e03ea5cda0e6458c6d9be0057c75ede5082ecf8 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 21 Nov 2024 12:50:32 -0700 Subject: [PATCH 33/98] tmp run network --- .github/workflows/upstream.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index 9140896b..22c3215b 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -48,7 +48,7 @@ jobs: - name: Running Tests run: | - python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose + python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.4 From 5c1e33102d8f15db51f5bca48c13e85724b4db0a Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 21 Nov 2024 12:52:16 -0700 Subject: [PATCH 34/98] revert --- .github/workflows/upstream.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
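
The `_configurable_to_num_codec_config` helper above turns a Zarr v3 `name`/`configuration` codec entry into a numcodecs config dict. A minimal sketch of that round trip, using a hypothetical `numcodecs.zlib` entry (the values are illustrative, not taken from these patches):

```python
import numcodecs

# hypothetical v3 codec entry, shaped like those in a zarr.json "codecs" list
configurable = {"name": "numcodecs.zlib", "configuration": {"level": 1}}

codec_id = configurable["name"]
if codec_id.startswith("numcodecs."):
    # strip the prefix so numcodecs recognises the id
    codec_id = codec_id[len("numcodecs.") :]

# round-trip through numcodecs to get its canonical config dict
codec = numcodecs.get_codec({"id": codec_id, **configurable["configuration"]})
print(codec.get_config())  # {'id': 'zlib', 'level': 1}
```
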
diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index 22c3215b..20a20c95 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -48,7 +48,7 @@ jobs: - name: Running Tests run: | - python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose + python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.4 From 9404625ba16af5590a7ebcc6277009873093d5d2 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 22 Nov 2024 13:56:33 -0700 Subject: [PATCH 35/98] update construct_virtual_array ordering --- virtualizarr/readers/zarr.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 82e8b68e..2133a403 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -246,13 +246,17 @@ def construct_chunk_key_mapping(zarr_group: zarr.core.group, array_name: str) -> def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): array_metadata = zarr_group[var_name].metadata - - array_dims = array_metadata.dimension_names attrs = array_metadata.attributes - if array_metadata.zarr_format == 3: + + if array_metadata.zarr_format == 2: + array_zarray = _parse_zarr_v2_metadata(metadata=array_metadata) + array_dims = attrs["_ARRAY_DIMENSIONS"] + elif array_metadata.zarr_format == 3: array_zarray = _parse_zarr_v3_metadata(metadata=array_metadata) + array_dims = array_metadata.dimension_names + else: - array_zarray = _parse_zarr_v2_metadata(metadata=array_metadata) + raise NotImplementedError("Zarr format is not recognized as v2 or v3.") array_chunk_sizes = construct_chunk_key_mapping(zarr_group, array_name=var_name) From cc7d68c5f93b47e2c77627ec3a59e373fa03ead2 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 3 Dec 2024 11:16:33 -0700 Subject: [PATCH 36/98] updated ABC after merge --- virtualizarr/readers/zarr.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 2133a403..38488497 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -27,34 +27,36 @@ class ZarrVirtualBackend(VirtualBackend): @staticmethod def open_virtual_dataset( - filepath: str, + path: str, group: str | None = None, drop_variables: Iterable[str] | None = None, loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: """ Create a virtual dataset from an existing Zarr store """ + if virtual_backend_kwargs: + raise NotImplementedError( + "Zarr reader does not understand any virtual_backend_kwargs" + ) - # check that Zarr is V3 - # 1a import zarr from packaging import version if version.parse(zarr.__version__).major < 3: raise ImportError("Zarr V3 is required") - # check_for_collisions will convert them to an empty list drop_variables, loadable_variables = check_for_collisions( drop_variables, loadable_variables, ) return virtual_dataset_from_zarr_group( - filepath=filepath, + filepath=path, group=group, drop_variables=drop_variables, loadable_variables=loadable_variables, From ac105eae2a405526231d8ce76894c27b940c873b Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 9 Dec 2024 11:17:39 -0700 Subject: [PATCH 37/98] wip --- virtualizarr/readers/zarr.py | 94 
++++++++++++-------- virtualizarr/tests/test_readers/test_zarr.py | 21 +++-- 2 files changed, 70 insertions(+), 45 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 38488497..3a131c9a 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -212,6 +212,7 @@ def virtual_dataset_from_zarr_group( async def get_chunk_size(zarr_group: zarr.core.group, chunk_key: PosixPath) -> int: # async get chunk size of a chunk key + return await zarr_group.store.getsize(chunk_key) @@ -219,48 +220,64 @@ async def chunk_exists(zarr_group: zarr.core.group, chunk_key: PosixPath) -> boo return await zarr_group.store.exists(chunk_key) -async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str) -> dict: +async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str, zarr_version: int) -> dict: chunk_paths = {} # Is there a way to call `zarr_group.store.list()` per array? + async for item in zarr_group.store.list(): if ( - not item.endswith((".zarray", ".zattrs", ".zgroup", ".zmetadata")) + not item.endswith((".zarray", ".zattrs", ".zgroup", ".zmetadata", ".json")) and item.startswith(array_name) and await chunk_exists(zarr_group=zarr_group, chunk_key=item) ): - chunk_paths[item.split(array_name + "/")[-1]] = { + + if zarr_version == 2: + # split on array name + trailing slash + chunk_key = item.split(array_name + "/")[-1] + elif zarr_version == 3: + # In v3 we remove the /c/ 'chunks' part of the key and + # replace trailing slashes with '.' to conform to ChunkManifest validation + chunk_key = item.split(array_name + "/")[-1].split('c/')[-1].replace('/','.') + + else: + raise NotImplementedError(f"{zarr_version} not 2 or 3.") + chunk_paths[chunk_key] = { "path": ( zarr_group.store.root / item ).as_uri(), # as_uri to comply with https://github.com/zarr-developers/VirtualiZarr/pull/243 "offset": 0, "length": await get_chunk_size(zarr_group, item), } + # This won't work for sharded stores: https://github.com/zarr-developers/VirtualiZarr/pull/271#discussion_r1844487578 return chunk_paths -def construct_chunk_key_mapping(zarr_group: zarr.core.group, array_name: str) -> dict: +def construct_chunk_key_mapping(zarr_group: zarr.core.group, array_name: str, zarr_version: int) -> dict: import asyncio - return asyncio.run(get_chunk_paths(zarr_group, array_name)) + return asyncio.run(get_chunk_paths(zarr_group=zarr_group, array_name=array_name, zarr_version=zarr_version)) def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): - array_metadata = zarr_group[var_name].metadata - attrs = array_metadata.attributes + zarr_array = zarr_group[var_name] + + attrs = zarr_array.metadata.attributes + - if array_metadata.zarr_format == 2: - array_zarray = _parse_zarr_v2_metadata(metadata=array_metadata) + + if zarr_array.metadata.zarr_format == 2: + array_zarray = _parse_zarr_v2_metadata(zarr_array=zarr_array) array_dims = attrs["_ARRAY_DIMENSIONS"] - elif array_metadata.zarr_format == 3: - array_zarray = _parse_zarr_v3_metadata(metadata=array_metadata) - array_dims = array_metadata.dimension_names + elif zarr_array.metadata.zarr_format == 3: + array_zarray = _parse_zarr_v3_metadata(zarr_array=zarr_array) + array_dims = zarr_array.metadata.dimension_names else: raise NotImplementedError("Zarr format is not recognized as v2 or v3.") - array_chunk_sizes = construct_chunk_key_mapping(zarr_group, array_name=var_name) + array_chunk_sizes = construct_chunk_key_mapping(zarr_group, array_name=var_name, 
zarr_version=zarr_array.metadata.zarr_format) array_chunkmanifest = ChunkManifest(array_chunk_sizes) @@ -277,43 +294,46 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): return array_variable -def _parse_zarr_v2_metadata(metadata: zarr.core.group.GroupMetadata) -> ZArray: +def _parse_zarr_v2_metadata(zarr_array: zarr.core.array.Array) -> ZArray: return ZArray( - shape=metadata.shape, - chunks=metadata.chunks, - dtype=metadata.dtype, - fill_value=metadata.fill_value, + shape=zarr_array.metadata.shape, + chunks=zarr_array.metadata.chunks, + dtype=zarr_array.metadata.dtype, + fill_value=zarr_array.metadata.fill_value, order="C", - compressor=metadata.compressor, - filters=metadata.filters, - zarr_format=metadata.zarr_format, + compressor=zarr_array.metadata.compressor, + filters=zarr_array.metadata.filters, + zarr_format=zarr_array.metadata.zarr_format, ) -def _parse_zarr_v3_metadata(metadata: zarr.core.group.GroupMetadata) -> ZArray: - if metadata.fill_value is None: +def _parse_zarr_v3_metadata(zarr_array: zarr.core.array.Array) -> ZArray: + from virtualizarr.codecs import get_codecs + + if zarr_array.metadata.fill_value is None: raise ValueError( "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" ) else: - fill_value = metadata.fill_value - all_codecs = [ - codec - for codec in metadata.to_dict()["codecs"] - if codec["configuration"]["endian"] not in ("transpose", "bytes") - ] - compressor, *filters = [ - _configurable_to_num_codec_config(_filter) for _filter in all_codecs - ] + fill_value = zarr_array.metadata.fill_value + + # Codecs from test looks like: (BytesCodec(endian=),) + # Questions: What do we do with endian info? + codecs = get_codecs(zarr_array) + + # Q: Are these ever in codecs? 
+ compressor = getattr(codecs[0], "compressor", None) + filters = getattr(codecs[0], "filters", None) + return ZArray( - chunks=metadata.chunk_grid.chunk_shape, + chunks=zarr_array.metadata.chunk_grid.chunk_shape, compressor=compressor, - dtype=np.dtype(metadata.data_type), + dtype=zarr_array.metadata.data_type.name, fill_value=fill_value, - filters=filters or None, + filters=filters, order="C", - shape=metadata.shape, - zarr_format=metadata.zarr_format, + shape=zarr_array.metadata.shape, + zarr_format=zarr_array.metadata.zarr_format, ) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 1bf9c0b8..4bf9253f 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -16,9 +16,9 @@ pytest.param( 3, id="Zarr V3", - marks=pytest.mark.xfail( - reason="Need to parse codecs into filters/compressors" - ), + # marks=pytest.mark.xfail( + # reason="Need to parse codecs into filters/compressors" + # ), ), ], indirect=True, @@ -44,7 +44,6 @@ def test_virtual_dataset_from_zarr_group(self, zarr_store): zg = zarr.open_group(zarr_store) vds = open_virtual_dataset(filepath=zarr_store, indexes={}) - zg_metadata_dict = zg.metadata.to_dict() non_var_arrays = ["time", "lat", "lon"] # check dims and coords are present @@ -64,7 +63,7 @@ def test_virtual_dataset_from_zarr_group(self, zarr_store): arrays = [val for val in zg.keys()] zarray_checks = [ "shape", - "chunks", + # "chunks", "dtype", "order", "compressor", @@ -74,9 +73,15 @@ def test_virtual_dataset_from_zarr_group(self, zarr_store): ] for array in arrays: for attr in zarray_checks: + import ipdb; ipdb.set_trace() + + # for v3: + # schema is diff for + # chunks: zg_metadata_dict["consolidated_metadata"]["metadata"][array]['chunk_grid']['configuration']['chunk_shape'] + # + + assert ( getattr(vds[array].data.zarray, attr) - == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ - attr - ] + == zg_metadata_dict["consolidated_metadata"]["metadata"][array][attr] ) From ff01c92167d700b7f6ab29f70d30ab89814e97fb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 21:48:52 +0000 Subject: [PATCH 38/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/readers/zarr.py | 27 +++++++++++++------- virtualizarr/tests/test_readers/test_zarr.py | 13 ++++++---- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 3a131c9a..4e64ad69 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -220,7 +220,9 @@ async def chunk_exists(zarr_group: zarr.core.group, chunk_key: PosixPath) -> boo return await zarr_group.store.exists(chunk_key) -async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str, zarr_version: int) -> dict: +async def get_chunk_paths( + zarr_group: zarr.core.group, array_name: str, zarr_version: int +) -> dict: chunk_paths = {} # Is there a way to call `zarr_group.store.list()` per array? 
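
On the question in the comment above (listing keys per array rather than for the whole store): zarr-python's v3 store API also has a `list_prefix` method, which would avoid scanning every key. A sketch under that assumption, reusing the metadata filter from `get_chunk_paths`; note that whether the yielded keys include the prefix itself has varied across zarr-python versions:

```python
async def list_array_keys(zarr_group, array_name: str) -> list[str]:
    # list_prefix only yields keys under e.g. "air/", so the
    # item.startswith(array_name) check above becomes unnecessary
    return [
        key
        async for key in zarr_group.store.list_prefix(f"{array_name}/")
        # metadata objects still need to be filtered out
        if not key.endswith((".zarray", ".zattrs", ".zgroup", ".zmetadata", ".json"))
    ]
```
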
@@ -230,15 +232,16 @@ async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str, zarr_ver and item.startswith(array_name) and await chunk_exists(zarr_group=zarr_group, chunk_key=item) ): - if zarr_version == 2: # split on array name + trailing slash chunk_key = item.split(array_name + "/")[-1] elif zarr_version == 3: # In v3 we remove the /c/ 'chunks' part of the key and # replace trailing slashes with '.' to conform to ChunkManifest validation - chunk_key = item.split(array_name + "/")[-1].split('c/')[-1].replace('/','.') - + chunk_key = ( + item.split(array_name + "/")[-1].split("c/")[-1].replace("/", ".") + ) + else: raise NotImplementedError(f"{zarr_version} not 2 or 3.") chunk_paths[chunk_key] = { @@ -254,10 +257,16 @@ async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str, zarr_ver return chunk_paths -def construct_chunk_key_mapping(zarr_group: zarr.core.group, array_name: str, zarr_version: int) -> dict: +def construct_chunk_key_mapping( + zarr_group: zarr.core.group, array_name: str, zarr_version: int +) -> dict: import asyncio - return asyncio.run(get_chunk_paths(zarr_group=zarr_group, array_name=array_name, zarr_version=zarr_version)) + return asyncio.run( + get_chunk_paths( + zarr_group=zarr_group, array_name=array_name, zarr_version=zarr_version + ) + ) def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): @@ -265,8 +274,6 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): attrs = zarr_array.metadata.attributes - - if zarr_array.metadata.zarr_format == 2: array_zarray = _parse_zarr_v2_metadata(zarr_array=zarr_array) array_dims = attrs["_ARRAY_DIMENSIONS"] @@ -277,7 +284,9 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): else: raise NotImplementedError("Zarr format is not recognized as v2 or v3.") - array_chunk_sizes = construct_chunk_key_mapping(zarr_group, array_name=var_name, zarr_version=zarr_array.metadata.zarr_format) + array_chunk_sizes = construct_chunk_key_mapping( + zarr_group, array_name=var_name, zarr_version=zarr_array.metadata.zarr_format + ) array_chunkmanifest = ChunkManifest(array_chunk_sizes) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 4bf9253f..1385c930 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -73,15 +73,18 @@ def test_virtual_dataset_from_zarr_group(self, zarr_store): ] for array in arrays: for attr in zarray_checks: - import ipdb; ipdb.set_trace() + import ipdb + + ipdb.set_trace() # for v3: - # schema is diff for + # schema is diff for # chunks: zg_metadata_dict["consolidated_metadata"]["metadata"][array]['chunk_grid']['configuration']['chunk_shape'] - # + # - assert ( getattr(vds[array].data.zarray, attr) - == zg_metadata_dict["consolidated_metadata"]["metadata"][array][attr] + == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ + attr + ] ) From 4f2470a057ac4d2eda5c0d6ea3263a53558a2451 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 10 Dec 2024 16:26:14 -0700 Subject: [PATCH 39/98] working for v2 and v3, but only local --- ci/min-deps.yml | 1 + virtualizarr/manifests/manifest.py | 3 +- virtualizarr/readers/zarr.py | 40 +++++++++---- virtualizarr/tests/test_readers/test_zarr.py | 61 +++++++++++++++----- 4 files changed, 77 insertions(+), 28 deletions(-) diff --git a/ci/min-deps.yml b/ci/min-deps.yml index 344a4595..dd52d38b 100644 --- a/ci/min-deps.yml +++ b/ci/min-deps.yml @@ -13,6 
+13,7 @@ dependencies: - ujson - universal_pathlib # Testing + - dask - codecov - pre-commit - mypy diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index cc970fb2..666c5b60 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -84,7 +84,8 @@ def validate_and_normalize_path_to_uri(path: str, fs_root: str | None = None) -> return urlunparse(components) elif any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES): - if not PosixPath(path).suffix: + # this feels fragile, is there a better way to ID a Zarr + if not PosixPath(path).suffix and "zarr" not in path: raise ValueError( f"entries in the manifest must be paths to files, but this path has no file suffix: {path}" ) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 3a131c9a..8a04831a 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -15,7 +15,7 @@ open_loadable_vars_and_indexes, separate_coords, ) -from virtualizarr.utils import check_for_collisions +from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions from virtualizarr.zarr import ZArray if TYPE_CHECKING: @@ -27,7 +27,7 @@ class ZarrVirtualBackend(VirtualBackend): @staticmethod def open_virtual_dataset( - path: str, + filepath: str, group: str | None = None, drop_variables: Iterable[str] | None = None, loadable_variables: Iterable[str] | None = None, @@ -56,7 +56,7 @@ def open_virtual_dataset( ) return virtual_dataset_from_zarr_group( - filepath=path, + filepath=filepath, group=group, drop_variables=drop_variables, loadable_variables=loadable_variables, @@ -153,6 +153,13 @@ def virtual_dataset_from_zarr_group( ) -> Dataset: import zarr + # filepath = validate_and_normalize_path_to_uri(filepath, fs_root=Path.cwd().as_uri()) + # This currently fails: *** TypeError: Filesystem needs to support async operations. + + filepath = _FsspecFSFromFilepath( + filepath=filepath, reader_options=reader_options + ).filepath + zg = zarr.open_group(filepath, mode="r") zarr_arrays = [val for val in zg.keys()] @@ -220,7 +227,9 @@ async def chunk_exists(zarr_group: zarr.core.group, chunk_key: PosixPath) -> boo return await zarr_group.store.exists(chunk_key) -async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str, zarr_version: int) -> dict: +async def get_chunk_paths( + zarr_group: zarr.core.group, array_name: str, zarr_version: int +) -> dict: chunk_paths = {} # Is there a way to call `zarr_group.store.list()` per array? @@ -230,15 +239,16 @@ async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str, zarr_ver and item.startswith(array_name) and await chunk_exists(zarr_group=zarr_group, chunk_key=item) ): - if zarr_version == 2: # split on array name + trailing slash chunk_key = item.split(array_name + "/")[-1] elif zarr_version == 3: # In v3 we remove the /c/ 'chunks' part of the key and # replace trailing slashes with '.' 
to conform to ChunkManifest validation - chunk_key = item.split(array_name + "/")[-1].split('c/')[-1].replace('/','.') - + chunk_key = ( + item.split(array_name + "/")[-1].split("c/")[-1].replace("/", ".") + ) + else: raise NotImplementedError(f"{zarr_version} not 2 or 3.") chunk_paths[chunk_key] = { @@ -254,10 +264,16 @@ async def get_chunk_paths(zarr_group: zarr.core.group, array_name: str, zarr_ver return chunk_paths -def construct_chunk_key_mapping(zarr_group: zarr.core.group, array_name: str, zarr_version: int) -> dict: +def construct_chunk_key_mapping( + zarr_group: zarr.core.group, array_name: str, zarr_version: int +) -> dict: import asyncio - return asyncio.run(get_chunk_paths(zarr_group=zarr_group, array_name=array_name, zarr_version=zarr_version)) + return asyncio.run( + get_chunk_paths( + zarr_group=zarr_group, array_name=array_name, zarr_version=zarr_version + ) + ) def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): @@ -265,8 +281,6 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): attrs = zarr_array.metadata.attributes - - if zarr_array.metadata.zarr_format == 2: array_zarray = _parse_zarr_v2_metadata(zarr_array=zarr_array) array_dims = attrs["_ARRAY_DIMENSIONS"] @@ -277,7 +291,9 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): else: raise NotImplementedError("Zarr format is not recognized as v2 or v3.") - array_chunk_sizes = construct_chunk_key_mapping(zarr_group, array_name=var_name, zarr_version=zarr_array.metadata.zarr_format) + array_chunk_sizes = construct_chunk_key_mapping( + zarr_group, array_name=var_name, zarr_version=zarr_array.metadata.zarr_format + ) array_chunkmanifest = ChunkManifest(array_chunk_sizes) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 4bf9253f..d4d1600d 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -45,6 +45,8 @@ def test_virtual_dataset_from_zarr_group(self, zarr_store): zg = zarr.open_group(zarr_store) vds = open_virtual_dataset(filepath=zarr_store, indexes={}) zg_metadata_dict = zg.metadata.to_dict() + zarr_format = zg_metadata_dict["zarr_format"] + non_var_arrays = ["time", "lat", "lon"] # check dims and coords are present assert set(vds.coords) == set(non_var_arrays) @@ -61,9 +63,10 @@ def test_virtual_dataset_from_zarr_group(self, zarr_store): # check ZArray values arrays = [val for val in zg.keys()] - zarray_checks = [ + + zarr_attrs = [ "shape", - # "chunks", + "chunks", "dtype", "order", "compressor", @@ -71,17 +74,45 @@ def test_virtual_dataset_from_zarr_group(self, zarr_store): "zarr_format", "dtype", ] + for array in arrays: - for attr in zarray_checks: - import ipdb; ipdb.set_trace() - - # for v3: - # schema is diff for - # chunks: zg_metadata_dict["consolidated_metadata"]["metadata"][array]['chunk_grid']['configuration']['chunk_shape'] - # - - - assert ( - getattr(vds[array].data.zarray, attr) - == zg_metadata_dict["consolidated_metadata"]["metadata"][array][attr] - ) + for attr in zarr_attrs: + vds_attr = getattr(vds[array].data.zarray, attr) + + # Edge cases where v2 and v3 attr keys differ: order, compressor, filters, dtype & chunks + if zarr_format == 3: + if "order" in attr: + # In zarr v3, it seems like order was replaced with the transpose codec. 
+ # skip check + zarr_metadata_attr = vds_attr + + elif "compressor" in attr: + zarr_metadata_attr = vds_attr + + elif "filters" in attr: + zarr_metadata_attr = vds_attr + + elif "chunks" in attr: + # chunks vs chunk_grid.configuration.chunk_shape + zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ + "metadata" + ][array]["chunk_grid"]["configuration"]["chunk_shape"] + + elif "dtype" in attr: + # dtype vs datatype + zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ + "metadata" + ][array]["data_type"].to_numpy() + + else: + # follows v2 dict lookup + zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ + "metadata" + ][array][attr] + + else: + zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ + "metadata" + ][array][attr] + + assert vds_attr == zarr_metadata_attr From 05d405009a1edcff1f6d83a065de9d46dde8737c Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 11 Dec 2024 09:56:41 -0700 Subject: [PATCH 40/98] cleanup test_zarr reader test --- virtualizarr/tests/test_readers/test_zarr.py | 96 ++++++++++---------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index d4d1600d..ce45d4fe 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -39,9 +39,7 @@ def test_drop_variables(self, zarr_store, drop_variables=["air"]): ) assert len(vds.data_vars) == 0 - def test_virtual_dataset_from_zarr_group(self, zarr_store): - # check that loadable variables works - + def test_virtual_dataset_zarr_attrs(self, zarr_store): zg = zarr.open_group(zarr_store) vds = open_virtual_dataset(filepath=zarr_store, indexes={}) zg_metadata_dict = zg.metadata.to_dict() @@ -64,55 +62,59 @@ def test_virtual_dataset_from_zarr_group(self, zarr_store): # check ZArray values arrays = [val for val in zg.keys()] - zarr_attrs = [ + shared_v2_v3_attrs = [ "shape", - "chunks", - "dtype", - "order", - "compressor", - "filters", "zarr_format", - "dtype", ] + v2_attrs = ["chunks", "dtype", "order", "compressor", "filters"] - for array in arrays: - for attr in zarr_attrs: - vds_attr = getattr(vds[array].data.zarray, attr) - - # Edge cases where v2 and v3 attr keys differ: order, compressor, filters, dtype & chunks - if zarr_format == 3: - if "order" in attr: - # In zarr v3, it seems like order was replaced with the transpose codec. 
- # skip check - zarr_metadata_attr = vds_attr - - elif "compressor" in attr: - zarr_metadata_attr = vds_attr - - elif "filters" in attr: - zarr_metadata_attr = vds_attr - - elif "chunks" in attr: - # chunks vs chunk_grid.configuration.chunk_shape - zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ - "metadata" - ][array]["chunk_grid"]["configuration"]["chunk_shape"] - - elif "dtype" in attr: - # dtype vs datatype - zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ - "metadata" - ][array]["data_type"].to_numpy() - - else: - # follows v2 dict lookup - zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ - "metadata" - ][array][attr] - - else: + def _validate_v2(attrs: list[str]): + for array in arrays: + for attr in attrs: + vds_attr = getattr(vds[array].data.zarray, attr) zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ "metadata" ][array][attr] + assert vds_attr == zarr_metadata_attr + + def _validate_v3(attrs: list[str]): - assert vds_attr == zarr_metadata_attr + # check v2, v3 shared attrs + for array in arrays: + for attr in attrs: + zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ + "metadata" + ][array][attr] + vds_attr = getattr(vds[array].data.zarray, attr) + assert vds_attr == zarr_metadata_attr + + # Cases where v2 and v3 attr keys differ: order, compressor, filters, dtype & chunks + + # chunks vs chunk_grid.configuration.chunk_shape + assert ( + getattr(vds[array].data.zarray, "chunks") + == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ + "chunk_grid" + ]["configuration"]["chunk_shape"] + ) + + # dtype vs datatype + assert ( + getattr(vds[array].data.zarray, "dtype") + == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ + "data_type" + ].to_numpy() + ) + + # order: In zarr v3, it seems like order was replaced with the transpose codec. 
+ # compressor: removed in v3 and built into codecs + # filters: removed in v3 and built into codecs + + if zarr_format == 2: + _validate_v2(shared_v2_v3_attrs + v2_attrs) + + elif zarr_format == 3: + _validate_v3(shared_v2_v3_attrs) + + else: + raise NotImplementedError(f'Zarr format {zarr_format} not in [2,3]') From f40ba28e00565916bd14ad835f786b47f5f7be79 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:56:49 +0000 Subject: [PATCH 41/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/tests/test_readers/test_zarr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index ce45d4fe..15e92801 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -78,7 +78,6 @@ def _validate_v2(attrs: list[str]): assert vds_attr == zarr_metadata_attr def _validate_v3(attrs: list[str]): - # check v2, v3 shared attrs for array in arrays: for attr in attrs: @@ -117,4 +116,4 @@ def _validate_v3(attrs: list[str]): _validate_v3(shared_v2_v3_attrs) else: - raise NotImplementedError(f'Zarr format {zarr_format} not in [2,3]') + raise NotImplementedError(f"Zarr format {zarr_format} not in [2,3]") From b5fb8027af4195ac5f3ed8c7f4785d55f32c2d82 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 12 Dec 2024 15:02:12 -0700 Subject: [PATCH 42/98] cleanup after zarr-python issue report --- conftest.py | 16 +++++ virtualizarr/readers/zarr.py | 12 ++-- virtualizarr/tests/test_integration.py | 61 ++++++++------------ virtualizarr/tests/test_readers/conftest.py | 16 ----- virtualizarr/tests/test_readers/test_zarr.py | 6 +- virtualizarr/zarr.py | 9 ++- 6 files changed, 55 insertions(+), 65 deletions(-) diff --git a/conftest.py b/conftest.py index e86b9244..22265b32 100644 --- a/conftest.py +++ b/conftest.py @@ -24,6 +24,22 @@ def pytest_runtest_setup(item): ) +def _xarray_subset(): + ds = xr.tutorial.open_dataset("air_temperature", chunks={}) + return ds.isel(time=slice(0, 10), lat=slice(0, 9), lon=slice(0, 18)).chunk( + {"time": 5} + ) + + +@pytest.fixture(params=[2, 3]) +def zarr_store(tmpdir, request): + ds = _xarray_subset() + filepath = f"{tmpdir}/air.zarr" + ds.to_zarr(filepath, zarr_format=request.param) + ds.close() + return filepath + + @pytest.fixture def netcdf4_file(tmpdir): # Set up example xarray dataset diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 8a04831a..c08cbf43 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -9,6 +9,7 @@ from xarray import Dataset, Index, Variable from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri from virtualizarr.readers.common import ( VirtualBackend, construct_virtual_dataset, @@ -153,14 +154,13 @@ def virtual_dataset_from_zarr_group( ) -> Dataset: import zarr - # filepath = validate_and_normalize_path_to_uri(filepath, fs_root=Path.cwd().as_uri()) + vfpath = validate_and_normalize_path_to_uri(filepath, fs_root=Path.cwd().as_uri()) # This currently fails: *** TypeError: Filesystem needs to support async operations. 
+ # https://github.com/zarr-developers/zarr-python/issues/2554 - filepath = _FsspecFSFromFilepath( - filepath=filepath, reader_options=reader_options - ).filepath + fss = _FsspecFSFromFilepath(filepath=vfpath, reader_options=reader_options) - zg = zarr.open_group(filepath, mode="r") + zg = zarr.open_group(fss.get_mapper(), mode="r") zarr_arrays = [val for val in zg.keys()] @@ -219,7 +219,6 @@ def virtual_dataset_from_zarr_group( async def get_chunk_size(zarr_group: zarr.core.group, chunk_key: PosixPath) -> int: # async get chunk size of a chunk key - return await zarr_group.store.getsize(chunk_key) @@ -251,6 +250,7 @@ async def get_chunk_paths( else: raise NotImplementedError(f"{zarr_version} not 2 or 3.") + chunk_paths[chunk_key] = { "path": ( zarr_group.store.root / item diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 91045598..2d0c9d06 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -10,7 +10,7 @@ from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.readers import HDF5VirtualBackend from virtualizarr.readers.hdf import HDFVirtualBackend -from virtualizarr.tests import requires_kerchunk +from virtualizarr.tests import network, requires_kerchunk, requires_zarrV3 from virtualizarr.translators.kerchunk import ( dataset_from_kerchunk_refs, find_var_names, @@ -94,41 +94,30 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] -# @pytest.mark.parametrize( -# "input_params", -# [inputs for inputs in ZARR_V2_PARAMS], -# ) -# def test_zarrV2_roundtrip(zarr_v2_store, input_params): -# ds = open_virtual_dataset( -# zarr_v2_store, -# loadable_variables=input_params.loadable_variables, -# drop_variables=input_params.drop_variables, -# indexes={}, -# ) - -# # THIS FAILS! TypeError: np.float32(nan) is not JSON serializable -# # Question: How do we handle this fill value: fill_value=np.float32(nan) -# ds_refs = ds.virtualize.to_kerchunk(format="dict") - -# # tmp fix if you want to override the fill vals! -# ds.lat.data.zarray.fill_value = float("nan") -# ds.time.data.zarray.fill_value = float("nan") -# ds.lon.data.zarray.fill_value = float("nan") - -# # Use dataset_from_kerchunk_refs to reconstruct the dataset -# roundtrip = dataset_from_kerchunk_refs(ds_refs) - -# # Assert equal to original dataset -# xrt.assert_equal(roundtrip, ds) - -# # assert vds has: -# # loadable vars are np arrays? -# # drop vars are not present -# # virtual vars are manifest arrays, not loaded arrays - -# # Do we have a good way in XRT to compare virtual datasets to xarray datasets? assert_duckarray_allclose? or just roundtrip it. 
-# # from xarray.testing import assert_duckarray_allclose -# # xrt.assert_allclose(ds, vds) +@requires_zarrV3 +@network +@pytest.mark.parametrize( + "zarr_store", + [ + pytest.param(2, id="Zarr V2"), + pytest.param( + 3, + id="Zarr V3", + ), + ], + indirect=True, +) +def test_zarrV2_roundtrip(zarr_store): + ds = open_virtual_dataset( + zarr_store, + indexes={}, + ) + ds_refs = ds.virtualize.to_kerchunk(format="dict") + + roundtrip = dataset_from_kerchunk_refs(ds_refs) + + # Assert equal to original dataset + xrt.assert_equal(roundtrip, ds) @requires_kerchunk diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 03c90bd2..f96447db 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -15,22 +15,6 @@ warnings.warn("hdf5plugin is required for HDF reader") -def _xarray_subset(): - ds = xr.tutorial.open_dataset("air_temperature", chunks={}) - return ds.isel(time=slice(0, 10), lat=slice(0, 9), lon=slice(0, 18)).chunk( - {"time": 5} - ) - - -@pytest.fixture(params=[2, 3]) -def zarr_store(tmpdir, request): - ds = _xarray_subset() - filepath = f"{tmpdir}/air.zarr" - ds.to_zarr(filepath, zarr_format=request.param) - ds.close() - return filepath - - @pytest.fixture def empty_chunks_hdf5_file(tmpdir): ds = xr.Dataset({"data": []}) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index ce45d4fe..7b4f5fbf 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -16,9 +16,6 @@ pytest.param( 3, id="Zarr V3", - # marks=pytest.mark.xfail( - # reason="Need to parse codecs into filters/compressors" - # ), ), ], indirect=True, @@ -78,7 +75,6 @@ def _validate_v2(attrs: list[str]): assert vds_attr == zarr_metadata_attr def _validate_v3(attrs: list[str]): - # check v2, v3 shared attrs for array in arrays: for attr in attrs: @@ -117,4 +113,4 @@ def _validate_v3(attrs: list[str]): _validate_v3(shared_v2_v3_attrs) else: - raise NotImplementedError(f'Zarr format {zarr_format} not in [2,3]') + raise NotImplementedError(f"Zarr format {zarr_format} not in [2,3]") diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index e339a3f4..2956143a 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -61,8 +61,12 @@ def __post_init__(self) -> None: # Convert dtype string to numpy.dtype self.dtype = np.dtype(self.dtype) - if self.fill_value is None: - self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) + # Question: Why is this applied only to fill values of None? 
+ # It was skipping np dtypes such np.float32(nan) and np.int16(0) + # which causes serialization issues when writing to Kerchunk + + # if self.fill_value is None: + self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) @property def codec(self) -> Codec: @@ -106,6 +110,7 @@ def to_kerchunk_json(self) -> str: zarray_dict = self.dict() if zarray_dict["fill_value"] is np.nan: zarray_dict["fill_value"] = None + return ujson.dumps(zarray_dict) # ZArray.dict seems to shadow "dict", so we need the type ignore in From 690ffee1dcb8f17ba0baa00aa7ced0fe6aaccb24 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 16 Dec 2024 12:45:15 -0700 Subject: [PATCH 43/98] temp disabled validate_and_normalize_path_to_uri due to issue in zarr-python v3: https://github.com/zarr-developers/zarr-python/issues/2554 --- virtualizarr/manifests/manifest.py | 3 ++- virtualizarr/readers/zarr.py | 11 ++++++----- virtualizarr/tests/test_integration.py | 5 ++++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 666c5b60..489af740 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -49,7 +49,8 @@ def with_validation( # note: we can't just use `__init__` or a dataclass' `__post_init__` because we need `fs_root` to be an optional kwarg - path = validate_and_normalize_path_to_uri(path, fs_root=fs_root) + # commenting out for now: https://github.com/zarr-developers/zarr-python/issues/2554 + # path = validate_and_normalize_path_to_uri(path, fs_root=fs_root) validate_byte_range(offset=offset, length=length) return ChunkEntry(path=path, offset=offset, length=length) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index c08cbf43..02e739a4 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -9,14 +9,13 @@ from xarray import Dataset, Index, Variable from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri from virtualizarr.readers.common import ( VirtualBackend, construct_virtual_dataset, open_loadable_vars_and_indexes, separate_coords, ) -from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions +from virtualizarr.utils import check_for_collisions from virtualizarr.zarr import ZArray if TYPE_CHECKING: @@ -154,13 +153,15 @@ def virtual_dataset_from_zarr_group( ) -> Dataset: import zarr - vfpath = validate_and_normalize_path_to_uri(filepath, fs_root=Path.cwd().as_uri()) + # vfpath = validate_and_normalize_path_to_uri(filepath, fs_root=Path.cwd().as_uri()) # This currently fails: *** TypeError: Filesystem needs to support async operations. 
+ # https://github.com/zarr-developers/zarr-python/issues/2554

- fss = _FsspecFSFromFilepath(filepath=vfpath, reader_options=reader_options)
+ # import ipdb; ipdb.set_trace()

- zg = zarr.open_group(fss.get_mapper(), mode="r")
+ zg = zarr.open_group(
+ filepath, storage_options=reader_options.get("storage_options"), mode="r"
+ )

 zarr_arrays = [val for val in zg.keys()]
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 2d0c9d06..cd31b5d7 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -107,11 +107,14 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
 ],
 indirect=True,
 )
-def test_zarrV2_roundtrip(zarr_store):
+def test_zarr_roundtrip(zarr_store):
 ds = open_virtual_dataset(
 zarr_store,
 indexes={},
 )
+ import ipdb
+
+ ipdb.set_trace()
 ds_refs = ds.virtualize.to_kerchunk(format="dict")

 roundtrip = dataset_from_kerchunk_refs(ds_refs)
From 31a1b94cefd0ed234394e2e1be210d87627ce2e6 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Mon, 16 Dec 2024 15:11:55 -0700
Subject: [PATCH 44/98] marked zarr integration test skipped b/c of zarr-v3 and kerchunk incompatibility

---
 virtualizarr/tests/test_integration.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 3061b155..62aef376 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -89,14 +89,12 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(

 @requires_zarrV3
 @network
+@pytest.mark.skip(reason="Kerchunk & zarr-python v3 incompatibility")
 @pytest.mark.parametrize(
 "zarr_store",
 [
 pytest.param(2, id="Zarr V2"),
- pytest.param(
- 3,
- id="Zarr V3",
- ),
+ pytest.param(3, id="Zarr V3"),
 ],
 indirect=True,
 )
 def test_zarr_roundtrip(zarr_store):
 ds = open_virtual_dataset(
 zarr_store,
 indexes={},
 )
- import ipdb
- ipdb.set_trace()
 ds_refs = ds.virtualize.to_kerchunk(format="dict")

 roundtrip = dataset_from_kerchunk_refs(ds_refs)

- # Assert equal to original dataset
+ # This won't work right now b/c of the Kerchunk zarr-v3 incompatibility
+ # roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
+
+ def add_prefix(file_path: str) -> str:
+ return "file://" + file_path
+
+ for array in ["lat", "lon", "time", "air"]:
+ # V2: What should the behavior here be? Should the RT dataset have _ARRAY_DIMS?
+ ds[array].attrs.pop("_ARRAY_DIMENSIONS", None) + + # temp workaround b/c of the zarr-python-v3 filepath issue: https://github.com/zarr-developers/zarr-python/issues/2554 + roundtrip[array].data = roundtrip[array].data.rename_paths(add_prefix) + + # Assert equal to original dataset - ManifestArrays xrt.assert_equal(roundtrip, ds) From 795c42864a604915209eed52d29e8690eabfd38d Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 16 Dec 2024 19:23:25 -0700 Subject: [PATCH 45/98] fixes some async behavior, reading from s3 seems to work --- virtualizarr/readers/zarr.py | 78 ++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 02e739a4..7182dde5 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -1,14 +1,18 @@ from __future__ import annotations import json -from pathlib import Path, PosixPath +from pathlib import Path from typing import TYPE_CHECKING, Iterable, Mapping, Optional import numcodecs import numpy as np +import zarr from xarray import Dataset, Index, Variable from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests.manifest import ( + validate_and_normalize_path_to_uri, +) from virtualizarr.readers.common import ( VirtualBackend, construct_virtual_dataset, @@ -153,12 +157,10 @@ def virtual_dataset_from_zarr_group( ) -> Dataset: import zarr - # vfpath = validate_and_normalize_path_to_uri(filepath, fs_root=Path.cwd().as_uri()) + filepath = validate_and_normalize_path_to_uri(filepath, fs_root=Path.cwd().as_uri()) # This currently fails: *** TypeError: Filesystem needs to support async operations. # https://github.com/zarr-developers/zarr-python/issues/2554 - # import ipdb; ipdb.set_trace() - zg = zarr.open_group( filepath, storage_options=reader_options.get("storage_options"), mode="r" ) @@ -175,9 +177,10 @@ def virtual_dataset_from_zarr_group( virtual_vars = list( set(zarr_arrays) - set(loadable_variables) - set(drop_variables) ) - virtual_variable_mapping = { - f"{var}": construct_virtual_array(zarr_group=zg, var_name=var) + f"{var}": construct_virtual_array( + zarr_group=zg, var_name=var, filepath=filepath + ) for var in virtual_vars } @@ -227,22 +230,37 @@ async def chunk_exists(zarr_group: zarr.core.group, chunk_key: PosixPath) -> boo return await zarr_group.store.exists(chunk_key) +async def list_store_keys(zarr_group: zarr.core.group) -> list[str]: + return [item async for item in zarr_group.store.list()] + + async def get_chunk_paths( - zarr_group: zarr.core.group, array_name: str, zarr_version: int + zarr_group: zarr.core.group, + array_name: str, + store_path: str, # should this be UPath or? ) -> dict: + # use UPath to for combining store path + chunk key + from upath import UPath + + store_path = UPath(store_path) + chunk_paths = {} - # Is there a way to call `zarr_group.store.list()` per array? - async for item in zarr_group.store.list(): + # can we call list() on an array? + store_keys = zarr.core.sync.sync(list_store_keys(zarr_group)) + + for item in store_keys: + # should we move these filters/checks into list_store_keys? 
if ( not item.endswith((".zarray", ".zattrs", ".zgroup", ".zmetadata", ".json")) and item.startswith(array_name) - and await chunk_exists(zarr_group=zarr_group, chunk_key=item) + and zarr.core.sync.sync(chunk_exists(zarr_group=zarr_group, chunk_key=item)) ): - if zarr_version == 2: + if zarr_group.metadata.zarr_format == 2: # split on array name + trailing slash chunk_key = item.split(array_name + "/")[-1] - elif zarr_version == 3: + + elif zarr_group.metadata.zarr_format == 3: # In v3 we remove the /c/ 'chunks' part of the key and # replace trailing slashes with '.' to conform to ChunkManifest validation chunk_key = ( @@ -250,14 +268,19 @@ async def get_chunk_paths( ) else: - raise NotImplementedError(f"{zarr_version} not 2 or 3.") + raise NotImplementedError( + f"{zarr_group.metadata.zarr_format} not 2 or 3." + ) + + # Can we ask Zarr-python for the path and protocol? + # zarr_group.store.path chunk_paths[chunk_key] = { "path": ( - zarr_group.store.root / item - ).as_uri(), # as_uri to comply with https://github.com/zarr-developers/VirtualiZarr/pull/243 + (store_path / item).as_uri() + ), # as_uri to comply with https://github.com/zarr-developers/VirtualiZarr/pull/243 "offset": 0, - "length": await get_chunk_size(zarr_group, item), + "length": zarr.core.sync.sync(get_chunk_size(zarr_group, item)), } # This won't work for sharded stores: https://github.com/zarr-developers/VirtualiZarr/pull/271#discussion_r1844487578 @@ -265,19 +288,9 @@ async def get_chunk_paths( return chunk_paths -def construct_chunk_key_mapping( - zarr_group: zarr.core.group, array_name: str, zarr_version: int -) -> dict: - import asyncio - - return asyncio.run( - get_chunk_paths( - zarr_group=zarr_group, array_name=array_name, zarr_version=zarr_version - ) - ) - - -def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): +def construct_virtual_array( + zarr_group: zarr.core.group.Group, var_name: str, filepath: str +): zarr_array = zarr_group[var_name] attrs = zarr_array.metadata.attributes @@ -285,6 +298,7 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): if zarr_array.metadata.zarr_format == 2: array_zarray = _parse_zarr_v2_metadata(zarr_array=zarr_array) array_dims = attrs["_ARRAY_DIMENSIONS"] + elif zarr_array.metadata.zarr_format == 3: array_zarray = _parse_zarr_v3_metadata(zarr_array=zarr_array) array_dims = zarr_array.metadata.dimension_names @@ -292,8 +306,10 @@ def construct_virtual_array(zarr_group: zarr.core.group.Group, var_name: str): else: raise NotImplementedError("Zarr format is not recognized as v2 or v3.") - array_chunk_sizes = construct_chunk_key_mapping( - zarr_group, array_name=var_name, zarr_version=zarr_array.metadata.zarr_format + import asyncio + + array_chunk_sizes = asyncio.run( + get_chunk_paths(zarr_group=zarr_group, array_name=var_name, store_path=filepath) ) array_chunkmanifest = ChunkManifest(array_chunk_sizes) From c0004c67c383c871f1281a2b9fb3389bd3d5f9e4 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 16 Dec 2024 20:34:53 -0700 Subject: [PATCH 46/98] lint + uri_fmt --- .github/workflows/upstream.yml | 2 +- virtualizarr/manifests/manifest.py | 3 +-- virtualizarr/zarr.py | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml index 20a20c95..9140896b 100644 --- a/.github/workflows/upstream.yml +++ b/.github/workflows/upstream.yml @@ -48,7 +48,7 @@ jobs: - name: Running Tests run: | - python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose 
+ python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.4 diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 489af740..666c5b60 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -49,8 +49,7 @@ def with_validation( # note: we can't just use `__init__` or a dataclass' `__post_init__` because we need `fs_root` to be an optional kwarg - # commenting out for now: https://github.com/zarr-developers/zarr-python/issues/2554 - # path = validate_and_normalize_path_to_uri(path, fs_root=fs_root) + path = validate_and_normalize_path_to_uri(path, fs_root=fs_root) validate_byte_range(offset=offset, length=length) return ChunkEntry(path=path, offset=offset, length=length) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 2956143a..3d62d002 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -110,7 +110,6 @@ def to_kerchunk_json(self) -> str: zarray_dict = self.dict() if zarray_dict["fill_value"] is np.nan: zarray_dict["fill_value"] = None - return ujson.dumps(zarray_dict) # ZArray.dict seems to shadow "dict", so we need the type ignore in From 60b8912d654a1d36dab29ba922e73c1fe7cd06fd Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 16 Dec 2024 20:37:42 -0700 Subject: [PATCH 47/98] adds to releases.rst --- docs/releases.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/releases.rst b/docs/releases.rst index 79b8e261..27093d0c 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -9,6 +9,9 @@ v1.2.1 (unreleased) New Features ~~~~~~~~~~~~ +- Adds a Zarr reader to ``open_virtual_dataset``, which allows opening Zarr V2 and V3 stores as virtual datasets. + (:pull:`#271`) By `Raphael Hagen `_. + Breaking changes ~~~~~~~~~~~~~~~~ @@ -48,6 +51,7 @@ as well as many other bugfixes and documentation improvements. New Features ~~~~~~~~~~~~ + - Add a ``virtual_backend_kwargs`` keyword argument to file readers and to ``open_virtual_dataset``, to allow reader-specific options to be passed down. (:pull:`315`) By `Tom Nicholas `_. - Added append functionality to `to_icechunk` (:pull:`272`) By `Aimee Barciauskas `_. 
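
For reference, the release note added above corresponds to usage along these lines (the store path and variable names are illustrative, mirroring the `air_temperature` fixture used in the tests):

```python
from virtualizarr import open_virtual_dataset

# open an existing Zarr store (v2 or v3) without reading chunk data
vds = open_virtual_dataset("air.zarr", loadable_variables=["time"], indexes={})

# virtual variables wrap ManifestArrays pointing at the store's chunks,
# which can then be serialized, e.g. to Kerchunk references
refs = vds.virtualize.to_kerchunk(format="dict")
```
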
From 824099779f7a3de909c38bdcca7f6a448b82dec2 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Mon, 16 Dec 2024 20:49:12 -0700
Subject: [PATCH 48/98] nit

---
 ci/min-deps.yml | 1 -
 ci/upstream.yml | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/min-deps.yml b/ci/min-deps.yml
index dd52d38b..344a4595 100644
--- a/ci/min-deps.yml
+++ b/ci/min-deps.yml
@@ -13,7 +13,6 @@ dependencies:
   - ujson
   - universal_pathlib
   # Testing
-  - dask
   - codecov
   - pre-commit
   - mypy

diff --git a/ci/upstream.yml b/ci/upstream.yml
index feee5044..a6e86755 100644
--- a/ci/upstream.yml
+++ b/ci/upstream.yml
@@ -26,6 +26,7 @@ dependencies:
   - pytest
   - pooch
   - fsspec
+  - dask
   - pip
   - pip:
     - icechunk>=0.1.0a7 # Installs zarr v3 as dependency

From 816e6962bfec67f82191e74771a352a2111331d5 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 17 Dec 2024 10:27:12 -0700
Subject: [PATCH 49/98] cleanup, comments and nits

---
 docs/releases.rst                  |  1 -
 virtualizarr/manifests/manifest.py |  2 +-
 virtualizarr/readers/common.py     |  2 +-
 virtualizarr/readers/zarr.py       | 53 +++++++++++++-----------------
 4 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/docs/releases.rst b/docs/releases.rst
index 27093d0c..131e6efc 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -51,7 +51,6 @@ as well as many other bugfixes and documentation improvements.
 New Features
 ~~~~~~~~~~~~

-
 - Add a ``virtual_backend_kwargs`` keyword argument to file readers and to ``open_virtual_dataset``, to allow reader-specific options to be passed down. (:pull:`315`) By `Tom Nicholas `_.
 - Added append functionality to `to_icechunk` (:pull:`272`) By `Aimee Barciauskas `_.

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index 666c5b60..77282095 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -84,7 +84,7 @@ def validate_and_normalize_path_to_uri(path: str, fs_root: str | None = None) ->
         return urlunparse(components)
     elif any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES):
-        # this feels fragile, is there a better way to ID a Zarr
+        # Question: This feels fragile, is there a better way to ID a Zarr
         if not PosixPath(path).suffix and "zarr" not in path:
             raise ValueError(
                 f"entries in the manifest must be paths to files, but this path has no file suffix: {path}"

diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py
index 1174be7b..5065373e 100644
--- a/virtualizarr/readers/common.py
+++ b/virtualizarr/readers/common.py
@@ -47,7 +47,7 @@ def open_loadable_vars_and_indexes(
     # We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through.
     fpath = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options)

-    # Update the xarray open_dataset kwargs if Zarr
+    # Updates the Xarray open_dataset kwargs if Zarr
     if fpath.filepath.suffix == ".zarr":
         engine = "zarr"

diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py
index 7182dde5..e710d1af 100644
--- a/virtualizarr/readers/zarr.py
+++ b/virtualizarr/readers/zarr.py
@@ -6,7 +6,6 @@

 import numcodecs
 import numpy as np
-import zarr
 from xarray import Dataset, Index, Variable

 from virtualizarr.manifests import ChunkManifest, ManifestArray
@@ -25,6 +24,7 @@

 if TYPE_CHECKING:
     from pathlib import PosixPath

+    import upath
     import zarr

@@ -40,9 +40,7 @@ def open_virtual_dataset(
         virtual_backend_kwargs: Optional[dict] = None,
         reader_options: Optional[dict] = None,
     ) -> Dataset:
-        """
-        Create a virtual dataset from an existing Zarr store
-        """
+        # Question: Is this something we want to pass through?
         if virtual_backend_kwargs:
             raise NotImplementedError(
                 "Zarr reader does not understand any virtual_backend_kwargs"
             )
@@ -156,11 +154,16 @@ def virtual_dataset_from_zarr_group(
     reader_options: Optional[dict] = None,
 ) -> Dataset:
     import zarr
+    from upath import UPath

     filepath = validate_and_normalize_path_to_uri(filepath, fs_root=Path.cwd().as_uri())

-    # This currently fails: *** TypeError: Filesystem needs to support async operations.
+    # This currently fails for local filepaths (i.e. tests):
+    # *** TypeError: Filesystem needs to support async operations.
     # https://github.com/zarr-developers/zarr-python/issues/2554

+    # use UPath for combining store path + chunk key when building chunk manifests
+    store_path = UPath(filepath)
+
     zg = zarr.open_group(
         filepath, storage_options=reader_options.get("storage_options"), mode="r"
     )
@@ -173,18 +176,17 @@ def virtual_dataset_from_zarr_group(
             f"Some loadable variables specified are not present in this zarr store: {missing_vars}"
         )

-    # virtual variables are available variables minus drop variables & loadable variables
     virtual_vars = list(
         set(zarr_arrays) - set(loadable_variables) - set(drop_variables)
     )
+
     virtual_variable_mapping = {
         f"{var}": construct_virtual_array(
-            zarr_group=zg, var_name=var, filepath=filepath
+            zarr_group=zg, var_name=var, filepath=store_path
         )
         for var in virtual_vars
     }

-    # list comp hell
     coord_names = list(
         set(
             item
@@ -195,12 +197,8 @@ def virtual_dataset_from_zarr_group(
         )
     )

-    # 4 Get the loadable_variables by just using xr.open_zarr on the same store (should use drop_variables to avoid handling the virtual variables that we already have).
-    # We want to drop 'drop_variables' but also virtual variables since we already **manifested** them.
     non_loadable_variables = list(set(virtual_vars).union(set(drop_variables)))

-    # pre made func for this?! Woohoo
     loadable_vars, indexes = open_loadable_vars_and_indexes(
         filepath,
         loadable_variables=loadable_variables,
@@ -211,7 +209,6 @@ def virtual_dataset_from_zarr_group(
         decode_times=decode_times,
     )

-    # 6 Merge all the variables into one xr.Dataset and return it.
     return construct_virtual_dataset(
         virtual_vars=virtual_variable_mapping,
         loadable_vars=loadable_vars,
         indexes=indexes,
         coord_names=coord_names,
         attrs=zg.attrs.asdict(),
     )


 async def get_chunk_size(zarr_group: zarr.core.group, chunk_key: PosixPath) -> int:
-    # async get chunk size of a chunk key
+    # Use zarr-python's `getsize` method to get bytes per chunk
     return await zarr_group.store.getsize(chunk_key)


 async def chunk_exists(zarr_group: zarr.core.group, chunk_key: PosixPath) -> bool:
+    # calls zarr-python's `exists` to check for a chunk
     return await zarr_group.store.exists(chunk_key)


 async def list_store_keys(zarr_group: zarr.core.group) -> list[str]:
+    # Lists all keys in a store
     return [item async for item in zarr_group.store.list()]


 async def get_chunk_paths(
-    zarr_group: zarr.core.group,
-    array_name: str,
-    store_path: str,  # should this be UPath or?
+    zarr_group: zarr.core.group, array_name: str, store_path: upath.core.UPath
 ) -> dict:
-    # use UPath to for combining store path + chunk key
-    from upath import UPath
-
-    store_path = UPath(store_path)
-
     chunk_paths = {}

-    # can we call list() on an array?
+    # Can we call list() on an array instead of the entire store?
     store_keys = zarr.core.sync.sync(list_store_keys(zarr_group))

     for item in store_keys:
@@ -273,23 +265,21 @@ async def get_chunk_paths(
             )

             # Can we ask Zarr-python for the path and protocol?
-            # zarr_group.store.path
+            # This gives path, but no protocol: zarr_group.store.path

             chunk_paths[chunk_key] = {
-                "path": (
-                    (store_path / item).as_uri()
-                ),  # as_uri to comply with https://github.com/zarr-developers/VirtualiZarr/pull/243
+                "path": ((store_path / item).as_uri()),
                 "offset": 0,
                 "length": zarr.core.sync.sync(get_chunk_size(zarr_group, item)),
             }

-    # This won't work for sharded stores: https://github.com/zarr-developers/VirtualiZarr/pull/271#discussion_r1844487578
+    # Note: This won't work for sharded stores: https://github.com/zarr-developers/VirtualiZarr/pull/271#discussion_r1844487578

     return chunk_paths


 def construct_virtual_array(
-    zarr_group: zarr.core.group.Group, var_name: str, filepath: str
+    zarr_group: zarr.core.group.Group, var_name: str, store_path: upath.core.UPath
 ):
     zarr_array = zarr_group[var_name]

@@ -309,7 +299,9 @@ def construct_virtual_array(
     import asyncio

     array_chunk_sizes = asyncio.run(
-        get_chunk_paths(zarr_group=zarr_group, array_name=var_name, store_path=filepath)
+        get_chunk_paths(
+            zarr_group=zarr_group, array_name=var_name, store_path=store_path
+        )
     )

     array_chunkmanifest = ChunkManifest(array_chunk_sizes)
@@ -354,7 +346,6 @@ def _parse_zarr_v3_metadata(zarr_array: zarr.core.array.Array) -> ZArray:
     # Questions: What do we do with endian info?
     codecs = get_codecs(zarr_array)

-    # Q: Are these ever in codecs?
     compressor = getattr(codecs[0], "compressor", None)
     filters = getattr(codecs[0], "filters", None)
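The helpers in this patch drive an async zarr v3 store from synchronous code with one `zarr.core.sync.sync(...)` call per key, which serializes the store round-trips. A rough sketch of how the same lookups could instead be batched — assuming a store object with async `list` and `getsize` methods, as used above, and a hypothetical array name:

    import asyncio

    async def sizes_for_array(store, array_name: str) -> dict[str, int]:
        # Collect every key under the array, skipping metadata documents.
        keys = [
            k async for k in store.list()
            if k.startswith(array_name)
            and not k.endswith((".zarray", ".zattrs", ".zgroup", ".zmetadata", ".json"))
        ]
        # One concurrent getsize per chunk key instead of a sync() call per key.
        sizes = await asyncio.gather(*(store.getsize(k) for k in keys))
        return dict(zip(keys, sizes))

    # chunk_sizes = asyncio.run(sizes_for_array(zg.store, "air"))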
From 31aacf9024901351005502027ad6da2ea20dbbab Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 17 Dec 2024 12:43:22 -0700
Subject: [PATCH 50/98] progress on mypy

---
 virtualizarr/readers/common.py |  4 ++--
 virtualizarr/readers/zarr.py   | 28 +++++++++++++++++++++-------
 virtualizarr/utils.py          |  7 +++++--
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py
index 5065373e..553032b3 100644
--- a/virtualizarr/readers/common.py
+++ b/virtualizarr/readers/common.py
@@ -49,13 +49,13 @@ def open_loadable_vars_and_indexes(

     # Updates the Xarray open_dataset kwargs if Zarr
-    if fpath.filepath.suffix == ".zarr":
+    if fpath.filepath.suffix == ".zarr":  # type: ignore
         engine = "zarr"
         xr_input = fpath.filepath
     else:
         engine = None
-        xr_input = fpath.open_file()
+        xr_input = fpath.open_file()  # type: ignore

     ds = open_dataset(
         xr_input,  # type: ignore[arg-type]

diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py
index e710d1af..f11f0c27 100644
--- a/virtualizarr/readers/zarr.py
+++ b/virtualizarr/readers/zarr.py
@@ -77,6 +77,7 @@ def open_virtual_dataset(
         loadable_variables: Iterable[str] | None = None,
         decode_times: bool | None = None,
         indexes: Mapping[str, Index] | None = None,
+        virtual_backend_kwargs: Optional[dict] = None,
         reader_options: Optional[dict] = None,
     ) -> Dataset:
         """
@@ -84,7 +85,10 @@ def open_virtual_dataset(

         This is experimental - chunk manifests are not part of the Zarr v3 Spec.
         """
-
+        if virtual_backend_kwargs:
+            raise NotImplementedError(
+                "Zarr V3 Chunk Manifest reader does not understand any virtual_backend_kwargs"
+            )
         storepath = Path(filepath)

         if group:
@@ -164,12 +168,22 @@ def virtual_dataset_from_zarr_group(
     # use UPath for combining store path + chunk key when building chunk manifests
     store_path = UPath(filepath)

+    if reader_options is None:
+        reader_options = {}
+
     zg = zarr.open_group(
         filepath, storage_options=reader_options.get("storage_options"), mode="r"
     )

     zarr_arrays = [val for val in zg.keys()]

+    # mypy typing
+    if loadable_variables is None:
+        loadable_variables = set()
+
+    if drop_variables is None:
+        drop_variables = set()
+
     missing_vars = set(loadable_variables) - set(zarr_arrays)
     if missing_vars:
         raise ValueError(
@@ -182,7 +196,7 @@ def virtual_dataset_from_zarr_group(

     virtual_variable_mapping = {
         f"{var}": construct_virtual_array(
-            zarr_group=zg, var_name=var, filepath=store_path
+            zarr_group=zg, var_name=var, store_path=store_path
         )
         for var in virtual_vars
     }

@@ -232,23 +232,23 @@ def virtual_dataset_from_zarr_group(

-async def get_chunk_size(zarr_group: zarr.core.group, chunk_key: PosixPath) -> int:
+async def get_chunk_size(zarr_group: zarr.Group, chunk_key: PosixPath) -> int:
     # Use zarr-python's `getsize` method to get bytes per chunk
     return await zarr_group.store.getsize(chunk_key)


-async def chunk_exists(zarr_group: zarr.core.group, chunk_key: PosixPath) -> bool:
+async def chunk_exists(zarr_group: zarr.Group, chunk_key: PosixPath) -> bool:
     # calls zarr-python's `exists` to check for a chunk
     return await zarr_group.store.exists(chunk_key)


-async def list_store_keys(zarr_group: zarr.core.group) -> list[str]:
+async def list_store_keys(zarr_group: zarr.Group) -> list[str]:
     # Lists all keys in a store
     return [item async for item in zarr_group.store.list()]


 async def get_chunk_paths(
-    zarr_group: zarr.core.group, array_name: str, store_path: upath.core.UPath
+    zarr_group: zarr.Group, array_name: str, store_path: upath.core.UPath
 ) -> dict:
     chunk_paths = {}

@@ -279,7 +293,7 @@ async def get_chunk_paths(

 def construct_virtual_array(
-    zarr_group: zarr.core.group.Group, var_name: str, store_path: upath.core.UPath
+    zarr_group: zarr.Group, var_name: str, store_path: upath.core.UPath
 ):
     zarr_array = zarr_group[var_name]

diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py
index b1de5dac..47e5e2da 100644
--- a/virtualizarr/utils.py
+++ b/virtualizarr/utils.py
@@ -7,6 +7,7 @@
 if TYPE_CHECKING:
     import fsspec.core
     import fsspec.spec
+    import upath

 # See pangeo_forge_recipes.storage
 OpenFileType = Union[
@@ -32,7 +33,7 @@ class _FsspecFSFromFilepath:

     """

-    filepath: str
+    filepath: str | upath.core.UPath
     reader_options: Optional[dict] = field(default_factory=dict)
     fs: fsspec.AbstractFileSystem = field(init=False)

@@ -59,7 +60,9 @@ def __post_init__(self) -> None:
         import fsspec
         from upath import UPath

-        self.filepath = UPath(self.filepath)
+        if not isinstance(self.filepath, UPath):
+            self.filepath = UPath(self.filepath)
+
         protocol = self.filepath.protocol

         self.reader_options = self.reader_options or {}
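The `universal_pathlib` dependency is doing the protocol sniffing here: `UPath` parses local and remote paths alike and exposes `.protocol`, which can then seed `fsspec.filesystem`. A small standalone illustration of that pattern (the paths are placeholders):

    import fsspec
    from upath import UPath

    def filesystem_for(path: str, storage_options: dict | None = None):
        upath = UPath(path)
        # .protocol is "" or "file" for local paths, "s3", "gs", etc. for remote ones
        protocol = upath.protocol
        return fsspec.filesystem(protocol, **(storage_options or {}))

    # fs = filesystem_for("s3://some-bucket/store.zarr", {"anon": True})
    # fs = filesystem_for("/tmp/store.zarr")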
From 5d14b20794e6034926ec3bc527daf153b2e8d653 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 17 Dec 2024 13:49:42 -0700
Subject: [PATCH 51/98] make mypy happy

---
 virtualizarr/readers/common.py |  2 +-
 virtualizarr/readers/zarr.py   | 38 ++++++++++++++++++----------------
 virtualizarr/utils.py          | 11 ++++++----
 3 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py
index 553032b3..a737d0ca 100644
--- a/virtualizarr/readers/common.py
+++ b/virtualizarr/readers/common.py
@@ -49,7 +49,7 @@ def open_loadable_vars_and_indexes(

     # Updates the Xarray open_dataset kwargs if Zarr
-    if fpath.filepath.suffix == ".zarr":  # type: ignore
+    if fpath.upath.suffix == ".zarr":
         engine = "zarr"
         xr_input = fpath.filepath

diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py
index f11f0c27..ff1bfafa 100644
--- a/virtualizarr/readers/zarr.py
+++ b/virtualizarr/readers/zarr.py
@@ -22,8 +22,6 @@
 from virtualizarr.zarr import ZArray

 if TYPE_CHECKING:
-    from pathlib import PosixPath
-
     import upath
     import zarr

@@ -232,12 +230,12 @@ def virtual_dataset_from_zarr_group(

-async def get_chunk_size(zarr_group: zarr.Group, chunk_key: PosixPath) -> int:
+async def get_chunk_size(zarr_group: zarr.Group, chunk_key: str) -> int:
     # Use zarr-python's `getsize` method to get bytes per chunk
     return await zarr_group.store.getsize(chunk_key)


-async def chunk_exists(zarr_group: zarr.Group, chunk_key: PosixPath) -> bool:
+async def chunk_exists(zarr_group: zarr.Group, chunk_key: str) -> bool:
     # calls zarr-python's `exists` to check for a chunk
     return await zarr_group.store.exists(chunk_key)

@@ -300,12 +298,12 @@ def construct_virtual_array(
     attrs = zarr_array.metadata.attributes

     if zarr_array.metadata.zarr_format == 2:
-        array_zarray = _parse_zarr_v2_metadata(zarr_array=zarr_array)
+        array_zarray = _parse_zarr_v2_metadata(zarr_array=zarr_array)  # type: ignore[arg-type]
         array_dims = attrs["_ARRAY_DIMENSIONS"]

     elif zarr_array.metadata.zarr_format == 3:
-        array_zarray = _parse_zarr_v3_metadata(zarr_array=zarr_array)
-        array_dims = zarr_array.metadata.dimension_names
+        array_zarray = _parse_zarr_v3_metadata(zarr_array=zarr_array)  # type: ignore[arg-type]
+        array_dims = zarr_array.metadata.dimension_names  # type: ignore[union-attr]
     else:
         raise NotImplementedError("Zarr format is not recognized as v2 or v3.")

@@ -333,20 +331,20 @@ def construct_virtual_array(

-def _parse_zarr_v2_metadata(zarr_array: zarr.core.array.Array) -> ZArray:
+def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray:
     return ZArray(
         shape=zarr_array.metadata.shape,
-        chunks=zarr_array.metadata.chunks,
+        chunks=zarr_array.metadata.chunks,  # type: ignore[union-attr]
         dtype=zarr_array.metadata.dtype,
-        fill_value=zarr_array.metadata.fill_value,
+        fill_value=zarr_array.metadata.fill_value,  # type: ignore[arg-type]
         order="C",
-        compressor=zarr_array.metadata.compressor,
-        filters=zarr_array.metadata.filters,
+        compressor=zarr_array.metadata.compressor,  # type: ignore[union-attr]
+        filters=zarr_array.metadata.filters,  # type: ignore
         zarr_format=zarr_array.metadata.zarr_format,
     )


-def _parse_zarr_v3_metadata(zarr_array: zarr.core.array.Array) -> ZArray:
+def _parse_zarr_v3_metadata(zarr_array: zarr.Array) -> ZArray:
     from virtualizarr.codecs import get_codecs

     if zarr_array.metadata.fill_value is None:
@@ -360,14 +358,18 @@ def _parse_zarr_v3_metadata(zarr_array: zarr.Array) -> ZArray:
     # Questions: What do we do with endian info?
     codecs = get_codecs(zarr_array)

-    compressor = getattr(codecs[0], "compressor", None)
-    filters = getattr(codecs[0], "filters", None)
+    # Question: How should we parse the values from get_codecs?
+    # typing: Union[Codec, tuple["ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec", ...]]
+    # mypy: ... is not indexable [index]
+    # added tmp bypass for mypy
+    compressor = getattr(codecs[0], "compressor", None)  # type: ignore
+    filters = getattr(codecs[0], "filters", None)  # type: ignore

     return ZArray(
-        chunks=zarr_array.metadata.chunk_grid.chunk_shape,
+        chunks=zarr_array.metadata.chunk_grid.chunk_shape,  # type: ignore[attr-defined]
         compressor=compressor,
-        dtype=zarr_array.metadata.data_type.name,
-        fill_value=fill_value,
+        dtype=zarr_array.metadata.data_type.name,  # type: ignore
+        fill_value=fill_value,  # type: ignore[arg-type]
         filters=filters,
         order="C",
         shape=zarr_array.metadata.shape,

diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py
index 47e5e2da..ca1f22c4 100644
--- a/virtualizarr/utils.py
+++ b/virtualizarr/utils.py
@@ -33,9 +33,10 @@ class _FsspecFSFromFilepath:

     """

-    filepath: str | upath.core.UPath
+    filepath: str
     reader_options: Optional[dict] = field(default_factory=dict)
     fs: fsspec.AbstractFileSystem = field(init=False)
+    upath: upath.core.UPath = field(init=False)

     def open_file(self) -> OpenFileType:
         """Calls `.open` on fsspec.Filesystem instantiation using self.filepath as an input.
@@ -61,14 +62,16 @@ def __post_init__(self) -> None:
         from upath import UPath

         if not isinstance(self.filepath, UPath):
-            self.filepath = UPath(self.filepath)
+            upath = UPath(self.filepath)

-        protocol = self.filepath.protocol
+        self.upath = upath
+        self.protocol = upath.protocol
+        self.filepath = upath.as_uri()

         self.reader_options = self.reader_options or {}

         storage_options = self.reader_options.get("storage_options", {})  # type: ignore

-        self.fs = fsspec.filesystem(protocol, **storage_options)
+        self.fs = fsspec.filesystem(self.protocol, **storage_options)
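On the open question in the comments about parsing `get_codecs` output: a zarr v3 codec pipeline is an ordered sequence of array->array, array->bytes, and bytes->bytes codecs, so one way to avoid the `getattr` probing would be to classify each codec by its role. A sketch, assuming zarr-python v3's codec base classes:

    from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec

    def split_codec_pipeline(codecs):
        filters = [c for c in codecs if isinstance(c, ArrayArrayCodec)]
        # the single array->bytes codec is what carries endianness, e.g. BytesCodec(endian=...)
        serializer = next((c for c in codecs if isinstance(c, ArrayBytesCodec)), None)
        compressors = [c for c in codecs if isinstance(c, BytesBytesCodec)]
        return filters, serializer, compressors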
From fb844b62045bf5c13d601c00c1164675b3a97d2b Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Wed, 18 Dec 2024 17:26:02 -0500
Subject: [PATCH 52/98] adds option for AsyncArray to _is_zarr_array

---
 virtualizarr/codecs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py
index ad2a3d9b..b544b82d 100644
--- a/virtualizarr/codecs.py
+++ b/virtualizarr/codecs.py
@@ -65,9 +65,9 @@ def _get_manifestarray_codecs(
 def _is_zarr_array(array: object) -> bool:
     """Check if the array is an instance of Zarr Array."""
     try:
-        from zarr import Array
+        from zarr import Array, AsyncArray

-        return isinstance(array, Array)
+        return isinstance(array, (Array, AsyncArray))
     except ImportError:
         return False
From 421f53f382ee4d7879684cd00d0a95d1f2a9572e Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 19 Dec 2024 17:40:17 -0500
Subject: [PATCH 53/98] big async rewrite

---
 virtualizarr/backend.py            |   2 -
 virtualizarr/manifests/manifest.py |   1 -
 virtualizarr/readers/__init__.py   |   2 -
 virtualizarr/readers/zarr.py       | 561 +++++++++++------------------
 4 files changed, 201 insertions(+), 365 deletions(-)

diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index 99d5aec7..15681db1 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -17,7 +17,6 @@
     KerchunkVirtualBackend,
     NetCDF3VirtualBackend,
     TIFFVirtualBackend,
-    ZarrV3ChunkManifestVirtualBackend,  # If this is kept, we should incorporate it into ZarrVirtualBackend
     ZarrVirtualBackend,
 )
 from virtualizarr.readers.common import VirtualBackend
@@ -26,7 +25,6 @@
 # TODO add entrypoint to allow external libraries to add to this mapping
 VIRTUAL_BACKENDS = {
     "kerchunk": KerchunkVirtualBackend,
-    "zarr_v3": ZarrV3ChunkManifestVirtualBackend,
     "zarr": ZarrVirtualBackend,
     "dmrpp": DMRPPVirtualBackend,
     # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index 77282095..6d6b4daf 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -48,7 +48,6 @@ def with_validation(
         """

         # note: we can't just use `__init__` or a dataclass' `__post_init__` because we need `fs_root` to be an optional kwarg
-        path = validate_and_normalize_path_to_uri(path, fs_root=fs_root)
         validate_byte_range(offset=offset, length=length)
         return ChunkEntry(path=path, offset=offset, length=length)

diff --git a/virtualizarr/readers/__init__.py b/virtualizarr/readers/__init__.py
index ef5d3568..3d887844 100644
--- a/virtualizarr/readers/__init__.py
+++ b/virtualizarr/readers/__init__.py
@@ -6,7 +6,6 @@
 from virtualizarr.readers.netcdf3 import NetCDF3VirtualBackend
 from virtualizarr.readers.tiff import TIFFVirtualBackend
 from virtualizarr.readers.zarr import (
-    ZarrV3ChunkManifestVirtualBackend,
     ZarrVirtualBackend,
 )

@@ -19,5 +18,4 @@
     "NetCDF3VirtualBackend",
     "TIFFVirtualBackend",
     "ZarrVirtualBackend",
-    "ZarrV3ChunkManifestVirtualBackend",
 ]

diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py
index ff1bfafa..b950b81a 100644
--- a/virtualizarr/readers/zarr.py
+++ b/virtualizarr/readers/zarr.py
@@ -1,215 +1,200 @@
 from __future__ import annotations

-import json
+import asyncio
 from pathlib import Path
 from typing import TYPE_CHECKING, Iterable, Mapping, Optional

-import numcodecs
-import numpy as np
 from xarray import Dataset, Index, Variable
+from zarr.core.common import concurrent_map

 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.manifests.manifest import (
-    validate_and_normalize_path_to_uri,
-)
+from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
 from virtualizarr.readers.common import (
     VirtualBackend,
     construct_virtual_dataset,
     open_loadable_vars_and_indexes,
-    separate_coords,
 )
 from virtualizarr.utils import check_for_collisions
 from virtualizarr.zarr import ZArray

 if TYPE_CHECKING:
-    import upath
     import zarr


-class ZarrVirtualBackend(VirtualBackend):
-    @staticmethod
-    def open_virtual_dataset(
-        filepath: str,
-        group: str | None = None,
-        drop_variables: Iterable[str] | None = None,
-        loadable_variables: Iterable[str] | None = None,
-        decode_times: bool | None = None,
-        indexes: Mapping[str, Index] | None = None,
-        virtual_backend_kwargs: Optional[dict] = None,
-        reader_options: Optional[dict] = None,
-    ) -> Dataset:
-        # Question: Is this something we want to pass through?
-        if virtual_backend_kwargs:
-            raise NotImplementedError(
-                "Zarr reader does not understand any virtual_backend_kwargs"
-            )
+async def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray:
+    return ZArray(
+        shape=zarr_array.metadata.shape,
+        chunks=zarr_array.metadata.chunks,  # type: ignore[union-attr]
+        dtype=zarr_array.metadata.dtype,
+        fill_value=zarr_array.metadata.fill_value,  # type: ignore[arg-type]
+        order="C",
+        compressor=zarr_array.metadata.compressor,  # type: ignore[union-attr]
+        filters=zarr_array.metadata.filters,  # type: ignore
+        zarr_format=zarr_array.metadata.zarr_format,
+    )

-        import zarr
-        from packaging import version

-        if version.parse(zarr.__version__).major < 3:
-            raise ImportError("Zarr V3 is required")
+async def _parse_zarr_v3_metadata(zarr_array: zarr.Array) -> ZArray:
+    from virtualizarr.codecs import get_codecs

-        drop_variables, loadable_variables = check_for_collisions(
-            drop_variables,
-            loadable_variables,
+    if zarr_array.metadata.fill_value is None:
+        raise ValueError(
+            "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value"
         )
+    else:
+        fill_value = zarr_array.metadata.fill_value

-        return virtual_dataset_from_zarr_group(
-            filepath=filepath,
-            group=group,
-            drop_variables=drop_variables,
-            loadable_variables=loadable_variables,
-            decode_times=decode_times,
-            indexes=indexes,
-            reader_options=reader_options,
-        )
+    # Codecs from test looks like: (BytesCodec(endian=),)
+    # Questions: What do we do with endian info?
+    codecs = get_codecs(zarr_array)
+    # Question: How should we parse the values from get_codecs?
+    # typing: Union[Codec, tuple["ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec", ...]]
+    # mypy: ... is not indexable [index]
+    # added tmp bypass for mypy
+    compressor = getattr(codecs[0], "compressor", None)  # type: ignore
+    filters = getattr(codecs[0], "filters", None)  # type: ignore

-class ZarrV3ChunkManifestVirtualBackend(VirtualBackend):
-    @staticmethod
-    def open_virtual_dataset(
-        filepath: str,
-        group: str | None = None,
-        drop_variables: Iterable[str] | None = None,
-        loadable_variables: Iterable[str] | None = None,
-        decode_times: bool | None = None,
-        indexes: Mapping[str, Index] | None = None,
-        virtual_backend_kwargs: Optional[dict] = None,
-        reader_options: Optional[dict] = None,
-    ) -> Dataset:
-        """
-        Read a Zarr v3 store containing chunk manifests and return an xarray Dataset containing virtualized arrays.
+    return ZArray(
+        chunks=zarr_array.metadata.chunk_grid.chunk_shape,  # type: ignore[attr-defined]
+        compressor=compressor,
+        dtype=zarr_array.metadata.data_type.name,  # type: ignore
+        fill_value=fill_value,  # type: ignore[arg-type]
+        filters=filters,
+        order="C",
+        shape=zarr_array.metadata.shape,
+        zarr_format=zarr_array.metadata.zarr_format,
+    )

-        This is experimental - chunk manifests are not part of the Zarr v3 Spec.
-        """
-        if virtual_backend_kwargs:
-            raise NotImplementedError(
-                "Zarr V3 Chunk Manifest reader does not understand any virtual_backend_kwargs"
+
+async def build_chunk_manifest(
+    store_path: str, chunk_mapping_dict: dict, array_name: str, zarr_format: int
+) -> ChunkManifest:
+    chunk_manifest_dict = {}
+
+    for key, value in chunk_mapping_dict.items():
+        if zarr_format == 2:
+            # split on array name + trailing slash
+            chunk_key = key.split(array_name + "/")[-1]
+
+        elif zarr_format == 3:
+            # In v3 we remove the /c/ 'chunks' part of the key and
+            # replace trailing slashes with '.' to conform to ChunkManifest validation
+            chunk_key = (
+                key.split(array_name + "/")[-1].split("c/")[-1].replace("/", ".")
             )
-        storepath = Path(filepath)
-
-        if group:
-            raise NotImplementedError()
-
-        if loadable_variables or decode_times:
-            raise NotImplementedError()
-
-        if reader_options:
-            raise NotImplementedError()
-
-        drop_vars: list[str]
-        if drop_variables is None:
-            drop_vars = []
-        else:
-            drop_vars = list(drop_variables)
-
-        ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json")
-        coord_names = ds_attrs.pop("coordinates", [])
-
-        # TODO recursive glob to create a datatree
-        # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it
-        # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166
-        all_paths = storepath.glob("*/")
-        directory_paths = [p for p in all_paths if not p.is_file()]
-
-        vars = {}
-        for array_dir in directory_paths:
-            var_name = array_dir.name
-            if var_name in drop_vars:
-                break
-
-            zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json")
-            manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json"))
-
-            marr = ManifestArray(chunkmanifest=manifest, zarray=zarray)
-            var = Variable(data=marr, dims=dim_names, attrs=attrs)
-            vars[var_name] = var
-
-        if indexes is None:
-            raise NotImplementedError()
-        elif indexes != {}:
-            # TODO allow manual specification of index objects
-            raise NotImplementedError()
-        else:
-            indexes = dict(**indexes)  # for type hinting: to allow mutation
-
-        data_vars, coords = separate_coords(vars, indexes, coord_names)
-
-        ds = Dataset(
-            data_vars,
-            coords=coords,
-            # indexes={},  # TODO should be added in a later version of xarray
-            attrs=ds_attrs,
-        )
+        # key.split('/c/')[-1]
+        chunk_manifest_dict[chunk_key] = {
+            "path": store_path + "/" + key,
+            "offset": 0,
+            "length": value,
+        }
+
+    return ChunkManifest(chunk_manifest_dict)
+
+
+async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, prefix: str) -> dict:
+    """Create a chunk map"""
+
+    keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]
+
+    sizes = await concurrent_map(keys, zarr_array.store.getsize)
+    return {key[0]: size for key, size in zip(keys, sizes)}
+
+
+async def build_zarray_metadata(zarr_array: zarr.AsyncArray):
+    attrs = zarr_array.metadata.attributes
+
+    if zarr_array.metadata.zarr_format == 2:
+        array_zarray = await _parse_zarr_v2_metadata(zarr_array=zarr_array)  # type: ignore[arg-type]
+        array_dims = attrs["_ARRAY_DIMENSIONS"]
+
+    elif zarr_array.metadata.zarr_format == 3:
+        array_zarray = await _parse_zarr_v3_metadata(zarr_array=zarr_array)  # type: ignore[arg-type]
+        array_dims = zarr_array.metadata.dimension_names  # type: ignore[union-attr]

-        return ds
+    else:
+        raise NotImplementedError("Zarr format is not recognized as v2 or v3.")

+    return {
+        "zarray_array": array_zarray,
+        "array_dims": array_dims,
+        "array_metadata": attrs,
+    }

-def virtual_dataset_from_zarr_group(
+
+async def virtual_variable_from_zarr_array(zarr_array: zarr.AsyncArray, filepath: str):
+    # keys: array_zarray & array_dims
+    zarray_array = await build_zarray_metadata(zarr_array=zarr_array)
+
+    array_name = zarr_array.basename
+    # build mapping between chunks and # of bytes (size)
+    chunk_map = await get_chunk_mapping_prefix(zarr_array, prefix=f"{array_name}/c")
+    # transform chunk_map into ChunkManifest that fits into ManifestArray
+    chunk_manifest = await build_chunk_manifest(
+        store_path=filepath,
+        chunk_mapping_dict=chunk_map,
+        array_name=array_name,
+        zarr_format=zarray_array["zarray_array"].zarr_format,
+    )
+
+    # build ManifestArray from dict
+    manifest_array = ManifestArray(
+        zarray=zarray_array["zarray_array"], chunkmanifest=chunk_manifest
+    )
+
+    return Variable(
+        dims=zarray_array["array_dims"],
+        data=manifest_array,
+        attrs=zarray_array["array_metadata"],
+    )
+
+
+async def virtual_dataset_from_zarr_group(
     filepath: str,
     group: str | None = None,
     drop_variables: Iterable[str] | None = [],
+    virtual_variables: Iterable[str] | None = [],
     loadable_variables: Iterable[str] | None = [],
     decode_times: bool | None = None,
     indexes: Mapping[str, Index] | None = None,
-    reader_options: Optional[dict] = None,
-) -> Dataset:
+    reader_options: dict = {},
+):
     import zarr
-    from upath import UPath
-
-    filepath = validate_and_normalize_path_to_uri(filepath, fs_root=Path.cwd().as_uri())
-    # This currently fails for local filepaths (i.e. tests):
-    # *** TypeError: Filesystem needs to support async operations.
-    # https://github.com/zarr-developers/zarr-python/issues/2554
-    # use UPath for combining store path + chunk key when building chunk manifests
-    store_path = UPath(filepath)
-
-    if reader_options is None:
-        reader_options = {}
-
-    zg = zarr.open_group(
+    zg = await zarr.api.asynchronous.open_group(
         filepath, storage_options=reader_options.get("storage_options"), mode="r"
     )

-    zarr_arrays = [val for val in zg.keys()]
-
-    # mypy typing
-    if loadable_variables is None:
-        loadable_variables = set()
-
-    if drop_variables is None:
-        drop_variables = set()
-
-    missing_vars = set(loadable_variables) - set(zarr_arrays)
-    if missing_vars:
-        raise ValueError(
-            f"Some loadable variables specified are not present in this zarr store: {missing_vars}"
+    virtual_zarr_arrays = await asyncio.gather(
+        *[zg.getitem(var) for var in virtual_variables]
     )

-    virtual_vars = list(
-        set(zarr_arrays) - set(loadable_variables) - set(drop_variables)
+    virtual_variable_arrays = await asyncio.gather(
+        *[
+            virtual_variable_from_zarr_array(zarr_array=array, filepath=filepath)
+            for array in virtual_zarr_arrays
+        ]
     )

-    virtual_variable_mapping = {
-        f"{var}": construct_virtual_array(
-            zarr_group=zg, var_name=var, store_path=store_path
-        )
-        for var in virtual_vars
+    # build a dict mapping for use later in construct_virtual_dataset
+    virtual_variable_array_mapping = {
+        array.basename: result
+        for array, result in zip(virtual_zarr_arrays, virtual_variable_arrays)
     }

+    # flatten nested tuples and get set -> list
     coord_names = list(
         set(
-            item
-            for tup in [
-                virtual_variable_mapping[val].dims for val in virtual_variable_mapping
+            [
+                item
+                for tup in [val.dims for val in virtual_variable_arrays]
+                for item in tup
             ]
-            for item in tup
         )
     )

-    non_loadable_variables = list(set(virtual_vars).union(set(drop_variables)))
+    non_loadable_variables = list(set(virtual_variables).union(set(drop_variables)))

     loadable_vars, indexes = open_loadable_vars_and_indexes(
         filepath,
@@ -222,225 +207,81 @@ def virtual_dataset_from_zarr_group(
     )

     return construct_virtual_dataset(
-        virtual_vars=virtual_variable_mapping,
+        virtual_vars=virtual_variable_array_mapping,
         loadable_vars=loadable_vars,
         indexes=indexes,
         coord_names=coord_names,
-        attrs=zg.attrs.asdict(),
+        attrs=zg.attrs,
     )


-async def get_chunk_size(zarr_group: zarr.Group, chunk_key: str) -> int:
-    # Use zarr-python's `getsize` method to get bytes per chunk
-    return await zarr_group.store.getsize(chunk_key)
-
-
-async def chunk_exists(zarr_group: zarr.Group, chunk_key: str) -> bool:
-    # calls zarr-python's `exists` to check for a chunk
-    return await zarr_group.store.exists(chunk_key)
-
-
-async def list_store_keys(zarr_group: zarr.Group) -> list[str]:
-    # Lists all keys in a store
-    return [item async for item in zarr_group.store.list()]
-
-
-async def get_chunk_paths(
-    zarr_group: zarr.Group, array_name: str, store_path: upath.core.UPath
-) -> dict:
-    chunk_paths = {}
-
-    # Can we call list() on an array instead of the entire store?
-    store_keys = zarr.core.sync.sync(list_store_keys(zarr_group))
-
-    for item in store_keys:
-        # should we move these filters/checks into list_store_keys?
-        if (
-            not item.endswith((".zarray", ".zattrs", ".zgroup", ".zmetadata", ".json"))
-            and item.startswith(array_name)
-            and zarr.core.sync.sync(chunk_exists(zarr_group=zarr_group, chunk_key=item))
-        ):
-            if zarr_group.metadata.zarr_format == 2:
-                # split on array name + trailing slash
-                chunk_key = item.split(array_name + "/")[-1]
-
-            elif zarr_group.metadata.zarr_format == 3:
-                # In v3 we remove the /c/ 'chunks' part of the key and
-                # replace trailing slashes with '.' to conform to ChunkManifest validation
-                chunk_key = (
-                    item.split(array_name + "/")[-1].split("c/")[-1].replace("/", ".")
-                )
-
-            else:
-                raise NotImplementedError(
-                    f"{zarr_group.metadata.zarr_format} not 2 or 3."
-                )
-
-            # Can we ask Zarr-python for the path and protocol?
-            # This gives path, but no protocol: zarr_group.store.path
-
-            chunk_paths[chunk_key] = {
-                "path": ((store_path / item).as_uri()),
-                "offset": 0,
-                "length": zarr.core.sync.sync(get_chunk_size(zarr_group, item)),
-            }
-
-    # Note: This won't work for sharded stores: https://github.com/zarr-developers/VirtualiZarr/pull/271#discussion_r1844487578
-
-    return chunk_paths
-
-
-def construct_virtual_array(
-    zarr_group: zarr.Group, var_name: str, store_path: upath.core.UPath
-):
-    zarr_array = zarr_group[var_name]
-
-    attrs = zarr_array.metadata.attributes
-
-    if zarr_array.metadata.zarr_format == 2:
-        array_zarray = _parse_zarr_v2_metadata(zarr_array=zarr_array)  # type: ignore[arg-type]
-        array_dims = attrs["_ARRAY_DIMENSIONS"]
-
-    elif zarr_array.metadata.zarr_format == 3:
-        array_zarray = _parse_zarr_v3_metadata(zarr_array=zarr_array)  # type: ignore[arg-type]
-        array_dims = zarr_array.metadata.dimension_names  # type: ignore[union-attr]
-
-    else:
-        raise NotImplementedError("Zarr format is not recognized as v2 or v3.")
-
-    import asyncio
-
-    array_chunk_sizes = asyncio.run(
-        get_chunk_paths(
-            zarr_group=zarr_group, array_name=var_name, store_path=store_path
-        )
-    )
-
-    array_chunkmanifest = ChunkManifest(array_chunk_sizes)
-
-    array_manifest_array = ManifestArray(
-        zarray=array_zarray, chunkmanifest=array_chunkmanifest
-    )
-
-    array_variable = Variable(
-        dims=array_dims,
-        data=array_manifest_array,
-        attrs=attrs,
-    )
-
-    return array_variable
-
-
-def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray:
-    return ZArray(
-        shape=zarr_array.metadata.shape,
-        chunks=zarr_array.metadata.chunks,  # type: ignore[union-attr]
-        dtype=zarr_array.metadata.dtype,
-        fill_value=zarr_array.metadata.fill_value,  # type: ignore[arg-type]
-        order="C",
-        compressor=zarr_array.metadata.compressor,  # type: ignore[union-attr]
-        filters=zarr_array.metadata.filters,  # type: ignore
-        zarr_format=zarr_array.metadata.zarr_format,
-    )
-
-
-def _parse_zarr_v3_metadata(zarr_array: zarr.Array) -> ZArray:
-    from virtualizarr.codecs import get_codecs
-
-    if zarr_array.metadata.fill_value is None:
-        raise ValueError(
-            "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value"
+class ZarrVirtualBackend(VirtualBackend):
+    @staticmethod
+    def open_virtual_dataset(
+        filepath: str,
+        group: str | None = None,
+        drop_variables: Iterable[str] | None = None,
+        loadable_variables: Iterable[str] | None = None,
+        decode_times: bool | None = None,
+        indexes: Mapping[str, Index] | None = None,
+        virtual_backend_kwargs: Optional[dict] = None,
+        reader_options: Optional[dict] = None,
+    ) -> Dataset:
+        # Question: Is this something we want to pass through?
+        if virtual_backend_kwargs:
+            raise NotImplementedError(
+                "Zarr reader does not understand any virtual_backend_kwargs"
            )
-    else:
-        fill_value = zarr_array.metadata.fill_value

-    # Codecs from test looks like: (BytesCodec(endian=),)
-    # Questions: What do we do with endian info?
-    codecs = get_codecs(zarr_array)
+        import asyncio

-    # Question: How should we parse the values from get_codecs?
-    # typing: Union[Codec, tuple["ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec", ...]]
-    # mypy: ... is not indexable [index]
-    # added tmp bypass for mypy
-    compressor = getattr(codecs[0], "compressor", None)  # type: ignore
-    filters = getattr(codecs[0], "filters", None)  # type: ignore
+        import zarr
+        from packaging import version

-    return ZArray(
-        chunks=zarr_array.metadata.chunk_grid.chunk_shape,  # type: ignore[attr-defined]
-        compressor=compressor,
-        dtype=zarr_array.metadata.data_type.name,  # type: ignore
-        fill_value=fill_value,  # type: ignore[arg-type]
-        filters=filters,
-        order="C",
-        shape=zarr_array.metadata.shape,
-        zarr_format=zarr_array.metadata.zarr_format,
-    )
-
-
-def attrs_from_zarr_group_json(filepath: Path) -> dict:
-    with open(filepath) as metadata_file:
-        attrs = json.load(metadata_file)
-    return attrs["attributes"]
-
-
-def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
-    with open(filepath) as metadata_file:
-        metadata = json.load(metadata_file)
-
-    if {
-        "name": "chunk-manifest-json",
-        "configuration": {
-            "manifest": "./manifest.json",
-        },
-    } not in metadata.get("storage_transformers", []):
-        raise ValueError(
-            "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP."
+        if version.parse(zarr.__version__).major < 3:
+            raise ImportError("Zarr V3 is required")
+
+        drop_variables, loadable_variables = check_for_collisions(
+            drop_variables,
+            loadable_variables,
         )

-    attrs = metadata.pop("attributes")
-    dim_names = metadata.pop("dimension_names")
-
-    chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"])
-    shape = tuple(metadata["shape"])
-    zarr_format = metadata["zarr_format"]
-
-    if metadata["fill_value"] is None:
-        raise ValueError(
-            "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value"
+        filepath = validate_and_normalize_path_to_uri(
+            filepath, fs_root=Path.cwd().as_uri()
         )
-    else:
-        fill_value = metadata["fill_value"]
-
-    all_codecs = [
-        codec
-        for codec in metadata["codecs"]
-        if codec["name"] not in ("transpose", "bytes")
-    ]
-    compressor, *filters = [
-        _configurable_to_num_codec_config(_filter) for _filter in all_codecs
-    ]
-    zarray = ZArray(
-        chunks=chunk_shape,
-        compressor=compressor,
-        dtype=np.dtype(metadata["data_type"]),
-        fill_value=fill_value,
-        filters=filters or None,
-        order="C",
-        shape=shape,
-        zarr_format=zarr_format,
-    )
-
-    return zarray, dim_names, attrs
-
+        # This currently fails for local filepaths (i.e. tests):
+        # *** TypeError: Filesystem needs to support async operations.
+        # https://github.com/zarr-developers/zarr-python/issues/2554

-def _configurable_to_num_codec_config(configurable: dict) -> dict:
-    """
-    Convert a zarr v3 configurable into a numcodecs codec.
-    """
-    configurable_copy = configurable.copy()
-    codec_id = configurable_copy.pop("name")
-    if codec_id.startswith("numcodecs."):
-        codec_id = codec_id[len("numcodecs.") :]
-    configuration = configurable_copy.pop("configuration")
-
-    return numcodecs.get_codec({"id": codec_id, **configuration}).get_config()
+        if reader_options is None:
+            reader_options = {}
+
+        # This is just to grab array keys, so is sync alright?
+        zg = zarr.open_group(
+            filepath, storage_options=reader_options.get("storage_options"), mode="r"
+        )
+
+        zarr_array_keys = [val for val in zg.array_keys()]
+
+        missing_vars = set(loadable_variables) - set(zarr_array_keys)
+        if missing_vars:
+            raise ValueError(
+                f"Some loadable variables specified are not present in this zarr store: {missing_vars}"
+            )
+        virtual_vars = list(
+            set(zarr_array_keys) - set(loadable_variables) - set(drop_variables)
+        )
+
+        # How does this asyncio.run call interact with zarr-python's async event loop?
+        return asyncio.run(
+            virtual_dataset_from_zarr_group(
                filepath=filepath,
                group=group,
                virtual_variables=virtual_vars,
                drop_variables=drop_variables,
                loadable_variables=loadable_variables,
                decode_times=decode_times,
                indexes=indexes,
                reader_options=reader_options,
            )
        )
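The key-munging in `build_chunk_manifest` encodes the store-layout difference between the two formats: a v2 store lays chunks out as `air/0.0.0` while v3 uses `air/c/0/0/0`, and `ChunkManifest` wants the dotted form either way. Pulled out on its own, the translation in this patch looks roughly like this (array name is illustrative):

    def store_key_to_manifest_key(key: str, array_name: str, zarr_format: int) -> str:
        # strip the leading "<array_name>/" from the store key
        rest = key.split(array_name + "/")[-1]
        if zarr_format == 2:
            return rest  # "0.0.0" is already in manifest form
        elif zarr_format == 3:
            # drop the "c/" chunk prefix and dot-join the indices
            return rest.split("c/")[-1].replace("/", ".")
        raise NotImplementedError(f"unrecognized zarr format: {zarr_format}")

    assert store_key_to_manifest_key("air/0.0.0", "air", 2) == "0.0.0"
    assert store_key_to_manifest_key("air/c/0/0/0", "air", 3) == "0.0.0"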
From 1c5e42dcdc183b8d7bb5671bac081817069d8586 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 19 Dec 2024 17:47:19 -0500
Subject: [PATCH 54/98] fixes merge conflict

---
 docs/releases.rst | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/docs/releases.rst b/docs/releases.rst
index 617b5cc4..4db590f5 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -9,13 +9,11 @@ v1.2.1 (unreleased)
 New Features
 ~~~~~~~~~~~~

-<<<<<<< HEAD
 - Adds a Zarr reader to ``open_virtual_dataset``, which allows opening Zarr V2 and V3 stores as virtual datasets.
   (:pull:`271`) By `Raphael Hagen `_.
-=======
+
 - Added a ``.nbytes`` accessor method which displays the bytes needed to hold the virtual references in memory.
   (:issue:`167`, :pull:`227`) By `Tom Nicholas `_.
->>>>>>> main

 Breaking changes
 ~~~~~~~~~~~~~~~~
From 89d8555ef8567a5376ef4a38dfa2839f64baf633 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 19 Dec 2024 18:15:12 -0500
Subject: [PATCH 55/98] bit of restructure

---
 virtualizarr/readers/zarr.py | 104 ++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 51 deletions(-)

diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py
index a1257877..6c9fa16c 100644
--- a/virtualizarr/readers/zarr.py
+++ b/virtualizarr/readers/zarr.py
@@ -44,14 +44,9 @@ async def _parse_zarr_v3_metadata(zarr_array: zarr.Array) -> ZArray:
     else:
         fill_value = zarr_array.metadata.fill_value

-    # Codecs from test looks like: (BytesCodec(endian=),)
-    # Questions: What do we do with endian info?
     codecs = get_codecs(zarr_array)

     # Question: How should we parse the values from get_codecs?
-    # typing: Union[Codec, tuple["ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec", ...]]
-    # mypy: ... is not indexable [index]
-    # added tmp bypass for mypy
     compressor = getattr(codecs[0], "compressor", None)  # type: ignore
     filters = getattr(codecs[0], "filters", None)  # type: ignore

@@ -71,7 +66,7 @@ async def build_chunk_manifest(
     store_path: str, chunk_mapping_dict: dict, array_name: str, zarr_format: int
 ) -> ChunkManifest:
     chunk_manifest_dict = {}
-
+    # ToDo: We could skip the dict creation and build the Manifest from arrays directly
     for key, value in chunk_mapping_dict.items():
         if zarr_format == 2:
             # split on array name + trailing slash
@@ -83,7 +78,7 @@ async def build_chunk_manifest(
             chunk_key = (
                 key.split(array_name + "/")[-1].split("c/")[-1].replace("/", ".")
             )
-        # key.split('/c/')[-1]
+
         chunk_manifest_dict[chunk_key] = {
             "path": store_path + "/" + key,
             "offset": 0,

@@ -124,12 +119,12 @@ async def build_zarray_metadata(zarr_array: zarr.AsyncArray):

 async def virtual_variable_from_zarr_array(zarr_array: zarr.AsyncArray, filepath: str):
-    # keys: array_zarray & array_dims
+    array_name = zarr_array.basename
     zarray_array = await build_zarray_metadata(zarr_array=zarr_array)

-    array_name = zarr_array.basename
     # build mapping between chunks and # of bytes (size)
     chunk_map = await get_chunk_mapping_prefix(zarr_array, prefix=f"{array_name}/c")
+
     # transform chunk_map into ChunkManifest that fits into ManifestArray
     chunk_manifest = await build_chunk_manifest(
         store_path=filepath,

@@ -151,8 +146,9 @@ async def virtual_variable_from_zarr_array(zarr_array: zarr.AsyncArray, filepath: str):

 async def virtual_dataset_from_zarr_group(
+    zarr_group: zarr.AsyncGroup,
     filepath: str,
-    group: str | None = None,
+    group: str,
     drop_variables: Iterable[str] | None = [],
     virtual_variables: Iterable[str] | None = [],
     loadable_variables: Iterable[str] | None = [],
@@ -160,14 +156,8 @@ async def virtual_dataset_from_zarr_group(
     indexes: Mapping[str, Index] | None = None,
     reader_options: dict = {},
 ):
-    import zarr
-
-    zg = await zarr.api.asynchronous.open_group(
-        filepath, storage_options=reader_options.get("storage_options"), mode="r"
-    )
-
     virtual_zarr_arrays = await asyncio.gather(
-        *[zg.getitem(var) for var in virtual_variables]
+        *[zarr_group.getitem(var) for var in virtual_variables]
     )

     virtual_variable_arrays = await asyncio.gather(
@@ -211,7 +201,7 @@ async def virtual_dataset_from_zarr_group(
         loadable_vars=loadable_vars,
         indexes=indexes,
         coord_names=coord_names,
-        attrs=zg.attrs,
+        attrs=zarr_group.attrs,
     )

@@ -228,11 +218,6 @@ def open_virtual_dataset(
         virtual_backend_kwargs: Optional[dict] = None,
         reader_options: Optional[dict] = None,
     ) -> Dataset:
        # Question: Is this something we want to pass through?
-        if virtual_backend_kwargs:
-            raise NotImplementedError(
-                "Zarr reader does not understand any virtual_backend_kwargs"
-            )
-
         import asyncio

         import zarr
@@ -241,40 +226,56 @@ def open_virtual_dataset(
         if version.parse(zarr.__version__).major < 3:
             raise ImportError("Zarr V3 is required")

+        async def _open_virtual_dataset(
+            filepath=filepath,
+            group=group,
+            drop_variables=drop_variables,
+            loadable_variables=loadable_variables,
+            decode_times=decode_times,
+            indexes=indexes,
+            virtual_backend_kwargs=virtual_backend_kwargs,
+            reader_options=reader_options,
+        ):
+            if virtual_backend_kwargs:
+                raise NotImplementedError(
+                    "Zarr reader does not understand any virtual_backend_kwargs"
+                )
+
+            drop_variables, loadable_variables = check_for_collisions(
+                drop_variables,
+                loadable_variables,
+            )
-        drop_variables, loadable_variables = check_for_collisions(
-            drop_variables,
-            loadable_variables,
-        )

-        filepath = validate_and_normalize_path_to_uri(
-            filepath, fs_root=Path.cwd().as_uri()
-        )
-        # This currently fails for local filepaths (i.e. tests):
-        # *** TypeError: Filesystem needs to support async operations.
-        # https://github.com/zarr-developers/zarr-python/issues/2554
+            filepath = validate_and_normalize_path_to_uri(
+                filepath, fs_root=Path.cwd().as_uri()
+            )
+            # This currently fails for local filepaths (i.e. tests):
+            # *** TypeError: Filesystem needs to support async operations.
+            # https://github.com/zarr-developers/zarr-python/issues/2554

-        if reader_options is None:
-            reader_options = {}
+            if reader_options is None:
+                reader_options = {}

-        # This is just to grab array keys, so is sync alright?
-        zg = zarr.open_group(
-            filepath, storage_options=reader_options.get("storage_options"), mode="r"
-        )
+            zg = await zarr.api.asynchronous.open_group(
+                filepath,
+                storage_options=reader_options.get("storage_options"),
+                mode="r",
+            )

-        zarr_array_keys = [val for val in zg.array_keys()]
+            zarr_array_keys = [key async for key in zg.array_keys()]

-        missing_vars = set(loadable_variables) - set(zarr_array_keys)
-        if missing_vars:
-            raise ValueError(
-                f"Some loadable variables specified are not present in this zarr store: {missing_vars}"
+            missing_vars = set(loadable_variables) - set(zarr_array_keys)
+            if missing_vars:
+                raise ValueError(
+                    f"Some loadable variables specified are not present in this zarr store: {missing_vars}"
+                )
+            virtual_vars = list(
+                set(zarr_array_keys) - set(loadable_variables) - set(drop_variables)
             )
-        virtual_vars = list(
-            set(zarr_array_keys) - set(loadable_variables) - set(drop_variables)
-        )

-        # How does this asyncio.run call interact with zarr-python's async event loop?
-        return asyncio.run(
-            virtual_dataset_from_zarr_group(
+            # How does this asyncio.run call interact with zarr-python's async event loop?
+            return await virtual_dataset_from_zarr_group(
+                zarr_group=zg,
                 filepath=filepath,
                 group=group,
                 virtual_variables=virtual_vars,
@@ -284,4 +285,5 @@ def open_virtual_dataset(
                 indexes=indexes,
                 reader_options=reader_options,
             )
-        )
+
+        return asyncio.run(_open_virtual_dataset())
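On the `asyncio.run` question left in the comment above: `asyncio.run` creates and tears down its own event loop, so it raises `RuntimeError` if a loop is already running in the calling thread (as in Jupyter). One common workaround, sketched here rather than taken from the codebase, is to detect a running loop and fall back to a worker thread:

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    def run_coro(coro):
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # no loop in this thread: safe to use asyncio.run directly
            return asyncio.run(coro)
        # a loop is already running (e.g. Jupyter): run in a fresh thread
        with ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()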
From c1a521852e71492e3ba2052e93d42a67bdd95998 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 19 Dec 2024 18:24:25 -0500
Subject: [PATCH 56/98] nit

---
 virtualizarr/readers/zarr.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py
index 6c9fa16c..03035f3b 100644
--- a/virtualizarr/readers/zarr.py
+++ b/virtualizarr/readers/zarr.py
@@ -217,7 +217,6 @@ def open_virtual_dataset(
         virtual_backend_kwargs: Optional[dict] = None,
         reader_options: Optional[dict] = None,
     ) -> Dataset:
-        # Question: Is this something we want to pass through?
         import asyncio

         import zarr
@@ -249,7 +248,7 @@ async def _open_virtual_dataset(
             filepath = validate_and_normalize_path_to_uri(
                 filepath, fs_root=Path.cwd().as_uri()
             )
-            # This currently fails for local filepaths (i.e. tests):
+            # This currently fails for local filepaths (i.e. tests) but works for s3:
             # *** TypeError: Filesystem needs to support async operations.
             # https://github.com/zarr-developers/zarr-python/issues/2554

@@ -273,7 +272,6 @@ async def _open_virtual_dataset(
                 set(zarr_array_keys) - set(loadable_variables) - set(drop_variables)
             )

-            # How does this asyncio.run call interact with zarr-python's async event loop?
             return await virtual_dataset_from_zarr_group(
                 zarr_group=zg,
                 filepath=filepath,
@@ -286,4 +284,5 @@ async def _open_virtual_dataset(
                 reader_options=reader_options,
             )

+    # How does this asyncio.run call interact with zarr-python's async event loop?
     return asyncio.run(_open_virtual_dataset())

From 6af84b448fd7f40dfe55d2456467e284f5dfe3b5 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 19 Dec 2024 22:06:29 -0500
Subject: [PATCH 57/98] WIP on ChunkManifest.from_arrays

---
 virtualizarr/readers/zarr.py | 57 ++++++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py
index 03035f3b..1fabf1bb 100644
--- a/virtualizarr/readers/zarr.py
+++ b/virtualizarr/readers/zarr.py
@@ -62,7 +62,7 @@ async def _parse_zarr_v3_metadata(zarr_array: zarr.Array) -> ZArray:
     )


-async def build_chunk_manifest(
+async def build_chunk_manifest_from_dict_mapping(
     store_path: str, chunk_mapping_dict: dict, array_name: str, zarr_format: int
 ) -> ChunkManifest:
     chunk_manifest_dict = {}
@@ -88,12 +88,59 @@ async def build_chunk_manifest_from_dict_mapping(
     return ChunkManifest(chunk_manifest_dict)


+async def build_chunk_manifest(
+    zarr_array: zarr.AsyncArray, prefix: str, filepath: str
+) -> ChunkManifest:
+    """Build a ChunkManifest from arrays
+    keys will be chunks
+    add in filepath to chunks to make '_paths'
+    offsets are array of 0 of length (len(keys)) or len(paths)) np.ndarray[Any, np.dtype[np.uint64]]
+    sizes are '_lengths'
+
+    """
+    import numpy as np
+
+    keys = [x async for x in zarr_array.store.list_prefix(prefix)]
+    filepath_list = [filepath] * len(keys)
+
+    # can this be lambda'ed?
+    # stolen from manifest.py
+    def combine_path_chunk(filepath: str, chunk_key: str):
+        return filepath + chunk_key
+
+    vectorized_chunk_path_combine_func = np.vectorize(
+        combine_path_chunk, otypes=[np.dtypes.StringDType()]
+    )
+
+    # _paths: np.ndarray[Any, np.dtypes.StringDType]
+    _paths = vectorized_chunk_path_combine_func(filepath_list, keys)
+
+    # _offsets: np.ndarray[Any, np.dtype[np.uint64]]
+    # this seems like a very overly complicated way to make a list of len n of 0s with a
+    # certain dtype... I might have gotten carried away on the np.vectorize hypetrain
+    _offsets = np.vectorize(lambda x: [x] * len(_paths), otypes=[np.uint64])(0)
+
+    # _lengths: np.ndarray[Any, np.dtype[np.uint64]]
+    # maybe concurrent_map isn't the way to go, I think it expects tuples...
+    _lengths = await concurrent_map((keys), zarr_array.store.getsize)
+
+    import ipdb
+
+    ipdb.set_trace()
+    return ChunkManifest.from_arrays(
+        paths=_paths,  # type: ignore
+        offsets=_offsets,
+        lengths=_lengths,
+    )
+
+
 async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, prefix: str) -> dict:
     """Create a chunk map"""

     keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]

     sizes = await concurrent_map(keys, zarr_array.store.getsize)
+
     return {key[0]: size for key, size in zip(keys, sizes)}
@@ -122,11 +169,17 @@ async def virtual_variable_from_zarr_array(zarr_array: zarr.AsyncArray, filepath: str):
     array_name = zarr_array.basename
     zarray_array = await build_zarray_metadata(zarr_array=zarr_array)

+    ## TEST - build chunk manifest from arrays
+    chunk_manifest = await build_chunk_manifest(
+        zarr_array, prefix=f"{array_name}/c", filepath=filepath
+    )
+
     # build mapping between chunks and # of bytes (size)
+    # FIXME!!!!: This is hardcoded for v3!
     chunk_map = await get_chunk_mapping_prefix(zarr_array, prefix=f"{array_name}/c")

     # transform chunk_map into ChunkManifest that fits into ManifestArray
-    chunk_manifest = await build_chunk_manifest(
+    chunk_manifest = await build_chunk_manifest_from_dict_mapping(
         store_path=filepath,
         chunk_mapping_dict=chunk_map,
         array_name=array_name,
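`ChunkManifest.from_arrays` skips the per-entry dict validation by taking the manifest's three columns directly as numpy arrays, shaped like the chunk grid. A toy example of the shapes involved, with illustrative paths and byte counts (`from_arrays` is the VirtualiZarr constructor used above; `np.dtypes.StringDType` needs numpy 2):

    import numpy as np
    from virtualizarr.manifests import ChunkManifest

    # one entry per chunk; here a 1-D array with two chunks
    paths = np.array(
        ["file:///tmp/store.zarr/air/c/0", "file:///tmp/store.zarr/air/c/1"],
        dtype=np.dtypes.StringDType(),
    )
    offsets = np.zeros(2, dtype=np.uint64)             # whole objects start at byte 0
    lengths = np.array([4096, 4096], dtype=np.uint64)  # bytes per chunk object

    manifest = ChunkManifest.from_arrays(paths=paths, offsets=offsets, lengths=lengths)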
From 349386fc8485a2a54d40bbff8dfae4d653552ec9 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Sat, 21 Dec 2024 12:44:35 -0500
Subject: [PATCH 58/98] v2/v3 c chunk fix + build ChunkManifest from numpy
 arrays

---
 virtualizarr/manifests/manifest.py | 11 -----
 virtualizarr/readers/zarr.py       | 66 ++++++++++++++----------------
 2 files changed, 31 insertions(+), 46 deletions(-)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index bab1448a..6d6b4daf 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -357,17 +357,6 @@ def shape_chunk_grid(self) -> tuple[int, ...]:
     def __repr__(self) -> str:
         return f"ChunkManifest<shape={self.shape_chunk_grid}>"

-    @property
-    def nbytes(self) -> int:
-        """
-        Size required to hold these references in memory in bytes.
-
-        Note this is not the size of the referenced chunks if they were actually loaded into memory,
-        this is only the size of the pointers to the chunk locations.
-        If you were to load the data into memory it would be ~1e6x larger for 1MB chunks.
-        """
-        return self._paths.nbytes + self._offsets.nbytes + self._lengths.nbytes
-
     def __getitem__(self, key: ChunkKey) -> ChunkEntry:
         indices = split(key)
         path = self._paths[indices]

diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py
index 1fabf1bb..9161a650 100644
--- a/virtualizarr/readers/zarr.py
+++ b/virtualizarr/readers/zarr.py
@@ -91,42 +91,36 @@ async def build_chunk_manifest_from_dict_mapping(
 async def build_chunk_manifest(
     zarr_array: zarr.AsyncArray, prefix: str, filepath: str
 ) -> ChunkManifest:
-    """Build a ChunkManifest from arrays
-    keys will be chunks
-    add in filepath to chunks to make '_paths'
-    offsets are array of 0 of length (len(keys)) or len(paths)) np.ndarray[Any, np.dtype[np.uint64]]
-    sizes are '_lengths'
-
-    """
+    """Build a ChunkManifest with the from_arrays method"""
     import numpy as np

-    keys = [x async for x in zarr_array.store.list_prefix(prefix)]
-    filepath_list = [filepath] * len(keys)
+    # import pdb; pdb.set_trace()
+    key_tuples = [(x,) async for x in zarr_array.store.list_prefix(prefix)]
+
+    filepath_list = [filepath] * len(key_tuples)

     # can this be lambda'ed?
     # stolen from manifest.py
     def combine_path_chunk(filepath: str, chunk_key: str):
-        return filepath + chunk_key
+        return filepath + "/" + chunk_key

     vectorized_chunk_path_combine_func = np.vectorize(
         combine_path_chunk, otypes=[np.dtypes.StringDType()]
     )

     # _paths: np.ndarray[Any, np.dtypes.StringDType]
-    _paths = vectorized_chunk_path_combine_func(filepath_list, keys)
+    # turn the tuples of chunks to a flattened list with: list(sum(key_tuples, ()))
+    _paths = vectorized_chunk_path_combine_func(
+        filepath_list, list(sum(key_tuples, ()))
+    )

     # _offsets: np.ndarray[Any, np.dtype[np.uint64]]
-    # this seems like a very overly complicated way to make a list of len n of 0s with a
-    # certain dtype... I might have gotten carried away on the np.vectorize hypetrain
-    _offsets = np.vectorize(lambda x: [x] * len(_paths), otypes=[np.uint64])(0)
+    _offsets = np.array([0] * len(_paths), dtype=np.uint64)

     # _lengths: np.ndarray[Any, np.dtype[np.uint64]]
-    # maybe concurrent_map isn't the way to go, I think it expects tuples...
-    _lengths = await concurrent_map((keys), zarr_array.store.getsize)
+    lengths = await concurrent_map((key_tuples), zarr_array.store.getsize)
+    _lengths = np.array(lengths, dtype=np.uint64)

-    import ipdb
-
-    ipdb.set_trace()
     return ChunkManifest.from_arrays(
         paths=_paths,  # type: ignore
         offsets=_offsets,
         lengths=_lengths,
     )

@@ -134,7 +128,7 @@ async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, prefix: str) -> dict:

     keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]

     sizes = await concurrent_map(keys, zarr_array.store.getsize)
-
     return {key[0]: size for key, size in zip(keys, sizes)}

@@ -166,31 +159,34 @@ async def build_zarray_metadata(zarr_array: zarr.AsyncArray):

 async def virtual_variable_from_zarr_array(zarr_array: zarr.AsyncArray, filepath: str):
-    array_name = zarr_array.basename
+    # zarr_prefix = "/"+zarr_array.basename
+    zarr_prefix = zarr_array.basename
+
+    if zarr_array.metadata.zarr_format == 3:
+        # if we have zarr_v3, we add /c/ to that chunk paths
+        zarr_prefix = f"{zarr_prefix}/c"
+
     zarray_array = await build_zarray_metadata(zarr_array=zarr_array)

-    ## TEST - build chunk manifest from arrays
+    # build mapping between chunks and # of bytes (size)
     chunk_manifest = await build_chunk_manifest(
-        zarr_array, prefix=f"{array_name}/c", filepath=filepath
+        zarr_array, prefix=zarr_prefix, filepath=filepath
     )

-    # build mapping between chunks and # of bytes (size)
-    # FIXME!!!!: This is hardcoded for v3!
-    chunk_map = await get_chunk_mapping_prefix(zarr_array, prefix=f"{array_name}/c")
-
-    # transform chunk_map into ChunkManifest that fits into ManifestArray
-    chunk_manifest = await build_chunk_manifest_from_dict_mapping(
-        store_path=filepath,
-        chunk_mapping_dict=chunk_map,
-        array_name=array_name,
-        zarr_format=zarray_array["zarray_array"].zarr_format,
-    )
-
-    # build ManifestArray from dict
+    # old method -> building chunk manifests from dicts
+    # chunk_map = await get_chunk_mapping_prefix(zarr_array, prefix=f"{array_name}/c")
+    # # transform chunk_map into ChunkManifest that fits into ManifestArray
+    # chunk_manifest = await build_chunk_manifest_from_dict_mapping(
+    #     store_path=filepath,
+    #     chunk_mapping_dict=chunk_map,
+    #     array_name=array_name,
+    #     zarr_format=zarray_array["zarray_array"].zarr_format,
+    # )
+
+    # build ManifestArray from dict
     manifest_array = ManifestArray(
         zarray=zarray_array["zarray_array"], chunkmanifest=chunk_manifest
     )
     return Variable(
         dims=zarray_array["array_dims"],
         data=manifest_array,
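The `(x,)` tuple wrapping above is needed because `zarr.core.common.concurrent_map` takes one argument-tuple per task and unpacks it into the callable. Conceptually it is a bounded gather, along these lines (a sketch of the idea, not zarr-python's exact implementation):

    import asyncio

    async def concurrent_map_sketch(items, func, limit: int | None = None):
        # items: an iterable of argument tuples, e.g. [("air/c/0",), ("air/c/1",)]
        if limit is None:
            return await asyncio.gather(*(func(*args) for args in items))
        sem = asyncio.Semaphore(limit)

        async def bounded(*args):
            async with sem:
                return await func(*args)

        return await asyncio.gather(*(bounded(*args) for args in items))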
filepath=filepath ) - # old method -> building chunk manifests from dicts - # chunk_map = await get_chunk_mapping_prefix(zarr_array, prefix=f"{array_name}/c") - # # transform chunk_map into ChunkManifest that fits into ManifestArray - # chunk_manifest = await build_chunk_manifest_from_dict_mapping( - # store_path=filepath, - # chunk_mapping_dict=chunk_map, - # array_name=array_name, - # zarr_format=zarray_array["zarray_array"].zarr_format, - # ) - - # build ManifestArray from dict manifest_array = ManifestArray( zarray=zarray_array["zarray_array"], chunkmanifest=chunk_manifest ) From fb6fff75652851567c440af8cc384914b78e643b Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Sat, 21 Dec 2024 13:54:04 -0500 Subject: [PATCH 60/98] cleanup --- virtualizarr/readers/zarr_v3.py | 161 ------------------- virtualizarr/tests/test_writers/test_zarr.py | 62 ------- 2 files changed, 223 deletions(-) delete mode 100644 virtualizarr/readers/zarr_v3.py delete mode 100644 virtualizarr/tests/test_writers/test_zarr.py diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py deleted file mode 100644 index 70bf66e8..00000000 --- a/virtualizarr/readers/zarr_v3.py +++ /dev/null @@ -1,161 +0,0 @@ -import json -from pathlib import Path -from typing import Iterable, Mapping, Optional - -import numcodecs -import numpy as np -from xarray import Dataset, Index, Variable - -from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.readers.common import VirtualBackend, separate_coords -from virtualizarr.zarr import ZArray - - -class ZarrV3VirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> Dataset: - """ - Read a Zarr v3 store containing chunk manifests and return an xarray Dataset containing virtualized arrays. - - This is experimental - chunk manifests are not part of the Zarr v3 Spec. 
- """ - - if virtual_backend_kwargs: - raise NotImplementedError( - "Zarr_v3 reader does not understand any virtual_backend_kwargs" - ) - - storepath = Path(filepath) - - if group: - raise NotImplementedError() - - if loadable_variables or decode_times: - raise NotImplementedError() - - if reader_options: - raise NotImplementedError() - - drop_vars: list[str] - if drop_variables is None: - drop_vars = [] - else: - drop_vars = list(drop_variables) - - ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json") - coord_names = ds_attrs.pop("coordinates", []) - - # TODO recursive glob to create a datatree - # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it - # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 - all_paths = storepath.glob("*/") - directory_paths = [p for p in all_paths if not p.is_file()] - - vars = {} - for array_dir in directory_paths: - var_name = array_dir.name - if var_name in drop_vars: - break - - zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") - manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) - - marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) - var = Variable(data=marr, dims=dim_names, attrs=attrs) - vars[var_name] = var - - if indexes is None: - raise NotImplementedError() - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - data_vars, coords = separate_coords(vars, indexes, coord_names) - - ds = Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return ds - - -def attrs_from_zarr_group_json(filepath: Path) -> dict: - with open(filepath) as metadata_file: - attrs = json.load(metadata_file) - return attrs["attributes"] - - -def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: - with open(filepath) as metadata_file: - metadata = json.load(metadata_file) - - if { - "name": "chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json", - }, - } not in metadata.get("storage_transformers", []): - raise ValueError( - "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." - ) - - attrs = metadata.pop("attributes") - dim_names = metadata.pop("dimension_names") - - chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) - shape = tuple(metadata["shape"]) - zarr_format = metadata["zarr_format"] - - if metadata["fill_value"] is None: - raise ValueError( - "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" - ) - else: - fill_value = metadata["fill_value"] - - all_codecs = [ - codec - for codec in metadata["codecs"] - if codec["name"] not in ("transpose", "bytes") - ] - compressor, *filters = [ - _configurable_to_num_codec_config(_filter) for _filter in all_codecs - ] - zarray = ZArray( - chunks=chunk_shape, - compressor=compressor, - dtype=np.dtype(metadata["data_type"]), - fill_value=fill_value, - filters=filters or None, - order="C", - shape=shape, - zarr_format=zarr_format, - ) - - return zarray, dim_names, attrs - - -def _configurable_to_num_codec_config(configurable: dict) -> dict: - """ - Convert a zarr v3 configurable into a numcodecs codec. 
- """ - configurable_copy = configurable.copy() - codec_id = configurable_copy.pop("name") - if codec_id.startswith("numcodecs."): - codec_id = codec_id[len("numcodecs.") :] - configuration = configurable_copy.pop("configuration") - return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() diff --git a/virtualizarr/tests/test_writers/test_zarr.py b/virtualizarr/tests/test_writers/test_zarr.py deleted file mode 100644 index 19c4263b..00000000 --- a/virtualizarr/tests/test_writers/test_zarr.py +++ /dev/null @@ -1,62 +0,0 @@ -import json - -import pytest -import xarray.testing as xrt -from xarray import Dataset - -pytest.importorskip("zarr.core.metadata.v3") - -from virtualizarr import open_virtual_dataset -from virtualizarr.backend import FileType -from virtualizarr.readers.zarr import metadata_from_zarr_json -from virtualizarr.writers.zarr import dataset_to_zarr - - -def isconfigurable(value: dict) -> bool: - """ - Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict - """ - return "name" in value and "configuration" in value - - -def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: Dataset): - """ - Checks that the output metadata of an array variable conforms to this spec - for the required attributes: - https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata - """ - dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") - # read the a variable's metadata - with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f: - metadata = json.loads(f.read()) - assert metadata["zarr_format"] == 3 - assert metadata["node_type"] == "array" - assert isinstance(metadata["shape"], list) and all( - isinstance(dim, int) for dim in metadata["shape"] - ) - assert isinstance(metadata["data_type"], str) or isconfigurable( - metadata["data_type"] - ) - assert isconfigurable(metadata["chunk_grid"]) - assert isconfigurable(metadata["chunk_key_encoding"]) - assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) - assert ( - isinstance(metadata["codecs"], list) - and len(metadata["codecs"]) > 1 - and all(isconfigurable(codec) for codec in metadata["codecs"]) - ) - - -def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: Dataset): - vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") - roundtrip = open_virtual_dataset( - tmpdir / "store.zarr", filetype=FileType.zarr_v3, indexes={} - ) - - xrt.assert_identical(roundtrip, vds_with_manifest_arrays) - - -def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: Dataset): - dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") - zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json") - assert zarray == vds_with_manifest_arrays.a.data.zarray From 87c74d439b7c1c3ccab1799d81c7fab84a4ce6b0 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Sat, 21 Dec 2024 14:02:26 -0500 Subject: [PATCH 61/98] adds xfails to TestOpenVirtualDatasetZarr due to local filesystem zarr issue --- virtualizarr/tests/test_readers/test_zarr.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 7b4f5fbf..622bf211 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -12,10 +12,19 @@ @pytest.mark.parametrize( "zarr_store", [ - pytest.param(2, id="Zarr V2"), + pytest.param( + 2, + id="Zarr V2", + marks=pytest.mark.xfail( + 
reason="https://github.com/zarr-developers/zarr-python/issues/2554" + ), + ), pytest.param( 3, id="Zarr V3", + marks=pytest.mark.xfail( + reason="https://github.com/zarr-developers/zarr-python/issues/2554" + ), ), ], indirect=True, From 87dbdaed5e1225850311acf4269481bf6a4395e2 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 11:18:32 -0700 Subject: [PATCH 62/98] some nits after merging w/ main --- ci/upstream.yml | 6 ++++-- virtualizarr/manifests/manifest.py | 2 +- virtualizarr/readers/common.py | 2 +- virtualizarr/readers/zarr.py | 8 +++----- virtualizarr/tests/test_readers/test_zarr.py | 19 +++++++++++-------- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/ci/upstream.yml b/ci/upstream.yml index 174857cc..ab6a73c0 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -3,7 +3,6 @@ channels: - conda-forge - nodefaults dependencies: - - xarray>=2024.10.0 - h5netcdf - h5py - hdf5 @@ -27,8 +26,11 @@ dependencies: - pooch - fsspec - dask + - zarr>=3.0.0 - pip - pip: - - icechunk>=0.1.0a8 # Installs zarr v3 as dependency + - xarray>=2025.1.1 + + - icechunk>=0.1.0a9 # Installs zarr v3 as dependency # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) - imagecodecs-numcodecs==2024.6.1 diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 6d6b4daf..6d4d783e 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -96,7 +96,7 @@ def validate_and_normalize_path_to_uri(path: str, fs_root: str | None = None) -> # using PosixPath here ensures a clear error would be thrown on windows (whose paths and platform are not officially supported) _path = PosixPath(path) - if not _path.suffix: + if not _path.suffix and "zarr" not in path: raise ValueError( f"entries in the manifest must be paths to files, but this path has no file suffix: {path}" ) diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 89657b5f..1b3c6b1f 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -51,7 +51,7 @@ def maybe_open_loadable_vars_and_indexes( if fpath.upath.suffix == ".zarr": engine = "zarr" - xr_input = fpath.filepath + xr_input = fpath.upath else: engine = None diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index c26eb62e..72a7e241 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -94,7 +94,6 @@ async def build_chunk_manifest( """Build a ChunkManifest with the from_arrays method""" import numpy as np - # import pdb; pdb.set_trace() key_tuples = [(x,) async for x in zarr_array.store.list_prefix(prefix)] filepath_list = [filepath] * len(key_tuples) @@ -281,13 +280,12 @@ async def _open_virtual_dataset( loadable_variables, ) - filepath = validate_and_normalize_path_to_uri( - filepath, fs_root=Path.cwd().as_uri() - ) + # filepath = validate_and_normalize_path_to_uri( + # filepath, fs_root=Path.cwd().as_uri() + # ) # This currently fails for local filepaths (ie. tests) but works for s3: # *** TypeError: Filesystem needs to support async operations. 
# https://github.com/zarr-developers/zarr-python/issues/2554 - if reader_options is None: reader_options = {} diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 622bf211..8f9c50d5 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -4,27 +4,30 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ManifestArray -from virtualizarr.tests import network, requires_zarrV3 +from virtualizarr.tests import requires_network, requires_zarrV3 + +# It seems like this PR: https://github.com/zarr-developers/zarr-python/pull/2533 +# might fix this issue: https://github.com/zarr-developers/zarr-python/issues/2554 @requires_zarrV3 -@network +@requires_network @pytest.mark.parametrize( "zarr_store", [ pytest.param( 2, id="Zarr V2", - marks=pytest.mark.xfail( - reason="https://github.com/zarr-developers/zarr-python/issues/2554" - ), + # marks=pytest.mark.xfail( + # reason="https://github.com/zarr-developers/zarr-python/issues/2554" + # ), ), pytest.param( 3, id="Zarr V3", - marks=pytest.mark.xfail( - reason="https://github.com/zarr-developers/zarr-python/issues/2554" - ), + # marks=pytest.mark.xfail( + # reason="https://github.com/zarr-developers/zarr-python/issues/2554" + # ), ), ], indirect=True, From 855fb5a364505be5ed82b09eb509ace0039b36bb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 18:20:21 +0000 Subject: [PATCH 63/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/readers/zarr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 72a7e241..90b7e557 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -1,14 +1,12 @@ from __future__ import annotations import asyncio -from pathlib import Path from typing import TYPE_CHECKING, Iterable, Mapping, Optional from xarray import Dataset, Index, Variable from zarr.core.common import concurrent_map from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri from virtualizarr.readers.common import ( VirtualBackend, construct_virtual_dataset, From 29434f163213b1e37401e10a035dee2fa82346ee Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 11:30:10 -0700 Subject: [PATCH 64/98] updates zarr v3 req --- virtualizarr/tests/__init__.py | 1 - virtualizarr/tests/test_integration.py | 4 ++-- virtualizarr/tests/test_readers/test_zarr.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index f8a4d66a..b1fbdf11 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -37,7 +37,6 @@ def _importorskip( has_s3fs, requires_s3fs = _importorskip("s3fs") has_scipy, requires_scipy = _importorskip("scipy") has_tifffile, requires_tifffile = _importorskip("tifffile") -has_zarrV3, requires_zarrV3 = _importorskip("zarr", minversion="2.99.0") has_imagecodecs, requires_imagecodecs = _importorskip("imagecodecs") has_hdf5plugin, requires_hdf5plugin = _importorskip("hdf5plugin") has_zarr_python, requires_zarr_python = _importorskip("zarr") diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 62aef376..42cc88cf 100644 --- 
a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -10,7 +10,7 @@ from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.readers import HDF5VirtualBackend from virtualizarr.readers.hdf import HDFVirtualBackend -from virtualizarr.tests import network, requires_kerchunk, requires_zarrV3 +from virtualizarr.tests import network, requires_kerchunk, requires_zarr_python_v3 from virtualizarr.translators.kerchunk import ( dataset_from_kerchunk_refs, ) @@ -87,7 +87,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] -@requires_zarrV3 +@requires_zarr_python_v3 @network @pytest.mark.skip(reason="Kerchunk & zarr-python v3 incompatibility") @pytest.mark.parametrize( diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 8f9c50d5..2afa8ead 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -4,13 +4,13 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ManifestArray -from virtualizarr.tests import requires_network, requires_zarrV3 +from virtualizarr.tests import requires_network, requires_zarr_python_v3 # It seems like this PR: https://github.com/zarr-developers/zarr-python/pull/2533 # might fix this issue: https://github.com/zarr-developers/zarr-python/issues/2554 -@requires_zarrV3 +@requires_zarr_python_v3 @requires_network @pytest.mark.parametrize( "zarr_store", From 5f7040c678688f2e9240e12a0079430f4d58ae5d Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 11:30:59 -0700 Subject: [PATCH 65/98] lint --- virtualizarr/readers/zarr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 90b7e557..72a7e241 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -1,12 +1,14 @@ from __future__ import annotations import asyncio +from pathlib import Path from typing import TYPE_CHECKING, Iterable, Mapping, Optional from xarray import Dataset, Index, Variable from zarr.core.common import concurrent_map from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri from virtualizarr.readers.common import ( VirtualBackend, construct_virtual_dataset, From d3b0a9263e1d1fe9da0413c0c8bf9cac55fad398 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 18:31:07 +0000 Subject: [PATCH 66/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/readers/zarr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 72a7e241..90b7e557 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -1,14 +1,12 @@ from __future__ import annotations import asyncio -from pathlib import Path from typing import TYPE_CHECKING, Iterable, Mapping, Optional from xarray import Dataset, Index, Variable from zarr.core.common import concurrent_map from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri from virtualizarr.readers.common import ( VirtualBackend, construct_virtual_dataset, From 4716114583e57881695e1bd843d6d763aa84cbd0 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: 
Thu, 9 Jan 2025 12:15:42 -0700 Subject: [PATCH 67/98] remove build_chunk_manifest_from_dict_mapping function since manifests are built from np.ndarrays --- virtualizarr/readers/zarr.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 72a7e241..26078075 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -62,32 +62,6 @@ async def _parse_zarr_v3_metadata(zarr_array: zarr.Array) -> ZArray: ) -async def build_chunk_manifest_from_dict_mapping( - store_path: str, chunk_mapping_dict: dict, array_name: str, zarr_format: int -) -> ChunkManifest: - chunk_manifest_dict = {} - # ToDo: We could skip the dict creation and built the Manifest from arrays directly - for key, value in chunk_mapping_dict.items(): - if zarr_format == 2: - # split on array name + trailing slash - chunk_key = key.split(array_name + "/")[-1] - - elif zarr_format == 3: - # In v3 we remove the /c/ 'chunks' part of the key and - # replace trailing slashes with '.' to conform to ChunkManifest validation - chunk_key = ( - key.split(array_name + "/")[-1].split("c/")[-1].replace("/", ".") - ) - - chunk_manifest_dict[chunk_key] = { - "path": store_path + "/" + key, - "offset": 0, - "length": value, - } - - return ChunkManifest(chunk_manifest_dict) - - async def build_chunk_manifest( zarr_array: zarr.AsyncArray, prefix: str, filepath: str ) -> ChunkManifest: From dc6e6f8ff0b67213e8612e78899e453f1d31e94c Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 12:16:29 -0700 Subject: [PATCH 68/98] tmp ignore lint --- virtualizarr/readers/zarr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 3fd883fe..45dfa4e6 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -1,12 +1,14 @@ from __future__ import annotations import asyncio +from pathlib import Path # noqa from typing import TYPE_CHECKING, Iterable, Mapping, Optional from xarray import Dataset, Index, Variable from zarr.core.common import concurrent_map from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri # noqa from virtualizarr.readers.common import ( VirtualBackend, construct_virtual_dataset, From 9db43397fa1101d16b20d8e869bb924d103ba22c Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 13:52:51 -0700 Subject: [PATCH 69/98] remove zarr fill_value skip --- virtualizarr/zarr.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 3d62d002..e339a3f4 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -61,12 +61,8 @@ def __post_init__(self) -> None: # Convert dtype string to numpy.dtype self.dtype = np.dtype(self.dtype) - # Question: Why is this applied only to fill values of None?
- # It was skipping np dtypes such np.float32(nan) and np.int16(0) - # which causes serialization issues when writing to Kerchunk - - # if self.fill_value is None: - self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) + if self.fill_value is None: + self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) @property def codec(self) -> Codec: From 4e0fb99628805663337b92bf37e70a364b39f690 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 14:00:35 -0700 Subject: [PATCH 70/98] fixes network req import in test_integration --- virtualizarr/tests/test_integration.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 42cc88cf..414baf7a 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -10,7 +10,11 @@ from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.readers import HDF5VirtualBackend from virtualizarr.readers.hdf import HDFVirtualBackend -from virtualizarr.tests import network, requires_kerchunk, requires_zarr_python_v3 +from virtualizarr.tests import ( + requires_kerchunk, + requires_network, + requires_zarr_python_v3, +) from virtualizarr.translators.kerchunk import ( dataset_from_kerchunk_refs, ) @@ -88,7 +92,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( @requires_zarr_python_v3 -@network +@requires_network @pytest.mark.skip(reason="Kerchunk & zarr-python v3 incompatibility") @pytest.mark.parametrize( "zarr_store", From 72ae8b06877c75215721c9d2be45cf3043066f64 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 14:09:29 -0700 Subject: [PATCH 71/98] bump xarray to 2025.1.1 and icechunk to 0.1.0a10 in upstream --- ci/environment.yml | 2 +- ci/min-deps.yml | 2 +- ci/upstream.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index b26bb440..4befa1ce 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,7 +7,7 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2024.10.0 + - xarray>=2025.1.1 - kerchunk>=0.2.5 - numpy>=2.0.0 - ujson diff --git a/ci/min-deps.yml b/ci/min-deps.yml index 344a4595..14631e83 100644 --- a/ci/min-deps.yml +++ b/ci/min-deps.yml @@ -6,7 +6,7 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2024.10.0 + - xarray>=2025.1.1 - numpy>=2.0.0 - numcodecs - packaging diff --git a/ci/upstream.yml b/ci/upstream.yml index da7aef36..70e2235a 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -3,7 +3,7 @@ channels: - conda-forge - nodefaults dependencies: - - xarray>=2024.10.0 + - xarray>=2025.1.1 - h5netcdf - h5py - hdf5 @@ -28,6 +28,6 @@ dependencies: - fsspec - pip - pip: - - icechunk>=0.1.0a8 # Installs zarr v3 as dependency + - icechunk>=0.1.0a10 # Installs zarr v3 as dependency # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) - imagecodecs-numcodecs==2024.6.1 From d61e593764ee48ec3ff789e7053e392e8396ed49 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 14:22:13 -0700 Subject: [PATCH 72/98] move zarr import into type checking --- virtualizarr/readers/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 45dfa4e6..ef12f7ba 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, 
Iterable, Mapping, Optional from xarray import Dataset, Index, Variable -from zarr.core.common import concurrent_map from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri # noqa @@ -19,6 +18,7 @@ if TYPE_CHECKING: import zarr + from zarr.core.common import concurrent_map async def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray: From 9edf706d16f48a0683591af5b67c3a7f7174e0b4 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 14:25:26 -0700 Subject: [PATCH 73/98] move zarr import in test_zarr --- virtualizarr/tests/test_readers/test_zarr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 2afa8ead..6588e9e9 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -1,6 +1,5 @@ import numpy as np import pytest -import zarr from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ManifestArray @@ -49,6 +48,8 @@ def test_drop_variables(self, zarr_store, drop_variables=["air"]): assert len(vds.data_vars) == 0 def test_virtual_dataset_zarr_attrs(self, zarr_store): + import zarr + zg = zarr.open_group(zarr_store) vds = open_virtual_dataset(filepath=zarr_store, indexes={}) zg_metadata_dict = zg.metadata.to_dict() From 3e6853795bb836bb6cdb053c393cafd0fcfb2508 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 14:46:49 -0700 Subject: [PATCH 74/98] adding back in missing nbytes property --- virtualizarr/manifests/manifest.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 6d4d783e..e79c3728 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -357,6 +357,17 @@ def shape_chunk_grid(self) -> tuple[int, ...]: def __repr__(self) -> str: return f"ChunkManifest" + @property + def nbytes(self) -> int: + """ + Size required to hold these references in memory in bytes. + + Note this is not the size of the referenced chunks if they were actually loaded into memory, + this is only the size of the pointers to the chunk locations. + If you were to load the data into memory it would be ~1e6x larger for 1MB chunks. 
+ """ + return self._paths.nbytes + self._offsets.nbytes + self._lengths.nbytes + def __getitem__(self, key: ChunkKey) -> ChunkEntry: indices = split(key) path = self._paths[indices] From 594d4a80b2483e9ccbcde5fe99d7f63a7dcc4e14 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 9 Jan 2025 14:51:49 -0700 Subject: [PATCH 75/98] typing --- virtualizarr/readers/zarr.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index ef12f7ba..9992fa99 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -165,6 +165,12 @@ async def virtual_dataset_from_zarr_group( indexes: Mapping[str, Index] | None = None, reader_options: dict = {}, ): + # appease the mypy gods + if virtual_variables is None: + virtual_variables = [] + if drop_variables is None: + drop_variables = [] + virtual_zarr_arrays = await asyncio.gather( *[zarr_group.getitem(var) for var in virtual_variables] ) From 3c6dc54b5c0652656ded4407b551c76556ac669a Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 10 Jan 2025 11:49:59 -0700 Subject: [PATCH 76/98] tmp testing & removing old xfail --- virtualizarr/readers/zarr.py | 5 ++--- virtualizarr/tests/test_readers/test_zarr.py | 6 ------ 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 9992fa99..f79051b1 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -18,7 +18,6 @@ if TYPE_CHECKING: import zarr - from zarr.core.common import concurrent_map async def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray: @@ -67,7 +66,7 @@ async def build_chunk_manifest( ) -> ChunkManifest: """Build a ChunkManifest with the from_arrays method""" import numpy as np - + from zarr.core.common import concurrent_map key_tuples = [(x,) async for x in zarr_array.store.list_prefix(prefix)] filepath_list = [filepath] * len(key_tuples) @@ -103,7 +102,7 @@ def combine_path_chunk(filepath: str, chunk_key: str): async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, prefix: str) -> dict: """Create a chunk map""" - + from zarr.core.common import concurrent_map keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)] sizes = await concurrent_map(keys, zarr_array.store.getsize) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 6588e9e9..cd329b4e 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -17,16 +17,10 @@ pytest.param( 2, id="Zarr V2", - # marks=pytest.mark.xfail( - # reason="https://github.com/zarr-developers/zarr-python/issues/2554" - # ), ), pytest.param( 3, id="Zarr V3", - # marks=pytest.mark.xfail( - # reason="https://github.com/zarr-developers/zarr-python/issues/2554" - # ), ), ], indirect=True, From dd20c8ac2a2f99246d093b68c8d75f0951122d26 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 18:51:18 +0000 Subject: [PATCH 77/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/readers/zarr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index f79051b1..e5c97db8 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -67,6 +67,7 @@ async def build_chunk_manifest( """Build a ChunkManifest with the from_arrays method""" import numpy as 
np from zarr.core.common import concurrent_map + key_tuples = [(x,) async for x in zarr_array.store.list_prefix(prefix)] filepath_list = [filepath] * len(key_tuples) @@ -103,6 +104,7 @@ def combine_path_chunk(filepath: str, chunk_key: str): async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, prefix: str) -> dict: """Create a chunk map""" from zarr.core.common import concurrent_map + keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)] sizes = await concurrent_map(keys, zarr_array.store.getsize) From dac6c7702a813043a97bf8e72187dbfcc53bbf52 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 27 Jan 2025 11:29:20 -0700 Subject: [PATCH 78/98] adds back in validate_and_normalize_path_to_uri after upstream zarr fix & vendors concurrent map from zarr-python --- virtualizarr/readers/zarr.py | 45 +++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index e5c97db8..e9b77d51 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -2,8 +2,9 @@ import asyncio from pathlib import Path # noqa -from typing import TYPE_CHECKING, Iterable, Mapping, Optional - +from typing import TYPE_CHECKING, Iterable, Mapping, Optional, Callable, Awaitable, TypeVar, Any +from itertools import starmap +import numpy as np from xarray import Dataset, Index, Variable from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -19,6 +20,28 @@ if TYPE_CHECKING: import zarr +# Vendored directly from Zarr-python V3's private API +# https://github.com/zarr-developers/zarr-python/blob/458299857141a5470ba3956d8a1607f52ac33857/src/zarr/core/common.py#L53 +T = TypeVar("T", bound=tuple[Any, ...]) +V = TypeVar("V") +async def _concurrent_map( + items: Iterable[T], + func: Callable[..., Awaitable[V]], + limit: int | None = None, +) -> list[V]: + if limit is None: + return await asyncio.gather(*list(starmap(func, items))) + + else: + sem = asyncio.Semaphore(limit) + + async def run(item: tuple[Any]) -> V: + async with sem: + return await func(*item) + + return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items]) + + async def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray: return ZArray( @@ -65,8 +88,7 @@ async def build_chunk_manifest( zarr_array: zarr.AsyncArray, prefix: str, filepath: str ) -> ChunkManifest: """Build a ChunkManifest with the from_arrays method""" - import numpy as np - from zarr.core.common import concurrent_map + key_tuples = [(x,) async for x in zarr_array.store.list_prefix(prefix)] @@ -91,7 +113,7 @@ def combine_path_chunk(filepath: str, chunk_key: str): _offsets = np.array([0] * len(_paths), dtype=np.uint64) # _lengths: np.ndarray[Any, np.dtype[np.uint64]] - lengths = await concurrent_map((key_tuples), zarr_array.store.getsize) + lengths = await _concurrent_map((key_tuples), zarr_array.store.getsize) _lengths = np.array(lengths, dtype=np.uint64) return ChunkManifest.from_arrays( @@ -103,11 +125,10 @@ def combine_path_chunk(filepath: str, chunk_key: str): async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, prefix: str) -> dict: """Create a chunk map""" - from zarr.core.common import concurrent_map keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)] - sizes = await concurrent_map(keys, zarr_array.store.getsize) + sizes = await _concurrent_map(keys, zarr_array.store.getsize) return {key[0]: size for key, size in zip(keys, sizes)} @@ -261,12 +282,10 @@ async def _open_virtual_dataset( 
loadable_variables, ) - # filepath = validate_and_normalize_path_to_uri( - # filepath, fs_root=Path.cwd().as_uri() - # ) - # This currently fails for local filepaths (ie. tests) but works for s3: - # *** TypeError: Filesystem needs to support async operations. - # https://github.com/zarr-developers/zarr-python/issues/2554 + filepath = validate_and_normalize_path_to_uri( + filepath, fs_root=Path.cwd().as_uri() + ) + if reader_options is None: reader_options = {} From 3d230dc772721fb2220b5b8d99cb9a3315f6e6cd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Jan 2025 18:29:28 +0000 Subject: [PATCH 79/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/readers/zarr.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index e9b77d51..f17a6082 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -1,10 +1,20 @@ from __future__ import annotations import asyncio -from pathlib import Path # noqa -from typing import TYPE_CHECKING, Iterable, Mapping, Optional, Callable, Awaitable, TypeVar, Any from itertools import starmap -import numpy as np +from pathlib import Path # noqa +from typing import ( + TYPE_CHECKING, + Any, + Awaitable, + Callable, + Iterable, + Mapping, + Optional, + TypeVar, +) + +import numpy as np from xarray import Dataset, Index, Variable from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -24,6 +34,8 @@ # https://github.com/zarr-developers/zarr-python/blob/458299857141a5470ba3956d8a1607f52ac33857/src/zarr/core/common.py#L53 T = TypeVar("T", bound=tuple[Any, ...]) V = TypeVar("V") + + async def _concurrent_map( items: Iterable[T], func: Callable[..., Awaitable[V]], @@ -39,8 +51,9 @@ async def run(item: tuple[Any]) -> V: async with sem: return await func(*item) - return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items]) - + return await asyncio.gather( + *[asyncio.ensure_future(run(item)) for item in items] + ) async def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray: @@ -89,7 +102,6 @@ async def build_chunk_manifest( ) -> ChunkManifest: """Build a ChunkManifest with the from_arrays method""" - key_tuples = [(x,) async for x in zarr_array.store.list_prefix(prefix)] filepath_list = [filepath] * len(key_tuples) From 62ffe0259315e7abbe9c49e8767c2d4211dfe6dc Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 27 Jan 2025 17:17:41 -0700 Subject: [PATCH 80/98] removing kerchunk from zarr integration test --- ci/upstream.yml | 5 ++--- virtualizarr/readers/zarr.py | 24 +++++++++++++++----- virtualizarr/tests/test_integration.py | 31 ++++++++++++-------------- 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/ci/upstream.yml b/ci/upstream.yml index 3acd0977..26302d0d 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -28,9 +28,8 @@ dependencies: - pooch - fsspec - dask - - zarr>=3.0.0 - pip - pip: - - icechunk>=0.1.0a10 # Installs zarr v3 as dependency - # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) + - git+https://github.com/zarr-developers/zarr-python # installing from source until >= 3.0.2 + - icechunk>=0.1.0a15 - imagecodecs-numcodecs==2024.6.1 diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index e9b77d51..f17a6082 100644 
--- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -1,10 +1,20 @@ from __future__ import annotations import asyncio -from pathlib import Path # noqa -from typing import TYPE_CHECKING, Iterable, Mapping, Optional, Callable, Awaitable, TypeVar, Any from itertools import starmap -import numpy as np +from pathlib import Path # noqa +from typing import ( + TYPE_CHECKING, + Any, + Awaitable, + Callable, + Iterable, + Mapping, + Optional, + TypeVar, +) + +import numpy as np from xarray import Dataset, Index, Variable from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -24,6 +34,8 @@ # https://github.com/zarr-developers/zarr-python/blob/458299857141a5470ba3956d8a1607f52ac33857/src/zarr/core/common.py#L53 T = TypeVar("T", bound=tuple[Any, ...]) V = TypeVar("V") + + async def _concurrent_map( items: Iterable[T], func: Callable[..., Awaitable[V]], @@ -39,8 +51,9 @@ async def run(item: tuple[Any]) -> V: async with sem: return await func(*item) - return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items]) - + return await asyncio.gather( + *[asyncio.ensure_future(run(item)) for item in items] + ) async def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray: @@ -89,7 +102,6 @@ async def build_chunk_manifest( ) -> ChunkManifest: """Build a ChunkManifest with the from_arrays method""" - key_tuples = [(x,) async for x in zarr_array.store.list_prefix(prefix)] filepath_list = [filepath] * len(key_tuples) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 414baf7a..ad654a58 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -93,7 +93,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( @requires_zarr_python_v3 @requires_network -@pytest.mark.skip(reason="Kerchunk & zarr-python v3 incompatibility") +@pytest.mark.skip(reason="WIP on icechunk round-trip/") @pytest.mark.parametrize( "zarr_store", [ @@ -103,30 +103,27 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( indirect=True, ) def test_zarr_roundtrip(zarr_store): + import icechunk + + # open zarr store with Xarray for comparison + comparion_ds = xr.open_zarr(zarr_store) + ds = open_virtual_dataset( zarr_store, indexes={}, ) - ds_refs = ds.virtualize.to_kerchunk(format="dict") - - roundtrip = dataset_from_kerchunk_refs(ds_refs) - - # This won't work right now b/c of the Kerchunk zarr-v3 incompatibility - # roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False) + # Note: this was done with icechunk 0.1.0a15 - syntax could be incorrect + storage = icechunk.storage.in_memory_storage() + repo = icechunk.Repository.open_or_create(storage=storage) + session = repo.writable_session("main") - def add_prefix(file_path: str) -> str: - return "file://" + file_path + # Write the virtual dataset to icechunk + ds.virtualize.to_icechunk(session.store) - for array in ["lat", "lon", "time", "air"]: - # V2: What should the behavior here be? Should the RT dataset have _ARRAY_DIMS? 
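(On the question in the comment above: `_ARRAY_DIMENSIONS` is the xarray convention for recording dimension names in Zarr v2 array attributes, while Zarr v3 carries them natively in the `dimension_names` field of `zarr.json` — which is why the reader reads `attrs["_ARRAY_DIMENSIONS"]` only in its v2 branch. A sketch of where the names live in each format, with illustrative values rather than anything taken from the test store:)

```python
# Illustrative metadata shapes only — hypothetical dimension names.
v2_zattrs = {"_ARRAY_DIMENSIONS": ["time", "lat", "lon"]}  # xarray convention, stored in .zattrs
v3_zarr_json = {"zarr_format": 3, "dimension_names": ["time", "lat", "lon"]}  # first-class v3 field
```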
- ds[array].attrs.pop("_ARRAY_DIMENSIONS", None) + rtds = xr.open_zarr(session.store) - # temp workaround b/c of the zarr-python-v3 filepath issue: https://github.com/zarr-developers/zarr-python/issues/2554 - roundtrip[array].data = roundtrip[array].data.rename_paths(add_prefix) - - # Assert equal to original dataset - ManifestArrays - xrt.assert_equal(roundtrip, ds) + xrt.assert_equal(comparion_ds, rtds) @requires_kerchunk From 231b7034f5f05274a59518307b00c177642e6181 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 27 Jan 2025 17:32:16 -0700 Subject: [PATCH 81/98] removed zarr manifest + lint --- virtualizarr/accessor.py | 16 ---------------- virtualizarr/readers/zarr.py | 4 ---- 2 files changed, 20 deletions(-) diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py index f0dfb966..597a9f36 100644 --- a/virtualizarr/accessor.py +++ b/virtualizarr/accessor.py @@ -7,7 +7,6 @@ from virtualizarr.manifests import ManifestArray from virtualizarr.types.kerchunk import KerchunkStoreRefs from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs -from virtualizarr.writers.zarr import dataset_to_zarr if TYPE_CHECKING: from icechunk import IcechunkStore # type: ignore[import-not-found] @@ -24,21 +23,6 @@ class VirtualiZarrDatasetAccessor: def __init__(self, ds: Dataset): self.ds: Dataset = ds - def to_zarr(self, storepath: str) -> None: - """ - Serialize all virtualized arrays in this xarray dataset as a Zarr store. - - Currently requires all variables to be backed by ManifestArray objects. - - Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. - See https://github.com/zarr-developers/zarr-specs/issues/287 - - Parameters - ---------- - storepath : str - """ - dataset_to_zarr(self.ds, storepath) - def to_icechunk( self, store: "IcechunkStore", diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index f17a6082..26e2b404 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -106,8 +106,6 @@ async def build_chunk_manifest( filepath_list = [filepath] * len(key_tuples) - # can this be lambda'ed? - # stolen from manifest.py def combine_path_chunk(filepath: str, chunk_key: str): return filepath + "/" + chunk_key @@ -115,7 +113,6 @@ def combine_path_chunk(filepath: str, chunk_key: str): combine_path_chunk, otypes=[np.dtypes.StringDType()] ) - # _paths: np.ndarray[Any, np.dtypes.StringDType] # turn the tuples of chunks to a flattened list with :list(sum(key_tuples, ())) _paths = vectorized_chunk_path_combine_func( filepath_list, list(sum(key_tuples, ())) @@ -330,5 +327,4 @@ async def _open_virtual_dataset( reader_options=reader_options, ) - # How does this asyncio.run call interact with zarr-pythons async event loop? 
return asyncio.run(_open_virtual_dataset()) From 0d4d653e1ba214ff827535dcce5e554aa4ebd393 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 28 Jan 2025 18:21:25 -0700 Subject: [PATCH 82/98] wip on testing --- virtualizarr/readers/zarr.py | 26 +++++++++++++++++----- virtualizarr/tests/test_integration.py | 30 ++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 26e2b404..616fc3c2 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -25,7 +25,7 @@ maybe_open_loadable_vars_and_indexes, ) from virtualizarr.utils import check_for_collisions -from virtualizarr.zarr import ZArray +from virtualizarr.zarr import ZARR_DEFAULT_FILL_VALUE, ZArray if TYPE_CHECKING: import zarr @@ -57,14 +57,29 @@ async def run(item: tuple[Any]) -> V: async def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray: + from virtualizarr.codecs import get_codecs + + codecs = get_codecs(zarr_array) + if codecs.filters == []: + filters = None + + fill_value = zarr_array.metadata.fill_value + + if fill_value is not None: + fill_value = ZARR_DEFAULT_FILL_VALUE[zarr_array.metadata.fill_value.dtype.kind] + + # Zarr-python gives us the compressor object. Ex: Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + # This won't serialize into a Kerchunk ref, so we use the .get_config to retrieve a dict. + compressor_dict = codecs.compressor.get_config() + return ZArray( shape=zarr_array.metadata.shape, chunks=zarr_array.metadata.chunks, # type: ignore[union-attr] dtype=zarr_array.metadata.dtype, - fill_value=zarr_array.metadata.fill_value, # type: ignore[arg-type] + fill_value=fill_value, # type: ignore[arg-type] order="C", - compressor=zarr_array.metadata.compressor, # type: ignore[union-attr] - filters=zarr_array.metadata.filters, # type: ignore + compressor=compressor_dict, # type: ignore[union-attr] + filters=filters, # type: ignore zarr_format=zarr_array.metadata.zarr_format, ) @@ -77,7 +92,7 @@ async def _parse_zarr_v3_metadata(zarr_array: zarr.Array) -> ZArray: "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" ) else: - fill_value = zarr_array.metadata.fill_value + fill_value = ZARR_DEFAULT_FILL_VALUE[zarr_array.metadata.fill_value.dtype.kind] codecs = get_codecs(zarr_array) @@ -146,6 +161,7 @@ async def build_zarray_metadata(zarr_array: zarr.AsyncArray): if zarr_array.metadata.zarr_format == 2: array_zarray = await _parse_zarr_v2_metadata(zarr_array=zarr_array) # type: ignore[arg-type] + array_dims = attrs["_ARRAY_DIMENSIONS"] elif zarr_array.metadata.zarr_format == 3: diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index ad654a58..766cd4e8 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -91,6 +91,33 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] + +@requires_zarr_python_v3 +@requires_network +@requires_kerchunk +@pytest.mark.skip(reason="WIP on kerchunk v3 RT- fails in RT: RuntimeError: 'error during blosc decompression: -1' for Zarr v3 + kerchunk v3 branch") +@pytest.mark.parametrize( + "zarr_store", + [ + pytest.param(2, id="Zarr V2"), + pytest.param(3, id="Zarr V3"), + ], + indirect=True, +) +def test_zarr_roundtrip_kerchunk(zarr_store): + + comparion_ds = xr.open_zarr(zarr_store) + + ds = open_virtual_dataset( + zarr_store, + indexes={}, + ) + ds_refs = 
ds.virtualize.to_kerchunk(format='dict') + roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False) + + xrt.assert_equal(comparion_ds, roundtrip) + + @requires_zarr_python_v3 @requires_network @pytest.mark.skip(reason="WIP on icechunk round-trip/") @@ -102,7 +129,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( ], indirect=True, ) -def test_zarr_roundtrip(zarr_store): +def test_zarr_roundtrip_icechunk(zarr_store): import icechunk # open zarr store with Xarray for comparison @@ -122,7 +149,6 @@ def test_zarr_roundtrip(zarr_store): ds.virtualize.to_icechunk(session.store) rtds = xr.open_zarr(session.store) - xrt.assert_equal(comparion_ds, rtds) From 7724969121ce8f145f5cea9e30d7c077b4f65cee Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 Jan 2025 01:21:39 +0000 Subject: [PATCH 83/98] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/tests/test_integration.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 766cd4e8..57b85ff8 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -91,11 +91,12 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] - @requires_zarr_python_v3 @requires_network @requires_kerchunk -@pytest.mark.skip(reason="WIP on kerchunk v3 RT- fails in RT: RuntimeError: 'error during blosc decompression: -1' for Zarr v3 + kerchunk v3 branch") +@pytest.mark.skip( + reason="WIP on kerchunk v3 RT- fails in RT: RuntimeError: 'error during blosc decompression: -1' for Zarr v3 + kerchunk v3 branch" +) @pytest.mark.parametrize( "zarr_store", [ @@ -105,14 +106,13 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( indirect=True, ) def test_zarr_roundtrip_kerchunk(zarr_store): - comparion_ds = xr.open_zarr(zarr_store) ds = open_virtual_dataset( zarr_store, indexes={}, ) - ds_refs = ds.virtualize.to_kerchunk(format='dict') + ds_refs = ds.virtualize.to_kerchunk(format="dict") roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False) xrt.assert_equal(comparion_ds, roundtrip) From a8c498a78bb18a5435d082b8af64af1a83d0c234 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 28 Jan 2025 18:38:04 -0700 Subject: [PATCH 84/98] revert min-deps change --- ci/min-deps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/min-deps.yml b/ci/min-deps.yml index 14631e83..344a4595 100644 --- a/ci/min-deps.yml +++ b/ci/min-deps.yml @@ -6,7 +6,7 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2025.1.1 + - xarray>=2024.10.0 - numpy>=2.0.0 - numcodecs - packaging From 9684dd3a2ec98e7436871bac670cfbb3a6e3c837 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 28 Jan 2025 18:38:32 -0700 Subject: [PATCH 85/98] merge --- virtualizarr/tests/test_integration.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 766cd4e8..57b85ff8 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -91,11 +91,12 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] - @requires_zarr_python_v3 @requires_network @requires_kerchunk -@pytest.mark.skip(reason="WIP on kerchunk v3 RT- fails in RT: RuntimeError: 
'error during blosc decompression: -1' for Zarr v3 + kerchunk v3 branch") +@pytest.mark.skip( + reason="WIP on kerchunk v3 RT- fails in RT: RuntimeError: 'error during blosc decompression: -1' for Zarr v3 + kerchunk v3 branch" +) @pytest.mark.parametrize( "zarr_store", [ @@ -105,14 +106,13 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( indirect=True, ) def test_zarr_roundtrip_kerchunk(zarr_store): - comparion_ds = xr.open_zarr(zarr_store) ds = open_virtual_dataset( zarr_store, indexes={}, ) - ds_refs = ds.virtualize.to_kerchunk(format='dict') + ds_refs = ds.virtualize.to_kerchunk(format="dict") roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False) xrt.assert_equal(comparion_ds, roundtrip) From e310f040089a8cf05191cee982b00f06c9ccedd6 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 28 Jan 2025 18:39:18 -0700 Subject: [PATCH 86/98] revert environment.yaml --- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index 4befa1ce..b26bb440 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,7 +7,7 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2025.1.1 + - xarray>=2024.10.0 - kerchunk>=0.2.5 - numpy>=2.0.0 - ujson From 800a1655996cca341204f1a862c385eace1082c5 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 28 Jan 2025 18:43:36 -0700 Subject: [PATCH 87/98] removed zarr manifest writing --- docs/api.rst | 1 - docs/usage.md | 25 ------- virtualizarr/manifests/manifest.py | 16 ---- virtualizarr/writers/zarr.py | 114 ----------------------------- 4 files changed, 156 deletions(-) delete mode 100644 virtualizarr/writers/zarr.py diff --git a/docs/api.rst b/docs/api.rst index 60716898..951b53f7 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -29,7 +29,6 @@ Serialization :toctree: generated/ VirtualiZarrDatasetAccessor.to_kerchunk - VirtualiZarrDatasetAccessor.to_zarr VirtualiZarrDatasetAccessor.to_icechunk Information diff --git a/docs/usage.md b/docs/usage.md index 3f8ff84c..bd41354e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -435,31 +435,6 @@ combined_vds.virtualize.to_icechunk(session.store) See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) for more details. -### Writing as Zarr - -Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`vds.virtualize.to_zarr ` accessor method. - -```python -combined_vds.virtualize.to_zarr('combined.zarr') -``` - -The result is a zarr v3 store on disk which contains the chunk manifest information written out as `manifest.json` files, so the store looks like this: - -``` -combined/zarr.json <- group metadata -combined/air/zarr.json <- array metadata -combined/air/manifest.json <- array manifest -... -``` - -The advantage of this format is that any zarr v3 reader that understands the chunk manifest ZEP could read from this store, no matter what language it is written in (e.g. via `zarr-python`, `zarr-js`, or rust). This reading would also not require `fsspec`. - -```{note} -Currently there are not yet any zarr v3 readers which understand the chunk manifest ZEP, so until then this feature cannot be used for data processing. - -This store can however be read by {py:func}`~virtualizarr.open_virtual_dataset`, by passing `filetype="zarr_v3"`. 
-``` - ## Opening Kerchunk references as virtual datasets You can open existing Kerchunk `json` or `parquet` references as Virtualizarr virtual datasets. This may be useful for converting existing Kerchunk formatted references to storage formats like [Icechunk](https://icechunk.io/). diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index e79c3728..01ac3577 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,4 +1,3 @@ -import json import re from collections.abc import ItemsView, Iterable, Iterator, KeysView, ValuesView from pathlib import PosixPath @@ -436,21 +435,6 @@ def __eq__(self, other: Any) -> bool: lengths_equal = (self._lengths == other._lengths).all() return paths_equal and offsets_equal and lengths_equal - @classmethod - def from_zarr_json(cls, filepath: str) -> "ChunkManifest": - """Create a ChunkManifest from a Zarr manifest.json file.""" - - with open(filepath, "r") as manifest_file: - entries = json.load(manifest_file) - - return cls(entries=entries) - - def to_zarr_json(self, filepath: str) -> None: - """Write the manifest to a Zarr manifest.json file.""" - entries = self.dict() - with open(filepath, "w") as json_file: - json.dump(entries, json_file, indent=4, separators=(", ", ": ")) - def rename_paths( self, new: str | Callable[[str], str], diff --git a/virtualizarr/writers/zarr.py b/virtualizarr/writers/zarr.py deleted file mode 100644 index b9529ad5..00000000 --- a/virtualizarr/writers/zarr.py +++ /dev/null @@ -1,114 +0,0 @@ -from pathlib import Path - -import numpy as np -from xarray import Dataset -from xarray.core.variable import Variable - -from virtualizarr.vendor.zarr.utils import json_dumps -from virtualizarr.zarr import ZArray - - -def dataset_to_zarr(ds: Dataset, storepath: str) -> None: - """ - Write an xarray dataset whose variables wrap ManifestArrays to a v3 Zarr store, writing chunk references into manifest.json files. - - Currently requires all variables to be backed by ManifestArray objects. - - Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. - See https://github.com/zarr-developers/zarr-specs/issues/287 - - Parameters - ---------- - ds: xr.Dataset - storepath: str - """ - - from virtualizarr.manifests import ManifestArray - - _storepath = Path(storepath) - Path.mkdir(_storepath, exist_ok=False) - - # should techically loop over groups in a tree but a dataset corresponds to only one group - group_metadata = {"zarr_format": 3, "node_type": "group", "attributes": ds.attrs} - with open(_storepath / "zarr.json", "wb") as group_metadata_file: - group_metadata_file.write(json_dumps(group_metadata)) - - for name, var in ds.variables.items(): - array_dir = _storepath / str(name) - marr = var.data - - # TODO move this check outside the writing loop so we don't write an incomplete store on failure? - # TODO at some point this should be generalized to also write in-memory arrays as normal zarr chunks, see GH isse #62. 
- if not isinstance(marr, ManifestArray): - raise TypeError( - "Only xarray objects wrapping ManifestArrays can be written to zarr using this method, " - f"but variable {name} wraps an array of type {type(marr)}" - ) - - Path.mkdir(array_dir, exist_ok=False) - - # write the chunk references into a manifest.json file - # and the array metadata into a zarr.json file - to_zarr_json(var, array_dir) - - -def to_zarr_json(var: Variable, array_dir: Path) -> None: - """ - Write out both the zarr.json and manifest.json file into the given zarr array directory. - - Follows the Zarr v3 manifest storage transformer ZEP (see https://github.com/zarr-developers/zarr-specs/issues/287). - - Parameters - ---------- - var : xr.Variable - Must be wrapping a ManifestArray - dirpath : str - Zarr store array directory into which to write files. - """ - - marr = var.data - - marr.manifest.to_zarr_json(array_dir / "manifest.json") - - metadata = zarr_v3_array_metadata( - marr.zarray, [str(x) for x in var.dims], var.attrs - ) - with open(array_dir / "zarr.json", "wb") as metadata_file: - metadata_file.write(json_dumps(metadata)) - - -def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) -> dict: - """Construct a v3-compliant metadata dict from v2 zarray + information stored on the xarray variable.""" - # TODO it would be nice if we could use the zarr-python metadata.ArrayMetadata classes to do this conversion for us - metadata = zarray.dict() - - # adjust to match v3 spec - metadata["zarr_format"] = 3 - metadata["node_type"] = "array" - metadata["data_type"] = str(np.dtype(metadata.pop("dtype"))) - metadata["chunk_grid"] = { - "name": "regular", - "configuration": {"chunk_shape": metadata.pop("chunks")}, - } - metadata["chunk_key_encoding"] = { - "name": "default", - "configuration": {"separator": "/"}, - } - metadata["codecs"] = tuple(c.to_dict() for c in zarray._v3_codec_pipeline()) - metadata.pop("filters") - metadata.pop("compressor") - metadata.pop("order") - - # indicate that we're using the manifest storage transformer ZEP - metadata["storage_transformers"] = [ - { - "name": "chunk-manifest-json", - "configuration": {"manifest": "./manifest.json"}, - } - ] - - # add information from xarray object - metadata["dimension_names"] = dim_names - metadata["attributes"] = attrs - - return metadata From ac538c786f3d970c67ff62b7d8c7dc4306ca1b6b Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 28 Jan 2025 19:21:22 -0700 Subject: [PATCH 88/98] cleanup and consolidation in zarr reader --- virtualizarr/readers/zarr.py | 113 +++++++++---------- virtualizarr/tests/test_readers/test_zarr.py | 2 - 2 files changed, 53 insertions(+), 62 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 616fc3c2..45d59603 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -25,7 +25,7 @@ maybe_open_loadable_vars_and_indexes, ) from virtualizarr.utils import check_for_collisions -from virtualizarr.zarr import ZARR_DEFAULT_FILL_VALUE, ZArray +from virtualizarr.zarr import ZARR_DEFAULT_FILL_VALUE, Codec, ZArray if TYPE_CHECKING: import zarr @@ -56,62 +56,6 @@ async def run(item: tuple[Any]) -> V: ) -async def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray: - from virtualizarr.codecs import get_codecs - - codecs = get_codecs(zarr_array) - if codecs.filters == []: - filters = None - - fill_value = zarr_array.metadata.fill_value - - if fill_value is not None: - fill_value = ZARR_DEFAULT_FILL_VALUE[zarr_array.metadata.fill_value.dtype.kind] - 
- # Zarr-python gives us the compressor object. Ex: Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - # This won't serialize into a Kerchunk ref, so we use the .get_config to retrieve a dict. - compressor_dict = codecs.compressor.get_config() - - return ZArray( - shape=zarr_array.metadata.shape, - chunks=zarr_array.metadata.chunks, # type: ignore[union-attr] - dtype=zarr_array.metadata.dtype, - fill_value=fill_value, # type: ignore[arg-type] - order="C", - compressor=compressor_dict, # type: ignore[union-attr] - filters=filters, # type: ignore - zarr_format=zarr_array.metadata.zarr_format, - ) - - -async def _parse_zarr_v3_metadata(zarr_array: zarr.Array) -> ZArray: - from virtualizarr.codecs import get_codecs - - if zarr_array.metadata.fill_value is None: - raise ValueError( - "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" - ) - else: - fill_value = ZARR_DEFAULT_FILL_VALUE[zarr_array.metadata.fill_value.dtype.kind] - - codecs = get_codecs(zarr_array) - - # Question: How should we parse the values from get_codecs? - compressor = getattr(codecs[0], "compressor", None) # type: ignore - filters = getattr(codecs[0], "filters", None) # type: ignore - - return ZArray( - chunks=zarr_array.metadata.chunk_grid.chunk_shape, # type: ignore[attr-defined] - compressor=compressor, - dtype=zarr_array.metadata.data_type.name, # type: ignore - fill_value=fill_value, # type: ignore[arg-type] - filters=filters, - order="C", - shape=zarr_array.metadata.shape, - zarr_format=zarr_array.metadata.zarr_format, - ) - - async def build_chunk_manifest( zarr_array: zarr.AsyncArray, prefix: str, filepath: str ) -> ChunkManifest: @@ -157,15 +101,64 @@ async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, prefix: str) -> async def build_zarray_metadata(zarr_array: zarr.AsyncArray): + from virtualizarr.codecs import get_codecs + + codecs = get_codecs(zarr_array) + + # Mypy does not like the Tuple | Codec type from get_codecs + if isinstance(codecs, Codec): + filters = codecs.filters + compressor = codecs.compressor + + elif isinstance(codecs, tuple): + # is this parsing right for v3? + compressor = getattr(codecs[0], "compressor", None) # type: ignore + filters = getattr(codecs[0], "filters", None) # type: ignore + + else: + raise ValueError("codecs should be of dtype tuple or Codec") + attrs = zarr_array.metadata.attributes - if zarr_array.metadata.zarr_format == 2: - array_zarray = await _parse_zarr_v2_metadata(zarr_array=zarr_array) # type: ignore[arg-type] + fill_value = zarr_array.metadata.fill_value + if fill_value is not None: + fill_value = ZARR_DEFAULT_FILL_VALUE[zarr_array.metadata.fill_value.dtype.kind] + if zarr_array.metadata.zarr_format == 2: + if codecs.filters == []: + filters = None + # Zarr-python gives us the compressor object. Ex: Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + # This won't serialize into a Kerchunk ref, so we can use the .get_config to retrieve a dict. 
+ # compressor_dict = codecs.compressor.get_config() + + array_zarray = ZArray( + shape=zarr_array.metadata.shape, + chunks=zarr_array.metadata.chunks, # type: ignore[union-attr] + dtype=zarr_array.metadata.dtype, + fill_value=fill_value, # type: ignore[arg-type] + order="C", + compressor=codecs.compressor, # type: ignore[union-attr] + filters=filters, # type: ignore + zarr_format=zarr_array.metadata.zarr_format, + ) array_dims = attrs["_ARRAY_DIMENSIONS"] elif zarr_array.metadata.zarr_format == 3: - array_zarray = await _parse_zarr_v3_metadata(zarr_array=zarr_array) # type: ignore[arg-type] + if zarr_array.metadata.fill_value is None: + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) + + array_zarray = ZArray( + chunks=zarr_array.metadata.chunk_grid.chunk_shape, # type: ignore[attr-defined] + compressor=compressor, + dtype=zarr_array.metadata.data_type.name, # type: ignore + fill_value=fill_value, # type: ignore[arg-type] + filters=filters, + order="C", + shape=zarr_array.metadata.shape, + zarr_format=zarr_array.metadata.zarr_format, + ) array_dims = zarr_array.metadata.dimension_names # type: ignore[union-attr] else: diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index cd329b4e..c0de1d0e 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -110,8 +110,6 @@ def _validate_v3(attrs: list[str]): ) # order: In zarr v3, it seems like order was replaced with the transpose codec. - # compressor: removed in v3 and built into codecs - # filters: removed in v3 and built into codecs if zarr_format == 2: _validate_v2(shared_v2_v3_attrs + v2_attrs) From 3be76ffc0c8b9de765e68a03df84405d56618683 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 28 Jan 2025 19:24:57 -0700 Subject: [PATCH 89/98] typing --- virtualizarr/readers/zarr.py | 2 +- virtualizarr/tests/test_integration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 45d59603..2b554d47 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -125,7 +125,7 @@ async def build_zarray_metadata(zarr_array: zarr.AsyncArray): fill_value = ZARR_DEFAULT_FILL_VALUE[zarr_array.metadata.fill_value.dtype.kind] if zarr_array.metadata.zarr_format == 2: - if codecs.filters == []: + if filters == []: filters = None # Zarr-python gives us the compressor object. Ex: Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) # This won't serialize into a Kerchunk ref, so we can use the .get_config to retrieve a dict. 
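The `.get_config()` detail in the comment above can be reproduced with numcodecs alone. A minimal sketch, not part of this patch series; the exact keys in the config dict depend on the numcodecs version:

```python
import json

import numcodecs

# The kind of compressor object zarr-python hands back for a v2 array:
compressor = numcodecs.Blosc(cname="lz4", clevel=5, shuffle=numcodecs.Blosc.SHUFFLE)

# The codec object itself is not JSON-serializable, so it cannot be written
# straight into a Kerchunk reference:
try:
    json.dumps(compressor)
except TypeError as err:
    print(f"not serializable: {err}")

# .get_config() returns a plain dict that serializes cleanly, e.g.
# {'id': 'blosc', 'cname': 'lz4', 'clevel': 5, 'shuffle': 1, 'blocksize': 0}
print(json.dumps(compressor.get_config()))
```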
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 57b85ff8..ce8a055a 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -130,7 +130,7 @@ def test_zarr_roundtrip_kerchunk(zarr_store):
     indirect=True,
 )
 def test_zarr_roundtrip_icechunk(zarr_store):
-    import icechunk
+    import icechunk  # type: ignore[import-not-found]
 
     # open zarr store with Xarray for comparison
     comparion_ds = xr.open_zarr(zarr_store)

From 1e60835f8b5c8a22d84115be07a9d0d809cb243a Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Wed, 29 Jan 2025 11:31:48 -0700
Subject: [PATCH 90/98] test_unsupported_zarr_python to zarr v3

---
 virtualizarr/codecs.py            | 9 +++++----
 virtualizarr/tests/test_codecs.py | 3 +--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py
index b544b82d..c7c17cde 100644
--- a/virtualizarr/codecs.py
+++ b/virtualizarr/codecs.py
@@ -3,7 +3,7 @@
 from virtualizarr.zarr import Codec
 
 if TYPE_CHECKING:
-    from zarr import Array  # type: ignore
+    from zarr import Array, AsyncArray  # type: ignore
    from zarr.core.abc.codec import (  # type: ignore
        ArrayArrayCodec,
        ArrayBytesCodec,
@@ -14,14 +14,14 @@
 
 
 def get_codecs(
-    array: Union["ManifestArray", "Array"],
+    array: Union["ManifestArray", "Array", "AsyncArray"],
     normalize_to_zarr_v3: bool = False,
 ) -> Union[Codec, tuple["ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec", ...]]:
     """
-    Get the codecs for either a ManifestArray or a Zarr Array.
+    Get the codecs for either a ManifestArray, a Zarr Array, or an async Zarr Array.
 
     Parameters:
-        array (Union[ManifestArray, ZarrArray]): The input array, either ManifestArray or Zarr Array.
+        array (Union[ManifestArray, Array, AsyncArray]): The input array: a ManifestArray, a Zarr Array, or a Zarr AsyncArray.
 
     Returns:
         List[Optional[Codec]]: A list of codecs or an empty list if no codecs are found.
@@ -30,6 +30,7 @@ def get_codecs(
     ImportError: If `zarr` is required but not installed.
     ValueError: If the array type is unsupported.
     """
+
     if _is_manifest_array(array):
         return _get_manifestarray_codecs(array, normalize_to_zarr_v3)  # type: ignore[arg-type]
 
diff --git a/virtualizarr/tests/test_codecs.py b/virtualizarr/tests/test_codecs.py
index 41d16b5b..47025605 100644
--- a/virtualizarr/tests/test_codecs.py
+++ b/virtualizarr/tests/test_codecs.py
@@ -7,7 +7,6 @@
 from virtualizarr import ChunkManifest, ManifestArray
 from virtualizarr.codecs import get_codecs
 from virtualizarr.tests import (
-    requires_zarr_python,
     requires_zarr_python_v3,
 )
 from virtualizarr.zarr import Codec
@@ -147,7 +146,7 @@ def test_zarr_v3(self):
         expected_codecs = tuple([BytesCodec(endian="little")])
         assert actual_codecs == expected_codecs
 
-    @requires_zarr_python
+    @requires_zarr_python_v3
     def test_unsupported_zarr_python(self):
         zarr_array = self.create_zarr_array()
         unsupported_zarr_version = "2.18.3"

From 5d916793441b3f2c98b9f0d11053bb6b74549270 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Wed, 29 Jan 2025 11:38:14 -0700
Subject: [PATCH 91/98] rel path issue?
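A usage sketch of the `get_codecs` signature widened in PATCH 90 above. It is illustrative only: the in-memory array and its defaults are assumptions, and the exact codec tuple returned varies with the zarr-python version.

```python
import zarr

from virtualizarr.codecs import get_codecs

# Throwaway in-memory array; under zarr-python v3 this defaults to a v3 array.
arr = zarr.create(shape=(4, 4), chunks=(2, 2), dtype="i4")

# For a zarr v3 array this returns the codec pipeline, e.g.
# (BytesCodec(endian='little'),) plus any compressors; a ManifestArray input
# returns a virtualizarr Codec instead, and AsyncArray inputs are now accepted too.
print(get_codecs(arr))
```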
--- virtualizarr/tests/test_integration.py | 4 +--- virtualizarr/writers/icechunk.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index ce8a055a..cd910c95 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -1,4 +1,3 @@ -from os.path import relpath from pathlib import Path import numpy as np @@ -371,8 +370,7 @@ def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend): @requires_kerchunk @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend]) def test_convert_relative_paths_to_uris(self, netcdf4_file, hdf_backend): - relative_path = relpath(netcdf4_file) - vds = open_virtual_dataset(relative_path, indexes={}, backend=hdf_backend) + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) expected_path = Path(netcdf4_file).as_uri() diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 6ed535e5..f0edfac9 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -108,7 +108,6 @@ def dataset_to_icechunk( group_object.update_attributes( {k: encode_zarr_attr_value(v) for k, v in ds.attrs.items()} ) - return write_variables_to_icechunk_group( ds.variables, ds.attrs, @@ -142,6 +141,7 @@ def write_variables_to_icechunk_group( # will overwrite the root group's attributes with the dataset's attributes. We take advantage # of xarrays zarr integration to ignore having to format the attributes ourselves. ds = Dataset(loadable_variables, attrs=attrs) + ds.to_zarr( store, group=group.name, @@ -243,16 +243,28 @@ def write_virtual_variable_to_icechunk( else: append_axis = None # create array if it doesn't already exist - + # import ipdb; ipdb.set_trace() + # https://github.com/zarr-developers/zarr-python/blob/458299857141a5470ba3956d8a1607f52ac33857/src/zarr/core/group.py#L1004 arr = group.require_array( name=name, shape=zarray.shape, - chunk_shape=zarray.chunks, + chunks=zarray.chunks, dtype=encode_dtype(zarray.dtype), - codecs=zarray._v3_codec_pipeline(), + # filters=zarray._v3_codec_pipeline()[0], dimension_names=var.dims, fill_value=zarray.fill_value, ) + # import ipdb; ipdb.set_trace() + + # arr = group.require_array( + # name=name, + # shape=zarray.shape, + # chunks=zarray.chunks, + # dtype=encode_dtype(zarray.dtype), + # codecs=zarray._v3_codec_pipeline(), + # dimension_names=var.dims, + # fill_value=zarray.fill_value, + # ) arr.update_attributes( {k: encode_zarr_attr_value(v) for k, v in var.attrs.items()} From 5084adf6a40a3d5b1cb5e9ae206424ac4003ff14 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 29 Jan 2025 16:36:14 -0700 Subject: [PATCH 92/98] revert accidental icechunk commit --- ci/upstream.yml | 2 +- virtualizarr/writers/icechunk.py | 20 ++++---------------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/ci/upstream.yml b/ci/upstream.yml index 5d5ea9c2..ff9cc58b 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -30,6 +30,6 @@ dependencies: - dask - pip - pip: - - git+https://github.com/zarr-developers/zarr-python # installing from source until >= 3.0.2 - icechunk>=0.1.0a15 + - git+https://github.com/zarr-developers/zarr-python # installing from source until >= 3.0.2 - imagecodecs-numcodecs==2024.6.1 diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index f0edfac9..6ed535e5 100644 --- a/virtualizarr/writers/icechunk.py +++ 
b/virtualizarr/writers/icechunk.py @@ -108,6 +108,7 @@ def dataset_to_icechunk( group_object.update_attributes( {k: encode_zarr_attr_value(v) for k, v in ds.attrs.items()} ) + return write_variables_to_icechunk_group( ds.variables, ds.attrs, @@ -141,7 +142,6 @@ def write_variables_to_icechunk_group( # will overwrite the root group's attributes with the dataset's attributes. We take advantage # of xarrays zarr integration to ignore having to format the attributes ourselves. ds = Dataset(loadable_variables, attrs=attrs) - ds.to_zarr( store, group=group.name, @@ -243,28 +243,16 @@ def write_virtual_variable_to_icechunk( else: append_axis = None # create array if it doesn't already exist - # import ipdb; ipdb.set_trace() - # https://github.com/zarr-developers/zarr-python/blob/458299857141a5470ba3956d8a1607f52ac33857/src/zarr/core/group.py#L1004 + arr = group.require_array( name=name, shape=zarray.shape, - chunks=zarray.chunks, + chunk_shape=zarray.chunks, dtype=encode_dtype(zarray.dtype), - # filters=zarray._v3_codec_pipeline()[0], + codecs=zarray._v3_codec_pipeline(), dimension_names=var.dims, fill_value=zarray.fill_value, ) - # import ipdb; ipdb.set_trace() - - # arr = group.require_array( - # name=name, - # shape=zarray.shape, - # chunks=zarray.chunks, - # dtype=encode_dtype(zarray.dtype), - # codecs=zarray._v3_codec_pipeline(), - # dimension_names=var.dims, - # fill_value=zarray.fill_value, - # ) arr.update_attributes( {k: encode_zarr_attr_value(v) for k, v in var.attrs.items()} From e7a36d736d5529233e37805a6d8e007a16bc1e5d Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 3 Feb 2025 17:09:48 -0700 Subject: [PATCH 93/98] wip on fixing codecs --- ci/upstream.yml | 3 +- virtualizarr/readers/zarr.py | 81 ++++++++------------ virtualizarr/tests/test_integration.py | 14 +++- virtualizarr/tests/test_readers/test_zarr.py | 3 + 4 files changed, 50 insertions(+), 51 deletions(-) diff --git a/ci/upstream.yml b/ci/upstream.yml index 7a41acd6..9a446d8c 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -28,9 +28,10 @@ dependencies: - pooch - fsspec - dask + - zarr>=3.0.2 - pip - pip: - icechunk>=0.1.0a15 - - git+https://github.com/zarr-developers/zarr-python # installing from source until >= 3.0.2 + # - git+https://github.com/zarr-developers/zarr-python # installing from source until >= 3.0.2 - imagecodecs-numcodecs==2024.6.1 - git+https://github.com/fsspec/kerchunk.git@main diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 2b554d47..524f4757 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -25,7 +25,7 @@ maybe_open_loadable_vars_and_indexes, ) from virtualizarr.utils import check_for_collisions -from virtualizarr.zarr import ZARR_DEFAULT_FILL_VALUE, Codec, ZArray +from virtualizarr.zarr import ZARR_DEFAULT_FILL_VALUE, ZArray if TYPE_CHECKING: import zarr @@ -101,69 +101,48 @@ async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, prefix: str) -> async def build_zarray_metadata(zarr_array: zarr.AsyncArray): - from virtualizarr.codecs import get_codecs - - codecs = get_codecs(zarr_array) - - # Mypy does not like the Tuple | Codec type from get_codecs - if isinstance(codecs, Codec): - filters = codecs.filters - compressor = codecs.compressor - - elif isinstance(codecs, tuple): - # is this parsing right for v3? 
- compressor = getattr(codecs[0], "compressor", None) # type: ignore - filters = getattr(codecs[0], "filters", None) # type: ignore - - else: - raise ValueError("codecs should be of dtype tuple or Codec") - attrs = zarr_array.metadata.attributes fill_value = zarr_array.metadata.fill_value if fill_value is not None: fill_value = ZARR_DEFAULT_FILL_VALUE[zarr_array.metadata.fill_value.dtype.kind] - if zarr_array.metadata.zarr_format == 2: - if filters == []: - filters = None - # Zarr-python gives us the compressor object. Ex: Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - # This won't serialize into a Kerchunk ref, so we can use the .get_config to retrieve a dict. - # compressor_dict = codecs.compressor.get_config() - - array_zarray = ZArray( - shape=zarr_array.metadata.shape, - chunks=zarr_array.metadata.chunks, # type: ignore[union-attr] - dtype=zarr_array.metadata.dtype, - fill_value=fill_value, # type: ignore[arg-type] - order="C", - compressor=codecs.compressor, # type: ignore[union-attr] - filters=filters, # type: ignore - zarr_format=zarr_array.metadata.zarr_format, - ) + zarr_format = zarr_array.metadata.zarr_format + # set ZArray specific values depending on Zarr version + if zarr_format == 2: + compressors = zarr_array.compressors[0].get_config() array_dims = attrs["_ARRAY_DIMENSIONS"] - elif zarr_array.metadata.zarr_format == 3: - if zarr_array.metadata.fill_value is None: + elif zarr_format == 3: + serializer = zarr_array.serializer.to_dict() # unused in ZArray? + compressors = zarr_array.compressors[ + 0 + ].to_dict() # ZArray expects a dict, not a list of dicts, so only the first val from the tuples? + array_dims = zarr_array.metadata.dimension_names # type: ignore[union-attr] + if fill_value is None: raise ValueError( "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" ) - array_zarray = ZArray( - chunks=zarr_array.metadata.chunk_grid.chunk_shape, # type: ignore[attr-defined] - compressor=compressor, - dtype=zarr_array.metadata.data_type.name, # type: ignore - fill_value=fill_value, # type: ignore[arg-type] - filters=filters, - order="C", - shape=zarr_array.metadata.shape, - zarr_format=zarr_array.metadata.zarr_format, - ) - array_dims = zarr_array.metadata.dimension_names # type: ignore[union-attr] - else: raise NotImplementedError("Zarr format is not recognized as v2 or v3.") + filters = ( + zarr_array.filters if zarr_array.filters else None + ) # if tuple is empty, assign filters to None + # import ipdb; ipdb.set_trace() + + array_zarray = ZArray( + shape=zarr_array.shape, + chunks=zarr_array.chunks, # type: ignore[attr-defined] + dtype=zarr_array.dtype.name, # type: ignore + fill_value=fill_value, + order=zarr_array.order, + compressor=compressors, + filters=filters, + zarr_format=zarr_format, + ) + return { "zarray_array": array_zarray, "array_dims": array_dims, @@ -275,10 +254,14 @@ def open_virtual_dataset( import asyncio import zarr + + # We should remove this once zarr v3 is pinned! 
+    # ---------------------------
     from packaging import version
 
     if version.parse(zarr.__version__).major < 3:
         raise ImportError("Zarr V3 is required")
+    # ---------------------------
 
     async def _open_virtual_dataset(
         filepath=filepath,
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 375261ba..ec5f42ed 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -121,7 +121,7 @@ def test_zarr_roundtrip_kerchunk(zarr_store):
 
 @requires_zarr_python_v3
 @requires_network
-@pytest.mark.skip(reason="WIP on icechunk round-trip/")
+# @pytest.mark.skip(reason="WIP on icechunk round-trip/")
 @pytest.mark.parametrize(
     "zarr_store",
     [
@@ -147,6 +147,18 @@ def test_zarr_roundtrip_icechunk(zarr_store):
     session = repo.writable_session("main")
 
     # Write the virtual dataset to icechunk
+    import ipdb
+
+    ipdb.set_trace()
+
+    # ipdb> ds.air.data.zarray
+    # ZArray(shape=(10, 9, 18), chunks=(5, 9, 18), dtype=dtype('int16'), fill_value=0, order='C', compressor=Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0), filters=None, zarr_format=2)
+    #
+    # zg['air'].compressor -> Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
+
+    # so we can't represent the compressor like that in ZArray, how is it supposed to look.
+    # is there a explicit test?
+    # what does, for ex, the hdf reader look like?
     ds.virtualize.to_icechunk(session.store)
 
     rtds = xr.open_zarr(session.store)
diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py
index c0de1d0e..5684b4eb 100644
--- a/virtualizarr/tests/test_readers/test_zarr.py
+++ b/virtualizarr/tests/test_readers/test_zarr.py
@@ -79,6 +79,9 @@ def _validate_v2(attrs: list[str]):
                 zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][
                     "metadata"
                 ][array][attr]
+                import ipdb
+
+                ipdb.set_trace()
                 assert vds_attr == zarr_metadata_attr
 
     def _validate_v3(attrs: list[str]):

From df5a19e946c05360ab8155354913c2820e52839f Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Mon, 3 Feb 2025 18:16:55 -0700
Subject: [PATCH 94/98] cleanup of tests + codecs

---
 virtualizarr/readers/zarr.py                 |   5 +-
 virtualizarr/tests/test_integration.py       |  11 --
 virtualizarr/tests/test_readers/test_zarr.py | 111 +++++++++----------
 virtualizarr/zarr.py                         |   2 +
 4 files changed, 59 insertions(+), 70 deletions(-)

diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py
index 524f4757..744c35fb 100644
--- a/virtualizarr/readers/zarr.py
+++ b/virtualizarr/readers/zarr.py
@@ -114,7 +114,9 @@ async def build_zarray_metadata(zarr_array: zarr.AsyncArray):
         array_dims = attrs["_ARRAY_DIMENSIONS"]
 
     elif zarr_format == 3:
-        serializer = zarr_array.serializer.to_dict()  # unused in ZArray?
+        serializer = zarr_array.serializer.to_dict()  # noqa: F841
+        # serializer is unused in ZArray. Maybe we will need this in the ZArray refactor
+        # https://github.com/zarr-developers/VirtualiZarr/issues/411
         compressors = zarr_array.compressors[
             0
         ].to_dict()  # ZArray expects a dict, not a list of dicts, so only the first val from the tuples?
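To make the `compressors[0]` comment above concrete, here is a sketch assuming zarr-python v3 defaults (the array is made up; under v3 the default compressor is currently Zstd, which matches the `{'name': 'zstd', ...}` dict noted in this patch's `zarr.py` hunk):

```python
import zarr

# In-memory v3 array with default codecs.
arr = zarr.create(shape=(4,), chunks=(2,), dtype="i4", zarr_format=3)

# `compressors` is a tuple of codec objects, e.g. (ZstdCodec(level=0, checksum=False),)
print(arr.compressors)

# ZArray.compressor holds a single dict, so only the first entry is converted:
compressor = arr.compressors[0].to_dict() if arr.compressors else None
print(compressor)  # e.g. {'name': 'zstd', 'configuration': {'level': 0, 'checksum': False}}
```

Filters get the same normalization: `zarr_array.filters` is a tuple, mapped to `None` when empty so that it matches the `ZArray` definition.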
@@ -130,7 +132,6 @@ async def build_zarray_metadata(zarr_array: zarr.AsyncArray): filters = ( zarr_array.filters if zarr_array.filters else None ) # if tuple is empty, assign filters to None - # import ipdb; ipdb.set_trace() array_zarray = ZArray( shape=zarr_array.shape, diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index ec5f42ed..a1fb0572 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -147,18 +147,7 @@ def test_zarr_roundtrip_icechunk(zarr_store): session = repo.writable_session("main") # Write the virtual dataset to icechunk - import ipdb - ipdb.set_trace() - - # ipdb> ds.air.data.zarray - # ZArray(shape=(10, 9, 18), chunks=(5, 9, 18), dtype=dtype('int16'), fill_value=0, order='C', compressor=Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0), filters=None, zarr_format=2) - # - # zg['air'].compressor -> Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) - - # so we can't represent the compressor like that in ZArray, how is it supposed to look. - # is there a explicit test? - # what does, for ex, the hdf reader look like? ds.virtualize.to_icechunk(session.store) rtds = xr.open_zarr(session.store) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index 5684b4eb..a1e1b0ed 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -5,9 +5,6 @@ from virtualizarr.manifests import ManifestArray from virtualizarr.tests import requires_network, requires_zarr_python_v3 -# It seems like this PR: https://github.com/zarr-developers/zarr-python/pull/2533 -# might fix this issue: https://github.com/zarr-developers/zarr-python/issues/2554 - @requires_zarr_python_v3 @requires_network @@ -44,10 +41,10 @@ def test_drop_variables(self, zarr_store, drop_variables=["air"]): def test_virtual_dataset_zarr_attrs(self, zarr_store): import zarr + from virtualizarr.zarr import ZARR_DEFAULT_FILL_VALUE, ZArray + zg = zarr.open_group(zarr_store) vds = open_virtual_dataset(filepath=zarr_store, indexes={}) - zg_metadata_dict = zg.metadata.to_dict() - zarr_format = zg_metadata_dict["zarr_format"] non_var_arrays = ["time", "lat", "lon"] # check dims and coords are present @@ -66,59 +63,59 @@ def test_virtual_dataset_zarr_attrs(self, zarr_store): # check ZArray values arrays = [val for val in zg.keys()] - shared_v2_v3_attrs = [ - "shape", - "zarr_format", - ] - v2_attrs = ["chunks", "dtype", "order", "compressor", "filters"] - - def _validate_v2(attrs: list[str]): - for array in arrays: - for attr in attrs: - vds_attr = getattr(vds[array].data.zarray, attr) - zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ - "metadata" - ][array][attr] - import ipdb - - ipdb.set_trace() - assert vds_attr == zarr_metadata_attr - - def _validate_v3(attrs: list[str]): - # check v2, v3 shared attrs - for array in arrays: - for attr in attrs: - zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ - "metadata" - ][array][attr] - vds_attr = getattr(vds[array].data.zarray, attr) - assert vds_attr == zarr_metadata_attr - - # Cases where v2 and v3 attr keys differ: order, compressor, filters, dtype & chunks - - # chunks vs chunk_grid.configuration.chunk_shape - assert ( - getattr(vds[array].data.zarray, "chunks") - == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ - "chunk_grid" - ]["configuration"]["chunk_shape"] + def _validate_attr_match( + array: str, zarr_array: zarr.Array | 
zarr.AsyncArray, zarray: ZArray
+    ):
+        zarr_array_fill_value = zarr_array.fill_value
+
+        if zarr_array_fill_value:
+            zarr_array_fill_value = ZARR_DEFAULT_FILL_VALUE[
+                zarr_array_fill_value.dtype.kind
+            ]
+        else:
+            zarr_array_fill_value = 0
+        zarr_array_filters = (
+            zarr_array.filters if zarr_array.filters else None
+        )  # if tuple is empty, assign filters to None to match ZArray def
+
+        assert zarr_array.shape == zarray.shape, (
+            f"Mismatch in [shape] for {array} between Zarr Array: {zarr_array.shape} and ZArray: {zarray.shape}"
         )
-
-        # dtype vs datatype
-        assert (
-            getattr(vds[array].data.zarray, "dtype")
-            == zg_metadata_dict["consolidated_metadata"]["metadata"][array][
-                "data_type"
-            ].to_numpy()
+        assert zarr_array.chunks == zarray.chunks, (
+            f"Mismatch in [chunks] for {array} between Zarr Array: {zarr_array.chunks} and ZArray: {zarray.chunks}"
+        )
+        assert zarr_array.dtype == zarray.dtype, (
+            f"Mismatch in [dtype] for {array} between Zarr Array: {zarr_array.dtype} and ZArray: {zarray.dtype}"
+        )
+        assert zarr_array_fill_value == zarray.fill_value, (
+            f"Mismatch in [fill_value] for {array} between Zarr Array: {zarr_array_fill_value} and ZArray: {zarray.fill_value}"
+        )
+        assert zarr_array.order == zarray.order, (
+            f"Mismatch in [order] for {array} between Zarr Array: {zarr_array.order} and ZArray: {zarray.order}"
+        )
+        assert zarr_array_filters == zarray.filters, (
+            f"Mismatch in [filters] for {array} between Zarr Array: {zarr_array_filters} and ZArray: {zarray.filters}"
+        )
+        assert zarr_array.metadata.zarr_format == zarray.zarr_format, (
+            f"Mismatch in [zarr_format] for {array} between Zarr Array: {zarr_array.metadata.zarr_format} and ZArray: {zarray.zarr_format}"
         )
 
-        # order: In zarr v3, it seems like order was replaced with the transpose codec.
-
-        if zarr_format == 2:
-            _validate_v2(shared_v2_v3_attrs + v2_attrs)
-
-        elif zarr_format == 3:
-            _validate_v3(shared_v2_v3_attrs)
+        if zarr_array.metadata.zarr_format == 2:
+            zarr_array_compressor = zarr_array.compressor.get_config()
+        elif zarr_array.metadata.zarr_format == 3:
+            zarr_array_compressor = zarr_array.compressors[0].to_dict()
+        else:
+            raise NotImplementedError(
+                f"Zarr format {zarr_array.metadata.zarr_format} not in [2,3]"
+            )
+
+        assert zarr_array_compressor == zarray.compressor, (
+            f"Mismatch in [compressor] for {array} between Zarr Array: {zarr_array_compressor} and ZArray: {zarray.compressor}"
+        )
 
-        else:
-            raise NotImplementedError(f"Zarr format {zarr_format} not in [2,3]")
+    [
+        _validate_attr_match(
+            array=array, zarr_array=zg[array], zarray=vds[array].data.zarray
+        )
+        for array in arrays
+    ]
diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index 4c36e7dc..57f36854 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -183,6 +183,8 @@ def _v3_codec_pipeline(self) -> Any:
             [_num_codec_config_to_configurable(filter) for filter in self.filters]
         )
 
+    # compressor {'name': 'zstd', 'configuration': {'level': 0, 'checksum': False}}
+    # gives: *** KeyError: 'id' for zarr v3
     if self.compressor:
         codec_configs.append(_num_codec_config_to_configurable(self.compressor))
 

From efd00640a25dcd6c5324ba87fb9527cbf58a82ae Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Mon, 3 Feb 2025 18:29:25 -0700
Subject: [PATCH 95/98] removed test_zarr writer

---
 virtualizarr/tests/test_writers/test_zarr.py | 62 --------------------
 1 file changed, 62 deletions(-)
 delete mode 100644 virtualizarr/tests/test_writers/test_zarr.py

diff --git a/virtualizarr/tests/test_writers/test_zarr.py b/virtualizarr/tests/test_writers/test_zarr.py
deleted file mode 100644
index 9ca281cb..00000000
--- a/virtualizarr/tests/test_writers/test_zarr.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import json
-
-import pytest
-import xarray.testing as xrt
-from xarray import Dataset
-
-pytest.importorskip("zarr.core.metadata.v3")
-
-from virtualizarr import open_virtual_dataset
-from virtualizarr.backend import FileType
-from virtualizarr.readers.zarr_v3 import metadata_from_zarr_json
-from virtualizarr.writers.zarr import dataset_to_zarr
-
-
-def isconfigurable(value: dict) -> bool:
-    """
-    Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict
-    """
-    return "name" in value and "configuration" in value
-
-
-def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: Dataset):
-    """
-    Checks that the output metadata of an array variable conforms to this spec
-    for the required attributes:
-    https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata
-    """
-    dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr")
-    # read the a variable's metadata
-    with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f:
-        metadata = json.loads(f.read())
-    assert metadata["zarr_format"] == 3
-    assert metadata["node_type"] == "array"
-    assert isinstance(metadata["shape"], list) and all(
-        isinstance(dim, int) for dim in metadata["shape"]
-    )
-    assert isinstance(metadata["data_type"], str) or isconfigurable(
-        metadata["data_type"]
-    )
-    assert isconfigurable(metadata["chunk_grid"])
-    assert isconfigurable(metadata["chunk_key_encoding"])
-    assert isinstance(metadata["fill_value"], (bool, int, float, str, list))
-    assert (
-        isinstance(metadata["codecs"], list)
-        and len(metadata["codecs"]) == 1
-        and all(isconfigurable(codec) for codec in
metadata["codecs"]) - ) - - -def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: Dataset): - vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") - roundtrip = open_virtual_dataset( - tmpdir / "store.zarr", filetype=FileType.zarr_v3, indexes={} - ) - - xrt.assert_identical(roundtrip, vds_with_manifest_arrays) - - -def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: Dataset): - dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") - zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json") - assert zarray == vds_with_manifest_arrays.a.data.zarray From 442e519e3378f422d6b92e5be6bd1c6155919eb8 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 3 Feb 2025 18:32:48 -0700 Subject: [PATCH 96/98] bumping icechunk for now --- ci/upstream.yml | 2 +- virtualizarr/tests/test_integration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/upstream.yml b/ci/upstream.yml index afcd5040..44b81533 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -31,7 +31,7 @@ dependencies: - zarr>=3.0.2 - pip - pip: - - icechunk>=0.1.0a12 # Installs zarr-python v3.0.0 as dependency + - icechunk>=0.1.1 # Installs zarr-python v3.0.0 as dependency - git+https://github.com/fsspec/kerchunk.git@main - imagecodecs-numcodecs==2024.6.1 - git+https://github.com/fsspec/kerchunk.git@main diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index a1fb0572..c713a02e 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -121,7 +121,7 @@ def test_zarr_roundtrip_kerchunk(zarr_store): @requires_zarr_python_v3 @requires_network -# @pytest.mark.skip(reason="WIP on icechunk round-trip/") +@pytest.mark.skip(reason="WIP on icechunk round-trip/") @pytest.mark.parametrize( "zarr_store", [ From 54308fb2fa9441f47bb87f536f2dafa813a30500 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 3 Feb 2025 18:55:24 -0700 Subject: [PATCH 97/98] typing lint --- virtualizarr/readers/zarr.py | 23 +++++++------------- virtualizarr/tests/test_readers/test_zarr.py | 4 ++-- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index 744c35fb..787d22c4 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -91,16 +91,7 @@ def combine_path_chunk(filepath: str, chunk_key: str): ) -async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, prefix: str) -> dict: - """Create a chunk map""" - - keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)] - - sizes = await _concurrent_map(keys, zarr_array.store.getsize) - return {key[0]: size for key, size in zip(keys, sizes)} - - -async def build_zarray_metadata(zarr_array: zarr.AsyncArray): +async def build_zarray_metadata(zarr_array: zarr.AsyncArray[Any]): attrs = zarr_array.metadata.attributes fill_value = zarr_array.metadata.fill_value @@ -110,11 +101,11 @@ async def build_zarray_metadata(zarr_array: zarr.AsyncArray): zarr_format = zarr_array.metadata.zarr_format # set ZArray specific values depending on Zarr version if zarr_format == 2: - compressors = zarr_array.compressors[0].get_config() + compressors = zarr_array.compressors[0].get_config() # type: ignore[union-attr] array_dims = attrs["_ARRAY_DIMENSIONS"] elif zarr_format == 3: - serializer = zarr_array.serializer.to_dict() # noqa: F841 + serializer = zarr_array.serializer.to_dict() # type: ignore[union-attr] # noqa: F841 # serializer is unused in ZArray. 
Maybe we will need this in the ZArray refactor # https://github.com/zarr-developers/VirtualiZarr/issues/411 compressors = zarr_array.compressors[ @@ -140,7 +131,7 @@ async def build_zarray_metadata(zarr_array: zarr.AsyncArray): fill_value=fill_value, order=zarr_array.order, compressor=compressors, - filters=filters, + filters=filters, # type: ignore[arg-type] zarr_format=zarr_format, ) @@ -151,7 +142,9 @@ async def build_zarray_metadata(zarr_array: zarr.AsyncArray): } -async def virtual_variable_from_zarr_array(zarr_array: zarr.AsyncArray, filepath: str): +async def virtual_variable_from_zarr_array( + zarr_array: zarr.AsyncArray[Any], filepath: str +): zarr_prefix = zarr_array.basename if zarr_array.metadata.zarr_format == 3: @@ -197,7 +190,7 @@ async def virtual_dataset_from_zarr_group( virtual_variable_arrays = await asyncio.gather( *[ - virtual_variable_from_zarr_array(zarr_array=array, filepath=filepath) + virtual_variable_from_zarr_array(zarr_array=array, filepath=filepath) # type: ignore[arg-type] for array in virtual_zarr_arrays ] ) diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py index a1e1b0ed..cd4d2cc5 100644 --- a/virtualizarr/tests/test_readers/test_zarr.py +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -66,7 +66,7 @@ def test_virtual_dataset_zarr_attrs(self, zarr_store): def _validate_attr_match( array: str, zarr_array: zarr.Array | zarr.AsyncArray, zarray: ZArray ): - zarr_array_fill_value = zarr_array.fill_value + zarr_array_fill_value = zarr_array.fill_value # type: ignore[union-attr] if zarr_array_fill_value: zarr_array_fill_value = ZARR_DEFAULT_FILL_VALUE[ @@ -101,7 +101,7 @@ def _validate_attr_match( ) if zarr_array.metadata.zarr_format == 2: - zarr_array_compressor = zarr_array.compressor.get_config() + zarr_array_compressor = zarr_array.compressor.get_config() # type: ignore[union-attr] elif zarr_array.metadata.zarr_format == 3: zarr_array_compressor = zarr_array.compressors[0].to_dict() else: From 973f6b04d78607d9cbec270bca997dcdf45afb3c Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 3 Feb 2025 20:14:25 -0700 Subject: [PATCH 98/98] remove zarr writer test --- virtualizarr/tests/test_zarr.py | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 virtualizarr/tests/test_zarr.py diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py deleted file mode 100644 index 95dbf55f..00000000 --- a/virtualizarr/tests/test_zarr.py +++ /dev/null @@ -1,29 +0,0 @@ -import numpy as np - -from virtualizarr.zarr import ZArray - - -def test_replace_partial(): - arr = ZArray(shape=(2, 3), chunks=(1, 1), dtype=np.dtype("