From 63f36b6d63d1b485b34bc4e4db894c219836c657 Mon Sep 17 00:00:00 2001 From: betolink Date: Thu, 16 Jan 2025 13:06:47 -0600 Subject: [PATCH] fix serialization bug in Dask --- earthaccess/kerchunk.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/earthaccess/kerchunk.py b/earthaccess/kerchunk.py index 9ee40dec..6b7f05d4 100644 --- a/earthaccess/kerchunk.py +++ b/earthaccess/kerchunk.py @@ -6,6 +6,8 @@ import fsspec.utils import s3fs +# import ipdb + import earthaccess @@ -15,12 +17,19 @@ def _get_chunk_metadata( ) -> list[dict]: from kerchunk.hdf import SingleHdf5ToZarr + if not isinstance(granule, earthaccess.DataGranule) and isinstance(granule, dict): + # WHY: dask serialization is doing something weird, it serializes the granule as a simple dict + # we need to add cast it back to a datagranule to get the nice methods for parsing the data links + # TODO: ask James what is going on + granule = earthaccess.DataGranule(granule) + metadata = [] access = "direct" if isinstance(fs, s3fs.S3FileSystem) else "indirect" + # ipdb.set_trace() for url in granule.data_links(access=access): with fs.open(url) as inf: - h5chunks = SingleHdf5ToZarr(inf, url) + h5chunks = SingleHdf5ToZarr(inf, url) # type: ignore m = h5chunks.translate() metadata.append(m) @@ -50,6 +59,8 @@ def consolidate_metadata( # Get metadata for each granule get_chunk_metadata = dask.delayed(_get_chunk_metadata) # type: ignore + + # ipdb.set_trace() chunks = dask.compute(*[get_chunk_metadata(g, fs) for g in granules]) # type: ignore chunks = sum(chunks, start=[])