Skip to content

Commit

Permalink
fix serialization bug in Dask
Browse files Browse the repository at this point in the history
  • Loading branch information
betolink committed Jan 16, 2025
1 parent d5ca8c7 commit 63f36b6
Showing 1 changed file with 12 additions and 1 deletion.
13 changes: 12 additions & 1 deletion earthaccess/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import fsspec.utils
import s3fs

# import ipdb

import earthaccess


Expand All @@ -15,12 +17,19 @@ def _get_chunk_metadata(
) -> list[dict]:
from kerchunk.hdf import SingleHdf5ToZarr

if not isinstance(granule, earthaccess.DataGranule) and isinstance(granule, dict):
# WHY: Dask serialization is doing something odd: the granule arrives here as a plain dict,
# so we need to cast it back to a DataGranule to get the helper methods for parsing the data links.
# TODO: ask James what is going on
granule = earthaccess.DataGranule(granule)

metadata = []
access = "direct" if isinstance(fs, s3fs.S3FileSystem) else "indirect"
# ipdb.set_trace()

for url in granule.data_links(access=access):
with fs.open(url) as inf:
h5chunks = SingleHdf5ToZarr(inf, url)
h5chunks = SingleHdf5ToZarr(inf, url) # type: ignore
m = h5chunks.translate()
metadata.append(m)

Expand Down Expand Up @@ -50,6 +59,8 @@ def consolidate_metadata(

# Get metadata for each granule
get_chunk_metadata = dask.delayed(_get_chunk_metadata) # type: ignore

# ipdb.set_trace()
chunks = dask.compute(*[get_chunk_metadata(g, fs) for g in granules]) # type: ignore
chunks = sum(chunks, start=[])

Expand Down

0 comments on commit 63f36b6

Please sign in to comment.