Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Asdf read speed #514

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
4 changes: 3 additions & 1 deletion dkist/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,10 +302,12 @@ def eit_dataset():
@pytest.fixture
def simple_tiled_dataset(dataset):
    """
    Build a 2x2 ``TiledDataset`` out of four deep copies of ``dataset``.

    Every copy is given the inventory record of the source ``dataset``, and
    the ``headers`` of each copy are collected and handed to ``TiledDataset``
    through its ``headers`` keyword.
    """
    datasets = [copy.deepcopy(dataset) for _ in range(4)]
    headers = []
    for ds in datasets:
        # Re-point each copy at the original inventory record rather than
        # the deep-copied one.
        ds.meta["inventory"] = dataset.meta["inventory"]
        headers.append(ds.headers)
    dataset_array = np.array(datasets).reshape((2, 2))
    return TiledDataset(dataset_array, dataset.meta["inventory"], headers=headers)


@pytest.fixture
Expand Down
4 changes: 3 additions & 1 deletion dkist/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class Dataset(NDCube):
_file_manager = FileManagerDescriptor(default_type=FileManager)

def __init__(self, data, wcs=None, uncertainty=None, mask=None, meta=None,
unit=None, copy=False):
unit=None, copy=False, is_tile=False):

# Do some validation
if (not isinstance(wcs, gwcs.WCS) and
Expand All @@ -123,6 +123,8 @@ def __init__(self, data, wcs=None, uncertainty=None, mask=None, meta=None,
if "inventory" not in meta:
raise ValueError("The meta dict must contain the inventory record.")

self._is_mosaic_tile = is_tile

super().__init__(data, wcs, uncertainty=uncertainty, mask=mask, meta=meta,
unit=unit, copy=copy)

Expand Down
3 changes: 2 additions & 1 deletion dkist/dataset/tests/test_tiled_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def test_tiled_dataset_invalid_construction(dataset, dataset_4d):
TiledDataset(np.array((dataset, ds2)), dataset.meta["inventory"])


@pytest.mark.accept_cli_dataset
def test_tiled_dataset_from_components(dataset):
shape = (2, 2)
file_managers = [dataset._file_manager] * 4
Expand All @@ -70,7 +71,7 @@ def test_tiled_dataset_from_components(dataset):
for ds, fm, headers in zip(tiled_ds.flat, file_managers, header_tables):
assert ds.files == fm
assert ds.meta["inventory"] is inventory
assert ds.meta["headers"] is headers
assert (ds.meta["headers"] == headers).all()


@figure_test
Expand Down
19 changes: 7 additions & 12 deletions dkist/dataset/tiled_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,17 +74,20 @@ def _from_components(cls, shape, file_managers, wcses, header_tables, inventory)
assert len(file_managers) == len(wcses) == len(header_tables)

datasets = np.empty(len(file_managers), dtype=object)
for i, (fm, wcs, headers) in enumerate(zip(file_managers, wcses, header_tables)):
all_headers = vstack(header_tables)
for i, (fm, wcs) in enumerate(zip(file_managers, wcses)):
headers = all_headers[i*len(fm):(i+1)*len(fm)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's some promises about ordering I was not intending to make.

meta = {"inventory": inventory, "headers": headers}
datasets[i] = Dataset(fm._generate_array(), wcs=wcs, meta=meta)
datasets[i] = Dataset(fm._generate_array(), wcs=wcs, meta=meta, is_tile=True)
datasets[i]._file_manager = fm
datasets = datasets.reshape(shape)

return cls(datasets, inventory)
return cls(datasets, inventory, all_headers)

def __init__(self, dataset_array, inventory=None):
def __init__(self, dataset_array, inventory=None, headers=None):
self._data = np.array(dataset_array, dtype=object)
self._inventory = inventory or {}
self.combined_headers = vstack(headers) if headers else None
self._validate_component_datasets(self._data, inventory)

def __contains__(self, x):
Expand Down Expand Up @@ -131,14 +134,6 @@ def inventory(self):
"""
return self._inventory

@property
def combined_headers(self):
"""
A single `astropy.table.Table` containing all the FITS headers for all
files in this dataset.
"""
return vstack([ds.meta["headers"] for ds in self._data.flat])

@property
def shape(self):
"""
Expand Down
2 changes: 2 additions & 0 deletions dkist/io/asdf/converters/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@
"it was not constructed from a set of FITS files.")
node = {}
node["meta"] = dataset.meta or {}
if dataset._is_mosaic_tile and node.get("meta"):
node["meta"]["headers"] = None

Check warning on line 67 in dkist/io/asdf/converters/dataset.py

View check run for this annotation

Codecov / codecov/patch

dkist/io/asdf/converters/dataset.py#L67

Added line #L67 was not covered by tests
node["wcs"] = dataset.wcs
node["data"] = dataset.files
if dataset.unit:
Expand Down
15 changes: 14 additions & 1 deletion dkist/io/asdf/converters/tiled_dataset.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
from asdf.extension import Converter
from astropy.table import Table, vstack


class TiledDatasetConverter(Converter):
    """
    ASDF converter for ``dkist.dataset.tiled_dataset.TiledDataset``.

    Registered for every ``tiled_dataset`` tag listed in ``tags``.
    """
    tags = [
        "tag:dkist.nso.edu:dkist/tiled_dataset-0.1.0",
        "asdf://dkist.nso.edu/tags/tiled_dataset-1.0.0",
        "asdf://dkist.nso.edu/tags/tiled_dataset-1.1.0",
    ]
    types = ["dkist.dataset.tiled_dataset.TiledDataset"]

    def from_yaml_tree(cls, node, tag, ctx):
        """
        Build a ``TiledDataset`` from a deserialised ASDF ``node``.

        Each truthy entry of ``node["datasets"]`` is flagged as a mosaic tile.
        When the node has a truthy ``"headers"`` entry it is used as-is;
        otherwise the combined table is rebuilt by stacking the ``headers``
        of every dataset in ``node["datasets"]``.
        """
        from dkist.dataset.tiled_dataset import TiledDataset

        for row in node["datasets"]:
            for ds in row:
                if ds:
                    ds._is_mosaic_tile = True

        if node.get("headers"):
            headers = node["headers"]
        else:
            # The outer ``for`` clause must come first: rows, then the
            # datasets inside each row. With the clauses swapped the
            # comprehension reads a stale ``row`` left over from the loop
            # above and only ever sees the last row's datasets.
            headers = vstack(
                [Table(ds.headers) for row in node["datasets"] for ds in row]
            )

        return TiledDataset(node["datasets"], node["inventory"], headers)

    def to_yaml_tree(cls, tiled_dataset, tag, ctx):
        """
        Serialise ``tiled_dataset`` into a plain dict for ASDF.

        The returned tree holds the inventory, the nested list of datasets
        and the combined headers converted with ``as_array()``.
        """
        tree = {}
        tree["inventory"] = tiled_dataset._inventory
        tree["datasets"] = tiled_dataset._data.tolist()
        # NOTE(review): ``combined_headers`` is None when the TiledDataset was
        # constructed without headers; confirm callers always supply them
        # before serialising.
        # TODO(review): reviewer note - "We should do this for dataset too."
        tree["headers"] = tiled_dataset.combined_headers.as_array()
        return tree
2 changes: 2 additions & 0 deletions dkist/io/asdf/entry_points.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ def get_extensions():
dkist_converters = [FileManagerConverter(), DatasetConverter(), TiledDatasetConverter()]
wcs_converters = [VaryingCelestialConverter(), CoupledCompoundConverter(), RavelConverter(), AsymmetricMappingConverter()]
return [
ManifestExtension.from_uri("asdf://dkist.nso.edu/manifests/dkist-1.3.0",
converters=dkist_converters),
ManifestExtension.from_uri("asdf://dkist.nso.edu/manifests/dkist-1.2.0",
converters=dkist_converters),
ManifestExtension.from_uri("asdf://dkist.nso.edu/manifests/dkist-1.1.0",
Expand Down
14 changes: 14 additions & 0 deletions dkist/io/asdf/resources/manifests/dkist-1.3.0.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
%YAML 1.1
---
# Manifest for the DKIST ASDF extension, version 1.3.0.
# The id must carry the same version as this file and its extension_uri.
id: asdf://dkist.nso.edu/manifests/dkist-1.3.0
extension_uri: asdf://dkist.nso.edu/dkist/extensions/dkist-1.3.0
title: DKIST extension
description: ASDF schemas and tags for DKIST classes.

# Each entry pairs a tag URI with the schema that validates it.
tags:
  - schema_uri: "asdf://dkist.nso.edu/schemas/file_manager-1.0.0"
    tag_uri: "asdf://dkist.nso.edu/tags/file_manager-1.0.0"
  - schema_uri: "asdf://dkist.nso.edu/schemas/dataset-1.2.0"
    tag_uri: "asdf://dkist.nso.edu/tags/dataset-1.2.0"
  - schema_uri: "asdf://dkist.nso.edu/schemas/tiled_dataset-1.1.0"
    tag_uri: "asdf://dkist.nso.edu/tags/tiled_dataset-1.1.0"
50 changes: 50 additions & 0 deletions dkist/io/asdf/resources/schemas/dataset-1.2.0.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
%YAML 1.1
---
# Schema for the dataset-1.2.0 tag.
$schema: "http://stsci.edu/schemas/yaml-schema/draft-01"
id: "asdf://dkist.nso.edu/schemas/dataset-1.2.0"

title: |
  A DKIST Level 1 Dataset object.
description:
  The container for a distributed dataset object.

type: object
properties:
  data:
    tag: "asdf://dkist.nso.edu/tags/file_manager-1.0.0"

  wcs:
    description: The coordinate system for the complete dataset.
    tag: "tag:stsci.edu:gwcs/wcs-1.*"

  mask:
    tag: "tag:stsci.edu:asdf/core/ndarray-1.*"

  unit:
    tag: "tag:stsci.edu:asdf/unit/unit-1.*"

  meta:
    description: Dataset metadata, describing the whole dataset.
    type: object
    properties:
      headers:
        description: A table of all the headers for the constituent files.
        # Headers may be a table, an ndarray, or explicitly null.
        # The null case is expressed with `type: "null"`; a bare
        # `null: true` entry is not a schema keyword and would let any
        # value through this anyOf.
        anyOf:
          - tag: "tag:astropy.org:astropy/table/table-1.*"
          - tag: "tag:stsci.edu:asdf/core/ndarray-1.*"
          - type: "null"

      quality:
        description: A copy of the quality report of these observations.
        type: object

      inventory:
        description: A copy of the inventory record for this dataset.
        type: object

    required: [headers, inventory]
    additionalProperties: true

required: [data, wcs, unit]
additionalProperties: true
...
31 changes: 31 additions & 0 deletions dkist/io/asdf/resources/schemas/tiled_dataset-1.1.0.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
%YAML 1.1
---
# Schema for the tiled_dataset-1.1.0 tag.
$schema: "http://stsci.edu/schemas/yaml-schema/draft-01"
id: "asdf://dkist.nso.edu/schemas/tiled_dataset-1.1.0"

title: |
  A DKIST Tiled Dataset object.
description: The container for a set of Dataset objects.

type: object
properties:
  # An array of arrays whose entries are Dataset-tagged nodes.
  datasets:
    description: A nested structure of Dataset objects
    type: array
    items:
      type: array
      items:
        - tag: "asdf://dkist.nso.edu/tags/dataset-1.*"

  inventory:
    description: A copy of the inventory record for this dataset.
    type: object

  # The combined headers may be stored as an astropy table or an ndarray.
  headers:
    description: A table of all the headers for the constituent files.
    anyOf:
      - tag: "tag:astropy.org:astropy/table/table-1.*"
      - tag: "tag:stsci.edu:asdf/core/ndarray-1.*"

# All three properties are mandatory and no others are permitted.
required:
  - datasets
  - inventory
  - headers
additionalProperties: false
...
2 changes: 2 additions & 0 deletions dkist/io/level_1_dataset_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ properties:
- $ref: "asdf://dkist.nso.edu/schemas/dataset-0.3.0"
- $ref: "asdf://dkist.nso.edu/schemas/dataset-1.0.0"
- $ref: "asdf://dkist.nso.edu/schemas/dataset-1.1.0"
- $ref: "asdf://dkist.nso.edu/schemas/dataset-1.2.0"
- $ref: "asdf://dkist.nso.edu/schemas/tiled_dataset-0.1.0"
- $ref: "asdf://dkist.nso.edu/schemas/tiled_dataset-1.0.0"
- $ref: "asdf://dkist.nso.edu/schemas/tiled_dataset-1.1.0"

required: [dataset]
additionalProperties: true