-
-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added multimanager example and benchmark (#209)
- Loading branch information
Showing
2 changed files
with
384 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
import h5pyd | ||
import numpy as np | ||
import random | ||
import time | ||
|
||
DOMAIN_PATH = "/home/test_user1/test/multi_mgr_benchmark.h5" | ||
DATASET_COUNT = 200 | ||
DSET_SHAPE = (10,) | ||
DSET_DTYPE = np.int32 | ||
|
||
|
||
def generate_range(ds_shape: tuple):
    """Pick one random, in-bounds index along every axis of a dataset shape.

    Args:
        ds_shape: Length of each axis. Every length must be at least 1,
            because ``random.randint(0, length - 1)`` raises ``ValueError``
            when the range is empty.

    Returns:
        A tuple holding one random integer per axis, each drawn from
        ``[0, axis_length - 1]`` (both ends inclusive).
    """
    # Axes are visited in order, so the sequence of random draws is stable
    # for a given seed.
    return tuple(random.randint(0, extent - 1) for extent in ds_shape)
|
||
|
||
def generate_index_query(h5file):
    """Build one random index tuple for every member of ``h5file``.

    Args:
        h5file: Mapping-like object whose ``values()`` yield objects that
            expose a ``shape`` attribute.

    Returns:
        A list of index tuples, ordered like ``h5file.values()``.
    """
    return [generate_range(dataset.shape) for dataset in h5file.values()]
|
||
|
||
def benchmark_multimanager(h5file, num=10):
    """
    Benchmark retrieving one random entry from every dataset in an h5file
    using the MultiManager.

    Args:
        h5file: open file/domain; every member is accessed through its
            ``shape`` attribute, so it is assumed to contain only datasets.
        num: number of timed read rounds. Must be non-zero, because the
            mean is computed as ``total / num``.

    Prints the mean time per round; returns nothing.
    """
    # Open the dataset handles once, outside the timed region.
    datasets = [h5file[name] for name in h5file.keys()]
    mm = h5pyd.MultiManager(datasets)

    # prepare queries to exclude from runtime
    # NOTE(review): the queries are built from h5file.values() while the
    # handles above come from h5file.keys(); this presumes both iterate in
    # the same order - confirm.
    queries = [generate_index_query(h5file) for _ in range(num)]

    # accessing the data
    # perf_counter() is a monotonic, high-resolution clock, so the result
    # cannot be skewed by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    for query in queries:
        mm[query]
    runtime = time.perf_counter() - t0

    print(f"Mean runtime multimanager: {runtime/num:.4f} s")
    # 100ms for case with 6 datasets
|
||
|
||
def benchmark_sequential_ds(h5file, num=10):
    """
    Benchmark retrieving one random entry from every dataset in
    an h5file by sequentially looping through the datasets

    Args:
        h5file: open file/domain; every member is accessed through its
            ``shape`` attribute, so it is assumed to contain only datasets.
        num: number of timed read rounds. Must be non-zero, because the
            mean is computed as ``total / num``.

    Prints the mean time per round; returns nothing.
    """
    # Resolve the dataset handles before starting the clock so that only
    # the reads are timed. benchmark_multimanager also builds its dataset
    # list up front, which keeps the two measurements comparable.
    datasets = list(h5file.values())

    # prepare queries to exclude this code from runtime
    index_lists = [
        [generate_range(ds.shape) for ds in datasets]
        for _ in range(num)
    ]

    # accessing the data
    # perf_counter() is a monotonic, high-resolution clock, so the result
    # cannot be skewed by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    for index_list in index_lists:
        for indices, ds in zip(index_list, datasets):
            ds[indices]
    runtime = time.perf_counter() - t0

    print(f"Mean runtime sequentially: {runtime/num:.4f} s")
    # ~ 400ms for case with 6 datasets
    # (reference figure noted by the original author; re-measure whenever
    # the timed region changes)
|
||
|
||
def run_benchmark(f):
    """
    Ensure the benchmark datasets exist in ``f``, then run the sequential
    benchmark followed by the MultiManager benchmark.

    Datasets are named ``dset_0000`` ... up to ``DATASET_COUNT`` entries.
    Any that are missing are created with random integers in ``[0, 100)``,
    using shape ``DSET_SHAPE`` and dtype ``DSET_DTYPE``.
    """
    for i in range(DATASET_COUNT):
        dset_name = f"dset_{i:04d}"
        if dset_name in f:
            # already initialized by a previous run
            continue
        initial_values = np.random.randint(
            0, 100, size=DSET_SHAPE, dtype=DSET_DTYPE
        )
        f.create_dataset(dset_name, data=initial_values)

    # Sequential first, MultiManager second.
    benchmark_sequential_ds(f)
    benchmark_multimanager(f)
|
||
|
||
# ---------------------------------------------------------------------------
# Script entry point
# ---------------------------------------------------------------------------

# Mode "a" creates the domain if it does not exist already; the context
# manager closes it once the benchmarks have finished.
with h5pyd.File(DOMAIN_PATH, mode="a") as f:
    run_benchmark(f)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,280 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"USE_H5PY = False # set to True to use h5py/hdf5lib instead\n", | ||
"if USE_H5PY:\n", | ||
" import h5py\n", | ||
" from h5py import MultiManager\n", | ||
"else:\n", | ||
" import h5pyd as h5py # Use the \"as\" syntax for code compatibility\n", | ||
" from h5pyd import MultiManager\n", | ||
"import numpy as np" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# create a new file\n", | ||
"f = h5py.File(\"/home/test_user1/multi_try.h5\", mode=\"w\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 13, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# create some datasets\n", | ||
"DSET_SHAPE = (10,)\n", | ||
"DSET_DTYPE = np.int32\n", | ||
"\n", | ||
"# create 4 datasets\n", | ||
"DSET_COUNT = 4\n", | ||
"datasets = []\n", | ||
"for i in range(DSET_COUNT):\n", | ||
" dset = f.create_dataset(f\"dset_{i}\", shape=DSET_SHAPE, dtype=DSET_DTYPE)\n", | ||
" datasets.append(dset)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 18, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# initialize some data to write\n", | ||
"data_in = []\n", | ||
"for n in range(DSET_COUNT):\n", | ||
" arr = np.zeros(DSET_SHAPE, dtype=DSET_DTYPE)\n", | ||
" arr[...] = list(range(n*100, n*100+DSET_SHAPE[0]))\n", | ||
" data_in.append(arr)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# instantiate a MultiManager and use it to write to all the datasets simultaneously\n", | ||
"mm = MultiManager(datasets)\n", | ||
"mm[...] = data_in" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)" | ||
] | ||
}, | ||
"execution_count": 19, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# verify what was saved to the first dataset\n", | ||
"dset = f[\"dset_0\"]\n", | ||
"dset[...]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)" | ||
] | ||
}, | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# and the second dataset\n", | ||
"dset = f[\"dset_1\"]\n", | ||
"dset[...]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"4" | ||
] | ||
}, | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Read all the data from all the datasets using the same MultiManager instance\n", | ||
"data_out = mm[...]\n", | ||
"len(data_out)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 22, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)" | ||
] | ||
}, | ||
"execution_count": 22, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# get the first item from the returned list\n", | ||
"data_out[0]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 23, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)" | ||
] | ||
}, | ||
"execution_count": 23, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# and the second item\n", | ||
"data_out[1]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 27, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"array([0, 1, 2, 3], dtype=int32)" | ||
] | ||
}, | ||
"execution_count": 27, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# rather than reading all the data for a dataset, you can read a given selection\n", | ||
"data_out = mm[0:4]\n", | ||
"data_out[0]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 24, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# it's also possible to pass a list of selections and have each selection\n", | ||
"# read from the corresponding dataset\n", | ||
"selections = []\n", | ||
"for n in range(DSET_COUNT):\n", | ||
" s = slice(n, n+2, 1)\n", | ||
" selections.append(s)\n", | ||
"\n", | ||
"data_out = mm.__getitem__(selections)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 25, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"array([0, 1], dtype=int32)" | ||
] | ||
}, | ||
"execution_count": 25, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"data_out[0]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 26, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"array([101, 102], dtype=int32)" | ||
] | ||
}, | ||
"execution_count": 26, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"data_out[1]" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |