Skip to content

Commit

Permalink
added multimanager example and benchmark (#209)
Browse files Browse the repository at this point in the history
  • Loading branch information
jreadey authored Jul 9, 2024
1 parent 5e5b864 commit f060ed6
Show file tree
Hide file tree
Showing 2 changed files with 384 additions and 0 deletions.
104 changes: 104 additions & 0 deletions examples/multi_mgr_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import h5pyd
import numpy as np
import random
import time

DOMAIN_PATH = "/home/test_user1/test/multi_mgr_benchmark.h5"
DATASET_COUNT = 200
DSET_SHAPE = (10,)
DSET_DTYPE = np.int32


def generate_range(ds_shape: tuple):
    """Pick one random in-bounds index along every axis of a dataset shape.

    Returns a tuple with one entry per axis of ``ds_shape``; each entry is
    drawn with ``random.randint`` from ``0`` to ``axis_length - 1`` inclusive.
    """
    return tuple(random.randint(0, extent - 1) for extent in ds_shape)


def generate_index_query(h5file):
    """Build one random index tuple per member of ``h5file``.

    Each member's ``shape`` is passed to ``generate_range``; the returned
    list follows the iteration order of ``h5file.values()``.
    """
    return [generate_range(member.shape) for member in h5file.values()]


def benchmark_multimanager(h5file, num=10):
    """
    Benchmark retrieving one random entry from every dataset in an h5file
    using the MultiManager.

    h5file: open file object; every member is read through its ``shape``,
        so all members are expected to be datasets
    num: number of timed read rounds (one query list per round); must be
        non-zero because the mean is computed as total runtime / num

    Prints the mean runtime per round; nothing is returned.
    """
    # open the dataset objects up front so only the reads are timed
    datasets = [h5file[name] for name in h5file.keys()]
    mm = h5pyd.MultiManager(datasets)

    # prepare queries to exclude from runtime
    queries = [generate_index_query(h5file) for _ in range(num)]

    # accessing the data
    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump if the system clock is adjusted during the run
    t0 = time.perf_counter()
    for query in queries:
        mm[query]  # result is discarded; only the read latency matters

    runtime = time.perf_counter() - t0
    print(f"Mean runtime multimanager: {runtime/num:.4f} s")
    # 100ms for case with 6 datasets


def benchmark_sequential_ds(h5file, num=10):
    """
    Benchmark retrieving one random entry from every dataset in
    an h5file by sequentially looping through the datasets

    h5file: open file object; every member is read through its ``shape``,
        so all members are expected to be datasets
    num: number of timed read rounds (one index list per round); must be
        non-zero because the mean is computed as total runtime / num

    Prints the mean runtime per round; nothing is returned.
    """
    # Open the dataset objects once, outside the timed region, the same way
    # benchmark_multimanager does. Iterating h5file.values() inside the timed
    # loop would add object lookup cost to the sequential numbers only.
    datasets = list(h5file.values())

    # prepare queries to exclude this code from runtime
    index_lists = [
        [generate_range(ds.shape) for ds in datasets] for _ in range(num)
    ]

    # accessing the data
    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump if the system clock is adjusted during the run
    t0 = time.perf_counter()
    for index_list in index_lists:
        for indices, ds in zip(index_list, datasets):
            ds[indices]  # result is discarded; only the read latency matters

    runtime = time.perf_counter() - t0
    print(f"Mean runtime sequentially: {runtime/num:.4f} s")
    # ~ 400ms for case with 6 datasets


def run_benchmark(f):
    """
    Make sure the benchmark datasets exist in f, then time both the
    sequential and the MultiManager read strategies.

    Datasets are named dset_0000 .. dset_<DATASET_COUNT - 1>; any that is
    missing is created with random integers in [0, 100).
    """
    wanted_names = (f"dset_{n:04d}" for n in range(DATASET_COUNT))
    for name in wanted_names:
        if name in f:
            # already created by a previous run
            continue
        values = np.random.randint(0, 100, size=DSET_SHAPE, dtype=DSET_DTYPE)
        f.create_dataset(name, data=values)

    benchmark_sequential_ds(f)

    benchmark_multimanager(f)


#
# main
#

# open the benchmark domain in append mode (it is created when missing)
# and run both benchmarks against it
with h5pyd.File(DOMAIN_PATH, mode="a") as f:
    run_benchmark(f)
280 changes: 280 additions & 0 deletions examples/notebooks/multi_manager_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"USE_H5PY = False # set to True to use h5py/hdf5lib instead\n",
"if USE_H5PY:\n",
" import h5py\n",
" from h5py import MultiManager\n",
"else:\n",
" import h5pyd as h5py # Use the \"as\" syntax for code compatibility\n",
" from h5pyd import MultiManager\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# create a new file\n",
"f = h5py.File(\"/home/test_user1/multi_try.h5\", mode=\"w\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# create some datasets\n",
"DSET_SHAPE = (10,)\n",
"DSET_DTYPE = np.int32\n",
"\n",
"# create 4 datasets\n",
"DSET_COUNT = 4\n",
"datasets = []\n",
"for i in range(DSET_COUNT):\n",
" dset = f.create_dataset(f\"dset_{i}\", shape=DSET_SHAPE, dtype=DSET_DTYPE)\n",
" datasets.append(dset)\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# initialize some data to write\n",
"data_in = []\n",
"for n in range(DSET_COUNT):\n",
" arr = np.zeros(DSET_SHAPE, dtype=DSET_DTYPE)\n",
" arr[...] = list(range(n*100, n*100+DSET_SHAPE[0]))\n",
" data_in.append(arr)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# instantiate a MultiManager and use it to write to all the datasets simultaneously\n",
"mm = MultiManager(datasets)\n",
"mm[...] = data_in"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# verify what got saved to the first dataset\n",
"dset = f[\"dset_0\"]\n",
"dset[...]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# and the second dataset\n",
"dset = f[\"dset_1\"]\n",
"dset[...]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read all the data from all the datasets using the same MultiManager instance\n",
"data_out = mm[...]\n",
"len(data_out)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get the first item from the returned list\n",
"data_out[0]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# and the second item\n",
"data_out[1]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 3], dtype=int32)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# rather than reading all the data for a dataset, you can read a given selection\n",
"data_out = mm[0:4]\n",
"data_out[0]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# it's also possible to pass a list of selections and have each selection\n",
"# read from the corresponding dataset\n",
"selections = []\n",
"for n in range(DSET_COUNT):\n",
" s = slice(n, n+2, 1)\n",
" selections.append(s)\n",
"\n",
"data_out = mm.__getitem__(selections)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1], dtype=int32)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_out[0]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([101, 102], dtype=int32)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_out[1]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit f060ed6

Please sign in to comment.