From 45110624c2c4a8ce12558517fd7b644932f6eee3 Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 9 Jul 2024 11:29:20 -0500 Subject: [PATCH 1/2] added multimanager example and benchmark --- examples/multi_mgr_benchmark.py | 104 +++++++ .../notebooks/multi_manager_example.ipynb | 280 ++++++++++++++++++ 2 files changed, 384 insertions(+) create mode 100644 examples/multi_mgr_benchmark.py create mode 100644 examples/notebooks/multi_manager_example.ipynb diff --git a/examples/multi_mgr_benchmark.py b/examples/multi_mgr_benchmark.py new file mode 100644 index 0000000..edcb8b2 --- /dev/null +++ b/examples/multi_mgr_benchmark.py @@ -0,0 +1,104 @@ +import h5pyd +import numpy as np +import random +import time + +DOMAIN_PATH = "/home/test_user1/test/multi_mgr_benchmark.h5" +DATASET_COUNT = 200 +DSET_SHAPE = (10,) +DSET_DTYPE = np.int32 + + +def generate_range(ds_shape: tuple): + # generate a tuple of random indices for one dataset + indices = [] + for axis_length in ds_shape: + index = random.randint(0, axis_length - 1) + indices.append(index) + return tuple(indices) + + +def generate_index_query(h5file): + # generate a list of index tuples + query = [] + for ds in h5file.values(): + ds_shape = ds.shape + indices = generate_range(ds_shape) + query.append(indices) + return query + + +def benchmark_multimanager(h5file, num=10): + """ + Benchmark retrieving one random entry from every dataset in an h5file + using the MultiManager. + """ + ds_names = list(h5file.keys()) + datsets = [h5file[name] for name in ds_names] + mm = h5pyd.MultiManager(datsets) + + # prepare queries to exclude from runtime + queries = [] + for i in range(num): + query = generate_index_query(h5file) + queries.append(query) + + # accessing the data + t0 = time.time() + for query in queries: + mm[query] + + runtime = time.time() - t0 + print(f"Mean runtime multimanager: {runtime/num:.4f} s") + # 100ms for case with 6 datasets + + +def benchmark_sequential_ds(h5file, num=10): + """ + Benchmark retrieving one random entry from every dataset in + an h5file by sequentially looping through the datasets + """ + # prepare queries to exclude this code from runtime + index_lists = [] + for i in range(num): + index_list = [] + for ds in h5file.values(): + indices = generate_range(ds.shape) + index_list.append(indices) + index_lists.append(index_list) + + # accessing the data + t0 = time.time() + for index_list in index_lists: + for indices, ds in zip(index_list, h5file.values()): + ds[indices] + + runtime = time.time() - t0 + print(f"Mean runtime sequentially: {runtime/num:.4f} s") + # ~ 400ms for case with 6 datasests + + +def run_benchmark(f): + """ + Initialize datasets if not done previously + Then run sequential and multimanager tests + """ + + for i in range(DATASET_COUNT): + dset_name = f"dset_{i:04d}" + if dset_name not in f: + data = np.random.randint(0, 100, size=DSET_SHAPE, dtype=DSET_DTYPE) + f.create_dataset(dset_name, data=data) + + benchmark_sequential_ds(f) + + benchmark_multimanager(f) + + +# +# main +# + +# create domain if it does not exist already +with h5pyd.File(DOMAIN_PATH, "a") as f: + run_benchmark(f) diff --git a/examples/notebooks/multi_manager_example.ipynb b/examples/notebooks/multi_manager_example.ipynb new file mode 100644 index 0000000..10c71cd --- /dev/null +++ b/examples/notebooks/multi_manager_example.ipynb @@ -0,0 +1,280 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "USE_H5PY = False # set to True to use h5py/hdf5lib instead\n", + "if USE_H5PY:\n", + " import h5py\n", + " from h5py import MultiManager\n", + "else:\n", + " import h5pyd as h5py # Use the \"as\" syntax for code compatibility\n", + " from h5pyd import MultiManager\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# create a new file\n", + "f = h5py.File(\"/home/test_user1/multi_try.h5\", mode=\"w\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# create some datasets\n", + "DSET_SHAPE = (10,)\n", + "DSET_DTYPE = np.int32\n", + "\n", + "# create 4 datasets\n", + "DSET_COUNT = 4\n", + "datasets = []\n", + "for i in range(DSET_COUNT):\n", + " dset = f.create_dataset(f\"dset_{i}\", shape=DSET_SHAPE, dtype=DSET_DTYPE)\n", + " datasets.append(dset)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# initialize some data to write\n", + "data_in = []\n", + "for n in range(DSET_COUNT):\n", + " arr = np.zeros(DSET_SHAPE, dtype=DSET_DTYPE)\n", + " arr[...] = list(range(n*100, n*100+DSET_SHAPE[0]))\n", + " data_in.append(arr)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# instantiate a MultiManager and use it to write to all the datasets simultaneously\n", + "mm = MultiManager(datasets)\n", + "mm[...] = data_in" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# verify what get saved to the first dataset\n", + "dset = f[\"dset_0\"]\n", + "dset[...]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# and the second dataset\n", + "dset = f[\"dset_1\"]\n", + "dset[...]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read all the data from all the daasets using the same MultiManager instance\n", + "data_out = mm[...]\n", + "len(data_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get the first item from the returned list\n", + "data_out[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# and the second item\n", + "data_out[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 2, 3], dtype=int32)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rather than reading all the data for a dataset, you can read a given selection\n", + "data_out = mm[0:4]\n", + "data_out[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# it's also possible to pass a list of selections and have each selection\n", + "# read from the corresponding dataset\n", + "selections = []\n", + "for n in range(DSET_COUNT):\n", + " s = slice(n, n+2, 1)\n", + " selections.append(s)\n", + "\n", + "data_out = mm.__getitem__(selections)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1], dtype=int32)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_out[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([101, 102], dtype=int32)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_out[1]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a4630784081e1b3777db662320cbe67b896d1762 Mon Sep 17 00:00:00 2001 From: John Readey Date: Wed, 10 Jul 2024 04:51:40 -0500 Subject: [PATCH 2/2] minor edits for multi mgr examples --- examples/multi_mgr_benchmark.py | 8 +- .../notebooks/multi_manager_example.ipynb | 187 +++++++----------- 2 files changed, 77 insertions(+), 118 deletions(-) diff --git a/examples/multi_mgr_benchmark.py b/examples/multi_mgr_benchmark.py index edcb8b2..3655e8a 100644 --- a/examples/multi_mgr_benchmark.py +++ b/examples/multi_mgr_benchmark.py @@ -1,4 +1,5 @@ import h5pyd +import logging import numpy as np import random import time @@ -34,8 +35,8 @@ def benchmark_multimanager(h5file, num=10): using the MultiManager. """ ds_names = list(h5file.keys()) - datsets = [h5file[name] for name in ds_names] - mm = h5pyd.MultiManager(datsets) + datasets = [h5file[name] for name in ds_names] + mm = h5pyd.MultiManager(datasets) # prepare queries to exclude from runtime queries = [] @@ -99,6 +100,9 @@ def run_benchmark(f): # main # +loglevel = logging.WARNING +logging.basicConfig(format='%(asctime)s %(message)s', level=loglevel) + # create domain if it does not exist already with h5pyd.File(DOMAIN_PATH, "a") as f: run_benchmark(f) diff --git a/examples/notebooks/multi_manager_example.ipynb b/examples/notebooks/multi_manager_example.ipynb index 10c71cd..2d7e965 100644 --- a/examples/notebooks/multi_manager_example.ipynb +++ b/examples/notebooks/multi_manager_example.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -46,21 +46,33 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data_in[0]: [0 1 2 3 4 5 6 7 8 9]\n", + "data_in[1]: [100 101 102 103 104 105 106 107 108 109]\n", + "data_in[2]: [200 201 202 203 204 205 206 207 208 209]\n", + "data_in[3]: [300 301 302 303 304 305 306 307 308 309]\n" + ] + } + ], "source": [ "# initialize some data to write\n", "data_in = []\n", "for n in range(DSET_COUNT):\n", " arr = np.zeros(DSET_SHAPE, dtype=DSET_DTYPE)\n", " arr[...] = list(range(n*100, n*100+DSET_SHAPE[0]))\n", - " data_in.append(arr)\n" + " data_in.append(arr)\n", + " print(f\"data_in[{n}]: {arr}\")\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -71,51 +83,31 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# verify what get saved to the first dataset\n", - "dset = f[\"dset_0\"]\n", - "dset[...]" - ] - }, - { - "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "dset_0: [0 1 2 3 4 5 6 7 8 9]\n", + "dset_1: [100 101 102 103 104 105 106 107 108 109]\n", + "dset_2: [200 201 202 203 204 205 206 207 208 209]\n", + "dset_3: [300 301 302 303 304 305 306 307 308 309]\n" + ] } ], "source": [ - "# and the second dataset\n", - "dset = f[\"dset_1\"]\n", - "dset[...]" + "# verify what get saved to each dataset\n", + "for n in range(DSET_COUNT):\n", + " dset_name = f\"dset_{n}\"\n", + " dset = f[dset_name]\n", + " print(f\"{dset_name}: {dset[...]}\")" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -124,7 +116,7 @@ "4" ] }, - "execution_count": 21, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -137,71 +129,52 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 8, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "data_out[0]: [0 1 2 3 4 5 6 7 8 9]\n", + "data_out[1]: [100 101 102 103 104 105 106 107 108 109]\n", + "data_out[2]: [200 201 202 203 204 205 206 207 208 209]\n", + "data_out[3]: [300 301 302 303 304 305 306 307 308 309]\n" + ] } ], "source": [ - "# get the first item from the returned list\n", - "data_out[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# and the second item\n", - "data_out[1]" + "# dump the data that was returned\n", + "for n in range(DSET_COUNT):\n", + " print(f\"data_out[{n}]: {data_out[n]}\")" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 9, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "array([0, 1, 2, 3], dtype=int32)" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "data_out[0]: [0 1 2 3]\n", + "data_out[1]: [100 101 102 103]\n", + "data_out[2]: [200 201 202 203]\n", + "data_out[3]: [300 301 302 303]\n" + ] } ], "source": [ "# rather than reading all the data for a dataset, you can read a given selection\n", "data_out = mm[0:4]\n", - "data_out[0]" + "for n in range(DSET_COUNT):\n", + " print(f\"data_out[{n}]: {data_out[n]}\")" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -217,42 +190,24 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 11, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "array([0, 1], dtype=int32)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_out[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([101, 102], dtype=int32)" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "data_out[0]: [0 1]\n", + "data_out[1]: [101 102]\n", + "data_out[2]: [202 203]\n", + "data_out[3]: [303 304]\n" + ] } ], "source": [ - "data_out[1]" + "# dump the data that was returned\n", + "for n in range(DSET_COUNT):\n", + " print(f\"data_out[{n}]: {data_out[n]}\")" ] } ],