From 45110624c2c4a8ce12558517fd7b644932f6eee3 Mon Sep 17 00:00:00 2001
From: John Readey <jreadey@hdfgroup.org>
Date: Tue, 9 Jul 2024 11:29:20 -0500
Subject: [PATCH 1/2] added multimanager example and benchmark

---
 examples/multi_mgr_benchmark.py               | 104 +++++++
 .../notebooks/multi_manager_example.ipynb     | 280 ++++++++++++++++++
 2 files changed, 384 insertions(+)
 create mode 100644 examples/multi_mgr_benchmark.py
 create mode 100644 examples/notebooks/multi_manager_example.ipynb

diff --git a/examples/multi_mgr_benchmark.py b/examples/multi_mgr_benchmark.py
new file mode 100644
index 0000000..edcb8b2
--- /dev/null
+++ b/examples/multi_mgr_benchmark.py
@@ -0,0 +1,104 @@
+import h5pyd
+import numpy as np
+import random
+import time
+
+DOMAIN_PATH = "/home/test_user1/test/multi_mgr_benchmark.h5"
+DATASET_COUNT = 200
+DSET_SHAPE = (10,)
+DSET_DTYPE = np.int32
+
+
+def generate_range(ds_shape: tuple):
+    # generate a tuple of random indices for one dataset
+    indices = []
+    for axis_length in ds_shape:
+        index = random.randint(0, axis_length - 1)
+        indices.append(index)
+    return tuple(indices)
+
+
+def generate_index_query(h5file):
+    # generate a list of index tuples
+    query = []
+    for ds in h5file.values():
+        ds_shape = ds.shape
+        indices = generate_range(ds_shape)
+        query.append(indices)
+    return query
+
+
+def benchmark_multimanager(h5file, num=10):
+    """
+    Benchmark retrieving one random entry from every dataset in an h5file
+    using the MultiManager.
+    """
+    ds_names = list(h5file.keys())
+    datsets = [h5file[name] for name in ds_names]
+    mm = h5pyd.MultiManager(datsets)
+
+    # prepare queries to exclude from runtime
+    queries = []
+    for i in range(num):
+        query = generate_index_query(h5file)
+        queries.append(query)
+
+    # accessing the data
+    t0 = time.time()
+    for query in queries:
+        mm[query]
+
+    runtime = time.time() - t0
+    print(f"Mean runtime multimanager: {runtime/num:.4f} s")
+    # 100ms for case with 6 datasets
+
+
+def benchmark_sequential_ds(h5file, num=10):
+    """
+    Benchmark retrieving one random entry from every dataset in
+    an h5file by sequentially looping through the datasets
+    """
+    # prepare queries to exclude this code from runtime
+    index_lists = []
+    for i in range(num):
+        index_list = []
+        for ds in h5file.values():
+            indices = generate_range(ds.shape)
+            index_list.append(indices)
+        index_lists.append(index_list)
+
+    # accessing the data
+    t0 = time.time()
+    for index_list in index_lists:
+        for indices, ds in zip(index_list, h5file.values()):
+            ds[indices]
+
+    runtime = time.time() - t0
+    print(f"Mean runtime sequentially: {runtime/num:.4f} s")
+    # ~ 400ms for case with 6 datasets
+
+
+def run_benchmark(f):
+    """
+    Initialize datasets if not done previously
+    Then run sequential and multimanager tests
+    """
+
+    for i in range(DATASET_COUNT):
+        dset_name = f"dset_{i:04d}"
+        if dset_name not in f:
+            data = np.random.randint(0, 100, size=DSET_SHAPE, dtype=DSET_DTYPE)
+            f.create_dataset(dset_name, data=data)
+
+    benchmark_sequential_ds(f)
+
+    benchmark_multimanager(f)
+
+
+#
+# main
+#
+
+# create domain if it does not exist already
+with h5pyd.File(DOMAIN_PATH, "a") as f:
+    run_benchmark(f)
diff --git a/examples/notebooks/multi_manager_example.ipynb b/examples/notebooks/multi_manager_example.ipynb
new file mode 100644
index 0000000..10c71cd
--- /dev/null
+++ b/examples/notebooks/multi_manager_example.ipynb
@@ -0,0 +1,280 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "USE_H5PY = False  # set to True to use h5py/hdf5lib instead\n",
+    "if USE_H5PY:\n",
+    "    import h5py\n",
+    "    from h5py import MultiManager\n",
+    "else:\n",
+    "    import h5pyd as h5py  # Use the \"as\" syntax for code compatibility\n",
+    "    from h5pyd import MultiManager\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a new file\n",
+    "f = h5py.File(\"/home/test_user1/multi_try.h5\", mode=\"w\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create some datasets\n",
+    "DSET_SHAPE = (10,)\n",
+    "DSET_DTYPE = np.int32\n",
+    "\n",
+    "# create 4 datasets\n",
+    "DSET_COUNT = 4\n",
+    "datasets = []\n",
+    "for i in range(DSET_COUNT):\n",
+    "    dset = f.create_dataset(f\"dset_{i}\", shape=DSET_SHAPE, dtype=DSET_DTYPE)\n",
+    "    datasets.append(dset)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# initialize some data to write\n",
+    "data_in = []\n",
+    "for n in range(DSET_COUNT):\n",
+    "    arr = np.zeros(DSET_SHAPE, dtype=DSET_DTYPE)\n",
+    "    arr[...] = list(range(n*100, n*100+DSET_SHAPE[0]))\n",
+    "    data_in.append(arr)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# instantiate a MultiManager and use it to write to all the datasets simultaneously\n",
+    "mm = MultiManager(datasets)\n",
+    "mm[...] = data_in"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# verify what get saved to the first dataset\n",
+    "dset = f[\"dset_0\"]\n",
+    "dset[...]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# and the second dataset\n",
+    "dset = f[\"dset_1\"]\n",
+    "dset[...]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Read all the data from all the datasets using the same MultiManager instance\n",
+    "data_out = mm[...]\n",
+    "len(data_out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# get the first item from the returned list\n",
+    "data_out[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# and the second item\n",
+    "data_out[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1, 2, 3], dtype=int32)"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# rather than reading all the data for a dataset, you can read a given selection\n",
+    "data_out = mm[0:4]\n",
+    "data_out[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# it's also possible to pass a list of selections and have each selection\n",
+    "# read from the corresponding dataset\n",
+    "selections = []\n",
+    "for n in range(DSET_COUNT):\n",
+    "    s = slice(n, n+2, 1)\n",
+    "    selections.append(s)\n",
+    "\n",
+    "data_out = mm.__getitem__(selections)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1], dtype=int32)"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data_out[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([101, 102], dtype=int32)"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data_out[1]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From a4630784081e1b3777db662320cbe67b896d1762 Mon Sep 17 00:00:00 2001
From: John Readey <jreadey@hdfgroup.org>
Date: Wed, 10 Jul 2024 04:51:40 -0500
Subject: [PATCH 2/2] minor edits for multi mgr examples

---
 examples/multi_mgr_benchmark.py               |   8 +-
 .../notebooks/multi_manager_example.ipynb     | 187 +++++++-----------
 2 files changed, 77 insertions(+), 118 deletions(-)

diff --git a/examples/multi_mgr_benchmark.py b/examples/multi_mgr_benchmark.py
index edcb8b2..3655e8a 100644
--- a/examples/multi_mgr_benchmark.py
+++ b/examples/multi_mgr_benchmark.py
@@ -1,4 +1,5 @@
 import h5pyd
+import logging
 import numpy as np
 import random
 import time
@@ -34,8 +35,8 @@ def benchmark_multimanager(h5file, num=10):
     using the MultiManager.
     """
     ds_names = list(h5file.keys())
-    datsets = [h5file[name] for name in ds_names]
-    mm = h5pyd.MultiManager(datsets)
+    datasets = [h5file[name] for name in ds_names]
+    mm = h5pyd.MultiManager(datasets)
 
     # prepare queries to exclude from runtime
     queries = []
@@ -99,6 +100,9 @@ def run_benchmark(f):
 # main
 #
 
+loglevel = logging.WARNING
+logging.basicConfig(format='%(asctime)s %(message)s', level=loglevel)
+
 # create domain if it does not exist already
 with h5pyd.File(DOMAIN_PATH, "a") as f:
     run_benchmark(f)
diff --git a/examples/notebooks/multi_manager_example.ipynb b/examples/notebooks/multi_manager_example.ipynb
index 10c71cd..2d7e965 100644
--- a/examples/notebooks/multi_manager_example.ipynb
+++ b/examples/notebooks/multi_manager_example.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -46,21 +46,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "data_in[0]: [0 1 2 3 4 5 6 7 8 9]\n",
+      "data_in[1]: [100 101 102 103 104 105 106 107 108 109]\n",
+      "data_in[2]: [200 201 202 203 204 205 206 207 208 209]\n",
+      "data_in[3]: [300 301 302 303 304 305 306 307 308 309]\n"
+     ]
+    }
+   ],
    "source": [
     "# initialize some data to write\n",
     "data_in = []\n",
     "for n in range(DSET_COUNT):\n",
     "    arr = np.zeros(DSET_SHAPE, dtype=DSET_DTYPE)\n",
     "    arr[...] = list(range(n*100, n*100+DSET_SHAPE[0]))\n",
-    "    data_in.append(arr)\n"
+    "    data_in.append(arr)\n",
+    "    print(f\"data_in[{n}]: {arr}\")\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -71,51 +83,31 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# verify what get saved to the first dataset\n",
-    "dset = f[\"dset_0\"]\n",
-    "dset[...]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
-      ]
-     },
-     "execution_count": 20,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "dset_0: [0 1 2 3 4 5 6 7 8 9]\n",
+      "dset_1: [100 101 102 103 104 105 106 107 108 109]\n",
+      "dset_2: [200 201 202 203 204 205 206 207 208 209]\n",
+      "dset_3: [300 301 302 303 304 305 306 307 308 309]\n"
+     ]
     }
    ],
    "source": [
-    "# and the second dataset\n",
-    "dset = f[\"dset_1\"]\n",
-    "dset[...]"
+    "# verify what gets saved to each dataset\n",
+    "for n in range(DSET_COUNT):\n",
+    "    dset_name = f\"dset_{n}\"\n",
+    "    dset = f[dset_name]\n",
+    "    print(f\"{dset_name}: {dset[...]}\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -124,7 +116,7 @@
        "4"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -137,71 +129,52 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "data_out[0]: [0 1 2 3 4 5 6 7 8 9]\n",
+      "data_out[1]: [100 101 102 103 104 105 106 107 108 109]\n",
+      "data_out[2]: [200 201 202 203 204 205 206 207 208 209]\n",
+      "data_out[3]: [300 301 302 303 304 305 306 307 308 309]\n"
+     ]
     }
    ],
    "source": [
-    "# get the first item from the returned list\n",
-    "data_out[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109], dtype=int32)"
-      ]
-     },
-     "execution_count": 23,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# and the second item\n",
-    "data_out[1]"
+    "# dump the data that was returned\n",
+    "for n in range(DSET_COUNT):\n",
+    "    print(f\"data_out[{n}]: {data_out[n]}\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "array([0, 1, 2, 3], dtype=int32)"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "data_out[0]: [0 1 2 3]\n",
+      "data_out[1]: [100 101 102 103]\n",
+      "data_out[2]: [200 201 202 203]\n",
+      "data_out[3]: [300 301 302 303]\n"
+     ]
     }
    ],
    "source": [
     "# rather than reading all the data for a dataset, you can read a given selection\n",
     "data_out = mm[0:4]\n",
-    "data_out[0]"
+    "for n in range(DSET_COUNT):\n",
+    "    print(f\"data_out[{n}]: {data_out[n]}\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -217,42 +190,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "array([0, 1], dtype=int32)"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "data_out[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([101, 102], dtype=int32)"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "data_out[0]: [0 1]\n",
+      "data_out[1]: [101 102]\n",
+      "data_out[2]: [202 203]\n",
+      "data_out[3]: [303 304]\n"
+     ]
     }
    ],
    "source": [
-    "data_out[1]"
+    "# dump the data that was returned\n",
+    "for n in range(DSET_COUNT):\n",
+    "    print(f\"data_out[{n}]: {data_out[n]}\")"
    ]
   }
  ],