From 1e67d65c8f452e04562d9e33aba1bde42e7c8384 Mon Sep 17 00:00:00 2001
From: John Readey
Date: Tue, 29 Oct 2024 21:06:15 -0500
Subject: [PATCH] added example using field selection (#228)

---
 .../notebooks/compound_type_example.ipynb     | 212 ++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 examples/notebooks/compound_type_example.ipynb

diff --git a/examples/notebooks/compound_type_example.ipynb b/examples/notebooks/compound_type_example.ipynb
new file mode 100644
index 0000000..4d6af83
--- /dev/null
+++ b/examples/notebooks/compound_type_example.ipynb
@@ -0,0 +1,212 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "import numpy as np\n",
+    "USE_H5PY=False\n",
+    "if USE_H5PY:\n",
+    "    import h5py\n",
+    "    filepath = \"./compound.h5\"\n",
+    "else:\n",
+    "    import h5pyd as h5py\n",
+    "    filepath = \"/home/test_user1/test/compound.h5\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a new domain/file\n",
+    "f = h5py.File(filepath, \"w\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a numpy dtype with 260 fields (S7 fits the 7-byte values written below):\n",
+    "# A0, A1, A2, ..., Z7, Z8, Z9\n",
+    "fields = []\n",
+    "for i in range(26):\n",
+    "    ch1 = chr(ord('A') + i)\n",
+    "    for j in range(10):\n",
+    "        ch2 = chr(ord('0') + j)\n",
+    "        fields.append((ch1+ch2, \"S7\"))\n",
+    "dt = np.dtype(fields)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# create a dataset using the dtype\n",
+    "NUM_ROWS = 10000\n",
+    "dset = f.create_dataset(\"dset\", (NUM_ROWS,), dtype=dt)\n",
+    "dset\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# write a 7-byte value such as b'0042_C4' into every field of every row\n",
+    "arr = np.zeros((NUM_ROWS,), dtype=dt)\n",
+    "for i in range(NUM_ROWS):\n",
+    "    row = arr[i]\n",
+    "    for name in dt.names:\n",
+    "        row[name] = f\"{i:04d}_{name}\".encode()\n",
+    "dset[:] = arr[:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['C4', 'P6', 'V0', 'S8', 'P4', 'B5', 'L1', 'E7']"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# get a random set of field names.\n",
+    "# k controls the max number of names returned\n",
+    "names = random.choices(dt.names, k=10)\n",
+    "names = list(set(names))\n",
+    "names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 27.1 ms, sys: 16.1 ms, total: 43.2 ms\n",
+      "Wall time: 93.8 ms\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([(b'0000_C4', b'0000_P6', b'0000_V0', b'0000_S8', b'0000_P4', b'0000_B5', b'0000_L1', b'0000_E7'),\n",
+       "       (b'0001_C4', b'0001_P6', b'0001_V0', b'0001_S8', b'0001_P4', b'0001_B5', b'0001_L1', b'0001_E7'),\n",
+       "       (b'0002_C4', b'0002_P6', b'0002_V0', b'0002_S8', b'0002_P4', b'0002_B5', b'0002_L1', b'0002_E7'),\n",
+       "       ...,\n",
+       "       (b'9997_C4', b'9997_P6', b'9997_V0', b'9997_S8', b'9997_P4', b'9997_B5', b'9997_L1', b'9997_E7'),\n",
+       "       (b'9998_C4', b'9998_P6', b'9998_V0', b'9998_S8', b'9998_P4', b'9998_B5', b'9998_L1', b'9998_E7'),\n",
+       "       (b'9999_C4', b'9999_P6', b'9999_V0', b'9999_S8', b'9999_P4', b'9999_B5', b'9999_L1', b'9999_E7')],\n",
+       "      dtype={'names': ['C4', 'P6', 'V0', 'S8', 'P4', 'B5', 'L1', 'E7'], 'formats': ['S7', 'S7', 'S7', 'S7', 'S7', 'S7', 'S7', 'S7'], 'offsets': [168, 1092, 1470, 1316, 1078, 105, 777, 329], 'itemsize': 1820})"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Get the dataset values and then return the field selection\n",
+    "%time dset[:][names]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 3.92 ms, sys: 0 ns, total: 3.92 ms\n",
+      "Wall time: 20.7 ms\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "array([(b'0000_C4', b'0000_P6', b'0000_V0', b'0000_S8', b'0000_P4', b'0000_B5', b'0000_L1', b'0000_E7'),\n",
+       "       (b'0001_C4', b'0001_P6', b'0001_V0', b'0001_S8', b'0001_P4', b'0001_B5', b'0001_L1', b'0001_E7'),\n",
+       "       (b'0002_C4', b'0002_P6', b'0002_V0', b'0002_S8', b'0002_P4', b'0002_B5', b'0002_L1', b'0002_E7'),\n",
+       "       ...,\n",
+       "       (b'9997_C4', b'9997_P6', b'9997_V0', b'9997_S8', b'9997_P4', b'9997_B5', b'9997_L1', b'9997_E7'),\n",
+       "       (b'9998_C4', b'9998_P6', b'9998_V0', b'9998_S8', b'9998_P4', b'9998_B5', b'9998_L1', b'9998_E7'),\n",
+       "       (b'9999_C4', b'9999_P6', b'9999_V0', b'9999_S8', b'9999_P4', b'9999_B5', b'9999_L1', b'9999_E7')],\n",
+       "      dtype=[('C4', 'S7'), ('P6', 'S7'), ('V0', 'S7'), ('S8', 'S7'), ('P4', 'S7'), ('B5', 'S7'), ('L1', 'S7'), ('E7', 'S7')])"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Have HSDS (or HDF5 lib) return just the values for the given set of field names\n",
+    "# Will return same values as above cell, but should be faster as less data needs\n",
+    "# to be transferred\n",
+    "%time dset.fields(names)[:]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "hs",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}