From 3c485fc6412c9d578272089f391e4940bbb7e599 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 4 Feb 2025 10:48:58 -0500 Subject: [PATCH] Add minimal API docs --- docs/{demo => }/akimbo-demo.ipynb | 10 +++++++ docs/api.rst | 36 ++++++++++++++----------- docs/conf.py | 8 +----- docs/cudf-ak.ipynb | 1 + docs/demo/.gitignore | 1 - {example => docs/example}/cuda_env.yaml | 0 {example => docs/example}/cudf-ak.ipynb | 16 ++++++++++- docs/index.rst | 9 +++++++ docs/install.rst | 6 ++++- docs/{demo => }/muons_dataset1.svg | 0 docs/{demo => }/muons_dataset_df.svg | 0 src/akimbo/cudf.py | 6 +++++ src/akimbo/io.py | 14 ++++++++-- src/akimbo/ray.py | 6 +++++ src/akimbo/spark.py | 6 +++++ 15 files changed, 92 insertions(+), 27 deletions(-) rename docs/{demo => }/akimbo-demo.ipynb (99%) create mode 120000 docs/cudf-ak.ipynb delete mode 100644 docs/demo/.gitignore rename {example => docs/example}/cuda_env.yaml (100%) rename {example => docs/example}/cudf-ak.ipynb (97%) rename docs/{demo => }/muons_dataset1.svg (100%) rename docs/{demo => }/muons_dataset_df.svg (100%) diff --git a/docs/demo/akimbo-demo.ipynb b/docs/akimbo-demo.ipynb similarity index 99% rename from docs/demo/akimbo-demo.ipynb rename to docs/akimbo-demo.ipynb index 7b992c1..845e889 100644 --- a/docs/demo/akimbo-demo.ipynb +++ b/docs/akimbo-demo.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "8b1be0e8", + "metadata": {}, + "source": [ + "# HEP Demo\n", + "\n", + "Here we show a plausible small workflow on a real excerpt of particle data." + ] + }, { "cell_type": "code", "execution_count": 1, diff --git a/docs/api.rst b/docs/api.rst index a2282f8..e48575c 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,21 +1,6 @@ akimbo ============== -.. currentmodule:: akimbo - -Top Level Functions -~~~~~~~~~~~~~~~~~~~ - -.. 
autosummary:: - :toctree: generated/ - - read_parquet - read_json - read_avro - get_parquet_schema - get_json_schema - get_avro_schema - Accessor ~~~~~~~~ @@ -38,6 +23,8 @@ Backends akimbo.dask.DaskAwkwardAccessor akimbo.polars.PolarsAwkwardAccessor akimbo.cudf.CudfAwkwardAccessor + akimbo.ray.RayAccessor + akimbo.spark.SparkAccessor .. autoclass:: akimbo.pandas.PandasAwkwardAccessor @@ -47,6 +34,25 @@ Backends .. autoclass:: akimbo.cudf.CudfAwkwardAccessor +.. autoclass:: akimbo.ray.RayAccessor + +.. autoclass:: akimbo.spark.SparkAccessor + +Top Level Functions +~~~~~~~~~~~~~~~~~~~ +.. currentmodule:: akimbo + + +.. autosummary:: + :toctree: generated/ + + read_parquet + read_json + read_avro + get_parquet_schema + get_json_schema + get_avro_schema + Extensions ~~~~~~~~~~ diff --git a/docs/conf.py b/docs/conf.py index 461cf3c..ebdb78c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,13 +24,7 @@ ] templates_path = ["_templates"] -exclude_patterns = [ - "_build", - "Thumbs.db", - ".DS_Store", - "**.ipynb_checkpoints", - "**akimbo-demo.ipynb", -] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/cudf-ak.ipynb b/docs/cudf-ak.ipynb new file mode 120000 index 0000000..8f765e8 --- /dev/null +++ b/docs/cudf-ak.ipynb @@ -0,0 +1 @@ +example/cudf-ak.ipynb \ No newline at end of file diff --git a/docs/demo/.gitignore b/docs/demo/.gitignore deleted file mode 100644 index 4bed5da..0000000 --- a/docs/demo/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.parquet diff --git a/example/cuda_env.yaml b/docs/example/cuda_env.yaml similarity index 100% rename from example/cuda_env.yaml rename to docs/example/cuda_env.yaml diff --git a/example/cudf-ak.ipynb b/docs/example/cudf-ak.ipynb similarity index 97% rename from example/cudf-ak.ipynb rename to docs/example/cudf-ak.ipynb index f786c4e..8f3e938 100644 --- a/example/cudf-ak.ipynb +++ b/docs/example/cudf-ak.ipynb 
@@ -1,10 +1,19 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "ee00a3e2", + "metadata": {}, + "source": [ + "# GPU backend" + ] + }, { "cell_type": "markdown", "id": "58d18a3a-45b1-425a-b822-e8be0a6c0bc0", "metadata": {}, "source": [ + "This example depends on data in a file that can be made in the following way.\n", "\n", "```python\n", "import awkward as ak\n", @@ -14,6 +23,11 @@ " [[6, 7]]] * N\n", " arr = ak.Array({\"a\": part})\n", " ak.to_parquet(arr, fn, extensionarray=False)\n", + "```\n", + "\n", + "The file cuda_env.yaml can be used to create a functional environment using conda:\n", + "```bash\n", + "$ conda env create -f docs/example/cuda_env.yaml\n", "```" ] }, @@ -617,7 +631,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.0" + "version": "3.10.9" } }, "nbformat": 4, diff --git a/docs/index.rst b/docs/index.rst index c3daebd..1ec24f4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,8 @@ identical syntax: - dask.dataframe - polars - cuDF +- ray dataset +- pyspark numpy-like API @@ -111,6 +113,13 @@ the ``akimbo`` system, you can apply these methods to ragged/nested dataframes. install.rst quickstart.ipynb +.. toctree:: + :maxdepth: 1 + :caption: Demos + + akimbo-demo.ipynb + cudf-ak.ipynb + .. toctree:: :maxdepth: 1 :caption: API Reference diff --git a/docs/install.rst b/docs/install.rst index 6c05a4f..50fbd75 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -5,7 +5,11 @@ Requirements ~~~~~~~~~~~~ To install ``akimbo`` you will need ``awkward`` and -one of the backend libraries: ``pandas``, ``dask`` or ``polars``. +one of the backend libraries: ``pandas``, ``dask``, ``cuDF``, ``ray.data``, +``pyspark`` or ``polars``. Each of these has various installation options; +please see their respective documentation. + +``akimbo`` depends on ``pyarrow`` and ``awkward``. 
From PyPI diff --git a/docs/demo/muons_dataset1.svg b/docs/muons_dataset1.svg similarity index 100% rename from docs/demo/muons_dataset1.svg rename to docs/muons_dataset1.svg diff --git a/docs/demo/muons_dataset_df.svg b/docs/muons_dataset_df.svg similarity index 100% rename from docs/demo/muons_dataset_df.svg rename to docs/muons_dataset_df.svg diff --git a/src/akimbo/cudf.py b/src/akimbo/cudf.py index 6346a68..13cc3e2 100644 --- a/src/akimbo/cudf.py +++ b/src/akimbo/cudf.py @@ -107,6 +107,12 @@ def f(lay, method=meth, **kwargs): class CudfAwkwardAccessor(Accessor): + """Operations on cuDF dataframes on the GPU. + + Data are kept in GPU memory and use views rather than copies where + possible. + """ + series_type = Series dataframe_type = DataFrame diff --git a/src/akimbo/io.py b/src/akimbo/io.py index afaf496..cd32765 100644 --- a/src/akimbo/io.py +++ b/src/akimbo/io.py @@ -6,6 +6,7 @@ def ak_to_series(ds, backend="pandas", extract=True): + """Make backend-specific series from data""" if backend == "pandas": import akimbo.pandas @@ -23,6 +24,9 @@ def ak_to_series(ds, backend="pandas", extract=True): import akimbo.cudf s = akimbo.cudf.CudfAwkwardAccessor._to_output(ds) + elif backend in ["ray", "spark"]: + raise ValueError("Backend only supports dataframes, not series") + else: raise ValueError("Backend must be in {'pandas', 'polars', 'dask'}") if extract and ds.fields: @@ -30,6 +34,8 @@ def ak_to_series(ds, backend="pandas", extract=True): return s +# TODO: read_parquet should use native versions rather than convert. 
This version +# is OK for pandas def read_parquet( url: str, storage_options: dict | None = None, @@ -60,6 +66,8 @@ def read_parquet( return ak_to_series(ds, backend, extract=extract) +# TODO: should be a map over input files, maybe with newline byte blocks +# as in dask def read_json( url: str, storage_options: dict | None = None, @@ -124,6 +132,8 @@ def get_json_schema( return layout_to_jsonschema(arr.layout) +# TODO: should be a map over input files, maybe with newline byte blocks +# as in dask def read_avro( url: str, storage_options: dict | None = None, @@ -205,9 +215,9 @@ def join( merge = _merge counts = np.empty(len(table1), dtype="uint64") - # TODO: the line below over-allocates, can switch to somehing growable + # TODO: the line below over-allocates, can switch to something growable matches = np.empty(len(table2), dtype="uint64") - # TODO: to_numpy(allow_missong) makes this a bit faster, but is not + # TODO: to_numpy(allow_missing) makes this a bit faster, but is not # not GPU general counts, matches, ind = merge(table1[key], table2[key], counts, matches) matches.resize(int(ind), refcheck=False) diff --git a/src/akimbo/ray.py b/src/akimbo/ray.py index 80f5bca..f96bd35 100644 --- a/src/akimbo/ray.py +++ b/src/akimbo/ray.py @@ -59,6 +59,12 @@ def __dir__(self): class RayAccessor(Accessor): + """Operations on ray.data.Dataset dataframes. + + This is a lazy backend, and operates partition-wise. It predicts the schema + of each operation by running with an empty dataframe of the correct type. + """ + dataframe_type = rd.Dataset series_type = None # only has "dataframe like" subaccessors = Accessor.subaccessors.copy() diff --git a/src/akimbo/spark.py b/src/akimbo/spark.py index 5f0e729..99f60cd 100644 --- a/src/akimbo/spark.py +++ b/src/akimbo/spark.py @@ -62,6 +62,12 @@ def __dir__(self): class SparkAccessor(Accessor): + """Operations on pyspark dataframes. + + This is a lazy backend, and operates partition-wise. 
It predicts the schema + of each operation by running with an empty dataframe of the correct type. + """ + subaccessors = Accessor.subaccessors.copy() dataframe_type = sdf