Commit: Add minimal API docs

martindurant committed Feb 4, 2025
1 parent 7f75ffd commit 3c485fc
Showing 15 changed files with 92 additions and 27 deletions.
10 changes: 10 additions & 0 deletions docs/demo/akimbo-demo.ipynb → docs/akimbo-demo.ipynb
@@ -1,5 +1,15 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8b1be0e8",
"metadata": {},
"source": [
"# HEP Demo\n",
"\n",
"Here we show a plausible small workflow on a real excerpt of particle data."
]
},
{
"cell_type": "code",
"execution_count": 1,
36 changes: 21 additions & 15 deletions docs/api.rst
@@ -1,21 +1,6 @@
akimbo
==============

.. currentmodule:: akimbo

Top Level Functions
~~~~~~~~~~~~~~~~~~~

.. autosummary::
:toctree: generated/

read_parquet
read_json
read_avro
get_parquet_schema
get_json_schema
get_avro_schema

Accessor
~~~~~~~~

@@ -38,6 +23,8 @@ Backends
akimbo.dask.DaskAwkwardAccessor
akimbo.polars.PolarsAwkwardAccessor
akimbo.cudf.CudfAwkwardAccessor
akimbo.ray.RayAccessor
akimbo.spark.SparkAccessor

.. autoclass:: akimbo.pandas.PandasAwkwardAccessor

Expand All @@ -47,6 +34,25 @@ Backends

.. autoclass:: akimbo.cudf.CudfAwkwardAccessor

.. autoclass:: akimbo.ray.RayAccessor

.. autoclass:: akimbo.spark.SparkAccessor
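
As a quick orientation on how these accessor classes are used (not shown in this commit): importing a backend module is what registers the ``.ak`` namespace on that library's objects. The sketch below assumes the pandas backend, Arrow-backed nested data, and that awkward-style methods such as ``count`` are forwarded through the accessor.

```python
import pandas as pd
import pyarrow as pa
import akimbo.pandas  # noqa: F401 -- importing is assumed to register the .ak accessor

# A ragged column stored with an Arrow list dtype (illustrative data).
s = pd.Series([[1, 2], [3], []], dtype=pd.ArrowDtype(pa.list_(pa.int64())))

# Hypothetical awkward-style call forwarded via the accessor.
print(s.ak.count())
```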

Top Level Functions
~~~~~~~~~~~~~~~~~~~
.. currentmodule:: akimbo


.. autosummary::
:toctree: generated/

read_parquet
read_json
read_avro
get_parquet_schema
get_json_schema
get_avro_schema
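
A hedged sketch of how these top-level readers fit together; the file path is a placeholder, and the ``backend=`` keyword is inferred from the call to ``ak_to_series`` further down in this commit rather than from a documented signature.

```python
import akimbo

# "data.parquet" is a placeholder path for illustration only.
schema = akimbo.get_parquet_schema("data.parquet")              # inspect nested structure first
series = akimbo.read_parquet("data.parquet", backend="pandas")  # backend keyword assumed
```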


Extensions
~~~~~~~~~~
8 changes: 1 addition & 7 deletions docs/conf.py
@@ -24,13 +24,7 @@
]

templates_path = ["_templates"]
exclude_patterns = [
"_build",
"Thumbs.db",
".DS_Store",
"**.ipynb_checkpoints",
"**akimbo-demo.ipynb",
]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"]


# -- Options for HTML output -------------------------------------------------
1 change: 1 addition & 0 deletions docs/cudf-ak.ipynb
1 change: 0 additions & 1 deletion docs/demo/.gitignore

This file was deleted.

File renamed without changes.
16 changes: 15 additions & 1 deletion example/cudf-ak.ipynb → docs/example/cudf-ak.ipynb
@@ -1,10 +1,19 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "ee00a3e2",
"metadata": {},
"source": [
"# GPU backend"
]
},
{
"cell_type": "markdown",
"id": "58d18a3a-45b1-425a-b822-e8be0a6c0bc0",
"metadata": {},
"source": [
"This example depends on data in a file that can be made in the following way.\n",
"\n",
"```python\n",
"import awkward as ak\n",
@@ -14,6 +23,11 @@
" [[6, 7]]] * N\n",
" arr = ak.Array({\"a\": part})\n",
" ak.to_parquet(arr, fn, extensionarray=False)\n",
"```\n",
"\n",
"The file cuda-env.yaml can be used to create a functional environment using conda:\n",
"```bash\n",
"$ conda env create -f example/cuda-env.yaml\n",
"```"
]
},
@@ -617,7 +631,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
"version": "3.10.9"
}
},
"nbformat": 4,
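
To connect the notebook above to the accessor it demonstrates, here is a hedged usage sketch; the file name is a placeholder for the Parquet file generated by the cell shown, and the ``.ak`` namespace and method names are assumptions rather than text from this diff.

```python
import cudf
import akimbo.cudf  # noqa: F401 -- assumed to register the .ak accessor on cuDF objects

df = cudf.read_parquet("nested.parquet")  # placeholder for the file made above

# Hypothetical awkward-style calls; per the CudfAwkwardAccessor docstring added
# in this commit, data stay in GPU memory and views are preferred over copies.
lengths = df["a"].ak.num()      # list lengths per row
flat = df["a"].ak.flatten()     # one level of nesting removed
```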
9 changes: 9 additions & 0 deletions docs/index.rst
@@ -24,6 +24,8 @@ identical syntax:
- dask.dataframe
- polars
- cuDF
- ray dataset
- pyspark
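
To make "identical syntax" concrete, the same accessor call sketched for the polars backend (the ``.ak`` registration and the ``count`` method are assumptions for illustration, mirroring the pandas example earlier):

```python
import polars as pl
import akimbo.polars  # noqa: F401 -- assumed to register the .ak namespace on polars

s = pl.Series("a", [[1, 2], [3], []])  # ragged data, inferred as a List column
print(s.ak.count())  # spelled the same way as on the other backends
```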


numpy-like API
@@ -111,6 +113,13 @@ the ``akimbo`` system, you can apply these methods to ragged/nested dataframes.
install.rst
quickstart.ipynb

.. toctree::
:maxdepth: 1
:caption: Demos

akimbo-demo.ipynb
cudf-ak.ipynb

.. toctree::
:maxdepth: 1
:caption: API Reference
6 changes: 5 additions & 1 deletion docs/install.rst
@@ -5,7 +5,11 @@ Requirements
~~~~~~~~~~~~

To install ``akimbo`` you will need ``awkward`` and
one of the backend libraries: ``pandas``, ``dask`` or ``polars``.
one of the backend libraries: ``pandas``, ``dask``, ``cuDF``, ``ray.data``,
``pyspark`` or ``polars``. Each of these has various installation options;
please see their respective documentation.

``akimbo`` depends on ``pyarrow`` and ``awkward``.


From PyPI
File renamed without changes
File renamed without changes
6 changes: 6 additions & 0 deletions src/akimbo/cudf.py
Expand Up @@ -107,6 +107,12 @@ def f(lay, method=meth, **kwargs):


class CudfAwkwardAccessor(Accessor):
"""Operations on cuDF dataframes on the GPU.
Data are kept in GPU memory, and views are used rather than copies where
possible.
"""

series_type = Series
dataframe_type = DataFrame

14 changes: 12 additions & 2 deletions src/akimbo/io.py
@@ -6,6 +6,7 @@


def ak_to_series(ds, backend="pandas", extract=True):
"""Make backend-specific series from data"""
if backend == "pandas":
import akimbo.pandas

@@ -23,13 +24,18 @@ def ak_to_series(ds, backend="pandas", extract=True):
import akimbo.cudf

s = akimbo.cudf.CudfAwkwardAccessor._to_output(ds)
elif backend in ["ray", "spark"]:
raise ValueError("Backend only supports dataframes, not series")

else:
raise ValueError("Backend must be in {'pandas', 'polars', 'dask', 'cudf'}")
if extract and ds.fields:
return s.ak.unpack()
return s
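
A small illustration of what ``ak_to_series`` produces (the awkward array below is invented for the example; the exact return type depends on the ``extract`` unpacking shown above):

```python
import awkward as ak
from akimbo.io import ak_to_series

# Record array with two fields, made up for illustration.
data = ak.Array({"x": [[1, 2], [3]], "y": [["a"], ["b", "c"]]})

pandas_out = ak_to_series(data, backend="pandas")  # fields unpacked into columns by default
polars_out = ak_to_series(data, backend="polars")
```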


# TODO: read_parquet should use native versions rather than convert. This version
# is OK for pandas
def read_parquet(
url: str,
storage_options: dict | None = None,
@@ -60,6 +66,8 @@
return ak_to_series(ds, backend, extract=extract)


# TODO: should be a map over input files, maybe with newline byte blocks
# as in dask
def read_json(
url: str,
storage_options: dict | None = None,
@@ -124,6 +132,8 @@ def get_json_schema(
return layout_to_jsonschema(arr.layout)


# TODO: should be a map over input files, maybe with newline byte blocks
# as in dask
def read_avro(
url: str,
storage_options: dict | None = None,
@@ -205,9 +215,9 @@ def join(
merge = _merge

counts = np.empty(len(table1), dtype="uint64")
# TODO: the line below over-allocates, can switch to somehing growable
# TODO: the line below over-allocates, can switch to something growable
matches = np.empty(len(table2), dtype="uint64")
# TODO: to_numpy(allow_missong) makes this a bit faster, but is not
# TODO: to_numpy(allow_missing) makes this a bit faster, but is not
# GPU general
counts, matches, ind = merge(table1[key], table2[key], counts, matches)
matches.resize(int(ind), refcheck=False)
6 changes: 6 additions & 0 deletions src/akimbo/ray.py
Expand Up @@ -59,6 +59,12 @@ def __dir__(self):


class RayAccessor(Accessor):
"""Operations on ray.data.Dataset dataframes.
This is a lazy backend, and operates partition-wise. It predicts the schema
of each operation by running it on an empty dataframe of the correct type.
"""

dataframe_type = rd.Dataset
series_type = None # only has "dataframe like"
subaccessors = Accessor.subaccessors.copy()
6 changes: 6 additions & 0 deletions src/akimbo/spark.py
Expand Up @@ -62,6 +62,12 @@ def __dir__(self):


class SparkAccessor(Accessor):
"""Operations on pyspark dataframes.
This is a lazy backend, and operates partition-wise. It predicts the schema
of each operation by running it on an empty dataframe of the correct type.
"""

subaccessors = Accessor.subaccessors.copy()
dataframe_type = sdf

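
The Ray and Spark docstrings added above both describe the same trick: predict an operation's output schema by running it on an empty dataframe with the right dtypes. A minimal, library-agnostic sketch of that idea, using plain pandas to stand in for a single partition (none of these names come from akimbo):

```python
import pandas as pd


def predict_schema(sample: pd.DataFrame, op) -> dict:
    """Learn the output columns/dtypes of `op` without touching real data."""
    empty = sample.iloc[:0]   # zero rows, but the partition's dtypes are kept
    out = op(empty)           # the operation must tolerate an empty input
    return dict(out.dtypes)


# Usage sketch with made-up data.
part = pd.DataFrame({"a": [1, 2, 3], "b": [0.5, 1.5, 2.5]})
schema = predict_schema(part, lambda df: df.assign(c=df["a"] + df["b"]))
print(schema)  # e.g. {'a': dtype('int64'), 'b': dtype('float64'), 'c': dtype('float64')}
```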
