From 3c485fc6412c9d578272089f391e4940bbb7e599 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 4 Feb 2025 10:48:58 -0500 Subject: [PATCH] Add minimal API docs --- docs/{demo => }/akimbo-demo.ipynb | 10 +++++++ docs/api.rst | 36 ++++++++++++++----------- docs/conf.py | 8 +----- docs/cudf-ak.ipynb | 1 + docs/demo/.gitignore | 1 - {example => docs/example}/cuda_env.yaml | 0 {example => docs/example}/cudf-ak.ipynb | 16 ++++++++++- docs/index.rst | 9 +++++++ docs/install.rst | 6 ++++- docs/{demo => }/muons_dataset1.svg | 0 docs/{demo => }/muons_dataset_df.svg | 0 src/akimbo/cudf.py | 6 +++++ src/akimbo/io.py | 14 ++++++++-- src/akimbo/ray.py | 6 +++++ src/akimbo/spark.py | 6 +++++ 15 files changed, 92 insertions(+), 27 deletions(-) rename docs/{demo => }/akimbo-demo.ipynb (99%) create mode 120000 docs/cudf-ak.ipynb delete mode 100644 docs/demo/.gitignore rename {example => docs/example}/cuda_env.yaml (100%) rename {example => docs/example}/cudf-ak.ipynb (97%) rename docs/{demo => }/muons_dataset1.svg (100%) rename docs/{demo => }/muons_dataset_df.svg (100%) diff --git a/docs/demo/akimbo-demo.ipynb b/docs/akimbo-demo.ipynb similarity index 99% rename from docs/demo/akimbo-demo.ipynb rename to docs/akimbo-demo.ipynb index 7b992c1..845e889 100644 --- a/docs/demo/akimbo-demo.ipynb +++ b/docs/akimbo-demo.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "8b1be0e8", + "metadata": {}, + "source": [ + "# HEP Demo\n", + "\n", + "Here we show a plausible small workflow on a real excerpt of particle data." + ] + }, { "cell_type": "code", "execution_count": 1, diff --git a/docs/api.rst b/docs/api.rst index a2282f8..e48575c 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,21 +1,6 @@ akimbo ============== -.. currentmodule:: akimbo - -Top Level Functions -~~~~~~~~~~~~~~~~~~~ - -.. 
autosummary:: - :toctree: generated/ - - read_parquet - read_json - read_avro - get_parquet_schema - get_json_schema - get_avro_schema - Accessor ~~~~~~~~ @@ -38,6 +23,8 @@ Backends akimbo.dask.DaskAwkwardAccessor akimbo.polars.PolarsAwkwardAccessor akimbo.cudf.CudfAwkwardAccessor + akimbo.ray.RayAccessor + akimbo.spark.SparkAccessor .. autoclass:: akimbo.pandas.PandasAwkwardAccessor @@ -47,6 +34,25 @@ Backends .. autoclass:: akimbo.cudf.CudfAwkwardAccessor +.. autoclass:: akimbo.ray.RayAccessor + +.. autoclass:: akimbo.spark.SparkAccessor + +Top Level Functions +~~~~~~~~~~~~~~~~~~~ +.. currentmodule:: akimbo + + +.. autosummary:: + :toctree: generated/ + + read_parquet + read_json + read_avro + get_parquet_schema + get_json_schema + get_avro_schema + Extensions ~~~~~~~~~~ diff --git a/docs/conf.py b/docs/conf.py index 461cf3c..ebdb78c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,13 +24,7 @@ ] templates_path = ["_templates"] -exclude_patterns = [ - "_build", - "Thumbs.db", - ".DS_Store", - "**.ipynb_checkpoints", - "**akimbo-demo.ipynb", -] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/cudf-ak.ipynb b/docs/cudf-ak.ipynb new file mode 120000 index 0000000..8f765e8 --- /dev/null +++ b/docs/cudf-ak.ipynb @@ -0,0 +1 @@ +example/cudf-ak.ipynb \ No newline at end of file diff --git a/docs/demo/.gitignore b/docs/demo/.gitignore deleted file mode 100644 index 4bed5da..0000000 --- a/docs/demo/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.parquet diff --git a/example/cuda_env.yaml b/docs/example/cuda_env.yaml similarity index 100% rename from example/cuda_env.yaml rename to docs/example/cuda_env.yaml diff --git a/example/cudf-ak.ipynb b/docs/example/cudf-ak.ipynb similarity index 97% rename from example/cudf-ak.ipynb rename to docs/example/cudf-ak.ipynb index f786c4e..8f3e938 100644 --- a/example/cudf-ak.ipynb +++ b/docs/example/cudf-ak.ipynb 
@@ -1,10 +1,19 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "ee00a3e2", + "metadata": {}, + "source": [ + "# GPU backend" + ] + }, { "cell_type": "markdown", "id": "58d18a3a-45b1-425a-b822-e8be0a6c0bc0", "metadata": {}, "source": [ + "This example depends on data in a file that can be made in the following way.\n", "\n", "```python\n", "import awkward as ak\n", @@ -14,6 +23,11 @@ " [[6, 7]]] * N\n", " arr = ak.Array({\"a\": part})\n", " ak.to_parquet(arr, fn, extensionarray=False)\n", + "```\n", + "\n", + "The file cuda_env.yaml can be used to create a functional environment using conda:\n", + "```bash\n", + "$ conda env create -f docs/example/cuda_env.yaml\n", "```" ] }, @@ -617,7 +631,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.0" + "version": "3.10.9" } }, "nbformat": 4, diff --git a/docs/index.rst b/docs/index.rst index c3daebd..1ec24f4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,8 @@ identical syntax: - dask.dataframe - polars - cuDF +- ray dataset +- pyspark numpy-like API @@ -111,6 +113,13 @@ the ``akimbo`` system, you can apply these methods to ragged/nested dataframes. install.rst quickstart.ipynb +.. toctree:: + :maxdepth: 1 + :caption: Demos + + akimbo-demo.ipynb + cudf-ak.ipynb + .. toctree:: :maxdepth: 1 :caption: API Reference diff --git a/docs/install.rst b/docs/install.rst index 6c05a4f..50fbd75 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -5,7 +5,11 @@ Requirements ~~~~~~~~~~~~ To install ``akimbo`` you will need ``awkward`` and -one of the backend libraries: ``pandas``, ``dask`` or ``polars``. +one of the backend libraries: ``pandas``, ``dask``, ``cuDF``, ``ray.data``, +``pyspark`` or ``polars``. Each of these has various installation options; +please see their respective documentation. + +``akimbo`` depends on ``pyarrow`` and ``awkward``. 
From PyPI diff --git a/docs/demo/muons_dataset1.svg b/docs/muons_dataset1.svg similarity index 100% rename from docs/demo/muons_dataset1.svg rename to docs/muons_dataset1.svg diff --git a/docs/demo/muons_dataset_df.svg b/docs/muons_dataset_df.svg similarity index 100% rename from docs/demo/muons_dataset_df.svg rename to docs/muons_dataset_df.svg diff --git a/src/akimbo/cudf.py b/src/akimbo/cudf.py index 6346a68..13cc3e2 100644 --- a/src/akimbo/cudf.py +++ b/src/akimbo/cudf.py @@ -107,6 +107,12 @@ def f(lay, method=meth, **kwargs): class CudfAwkwardAccessor(Accessor): + """Operations on cuDF dataframes on the GPU. + + Data are kept in GPU memory and use views rather than copies where + possible. + """ + series_type = Series dataframe_type = DataFrame diff --git a/src/akimbo/io.py b/src/akimbo/io.py index afaf496..cd32765 100644 --- a/src/akimbo/io.py +++ b/src/akimbo/io.py @@ -6,6 +6,7 @@ def ak_to_series(ds, backend="pandas", extract=True): + """Make backend-specific series from data""" if backend == "pandas": import akimbo.pandas @@ -23,6 +24,9 @@ def ak_to_series(ds, backend="pandas", extract=True): import akimbo.cudf s = akimbo.cudf.CudfAwkwardAccessor._to_output(ds) + elif backend in ["ray", "spark"]: + raise ValueError("Backend only supports dataframes, not series") + else: raise ValueError("Backend must be in {'pandas', 'polars', 'dask'}") if extract and ds.fields: @@ -30,6 +34,8 @@ def ak_to_series(ds, backend="pandas", extract=True): return s +# TODO: read_parquet should use native versions rather than convert. 
This version +# is OK for pandas def read_parquet( url: str, storage_options: dict | None = None, @@ -60,6 +66,8 @@ def read_parquet( return ak_to_series(ds, backend, extract=extract) +# TODO: should be a map over input files, maybe with newline byte blocks +# as in dask def read_json( url: str, storage_options: dict | None = None, @@ -124,6 +132,8 @@ def get_json_schema( return layout_to_jsonschema(arr.layout) +# TODO: should be a map over input files, maybe with newline byte blocks +# as in dask def read_avro( url: str, storage_options: dict | None = None, @@ -205,9 +215,9 @@ def join( merge = _merge counts = np.empty(len(table1), dtype="uint64") - # TODO: the line below over-allocates, can switch to somehing growable + # TODO: the line below over-allocates, can switch to something growable matches = np.empty(len(table2), dtype="uint64") - # TODO: to_numpy(allow_missong) makes this a bit faster, but is not + # TODO: to_numpy(allow_missing) makes this a bit faster, but is not # not GPU general counts, matches, ind = merge(table1[key], table2[key], counts, matches) matches.resize(int(ind), refcheck=False) diff --git a/src/akimbo/ray.py b/src/akimbo/ray.py index 80f5bca..f96bd35 100644 --- a/src/akimbo/ray.py +++ b/src/akimbo/ray.py @@ -59,6 +59,12 @@ def __dir__(self): class RayAccessor(Accessor): + """Operations on ray.data.Dataset dataframes. + + This is a lazy backend, and operates partition-wise. It predicts the schema + of each operation by running with an empty dataframe of the correct type. + """ + dataframe_type = rd.Dataset series_type = None # only has "dataframe like" subaccessors = Accessor.subaccessors.copy() diff --git a/src/akimbo/spark.py b/src/akimbo/spark.py index 5f0e729..99f60cd 100644 --- a/src/akimbo/spark.py +++ b/src/akimbo/spark.py @@ -62,6 +62,12 @@ def __dir__(self): class SparkAccessor(Accessor): + """Operations on pyspark dataframes. + + This is a lazy backend, and operates partition-wise. 
It predicts the schema + of each operation by running with an empty dataframe of the correct type. + """ + subaccessors = Accessor.subaccessors.copy() dataframe_type = sdf