Add more docstrings

intake · Jul 26, 2024 · 6cd2e82 · 6cd2e82
1 parent d1d7e26
commit 6cd2e82
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 11 deletions.
diff --git a/docs/api.rst b/docs/api.rst
@@ -11,6 +11,10 @@ Top Level Functions
 
    read_parquet
    read_json
+   read_avro
+   get_parquet_schema
+   get_json_schema
+   get_avro_schema
 
 Accessor
 ~~~~~~~~

diff --git a/src/akimbo/__init__.py b/src/akimbo/__init__.py
@@ -1,16 +1,20 @@
 from __future__ import annotations
 
-from awkward import (  # re-export
-    behavior,
-    metadata_from_parquet,
-    mixin_class,
-    mixin_class_method,
-)
+from awkward import behavior
+from awkward import metadata_from_parquet as get_parquet_schema  # re-export
+from awkward import mixin_class, mixin_class_method
 
 import akimbo.datetimes as datetimes
 import akimbo.mixin as mixin
 import akimbo.strings as strings
-from akimbo.io import join, read_avro, read_json, read_parquet
+from akimbo.io import (
+    get_avro_schema,
+    get_json_schema,
+    join,
+    read_avro,
+    read_json,
+    read_parquet,
+)
 from akimbo.version import version as __version__  # noqa
 
 __all__ = (
@@ -23,6 +27,8 @@
     "behavior",
     "mixin_class",
     "mixin_class_method",
-    "metadata_from_parquet",
+    "get_parquet_schema",
+    "get_json_schema",
+    "get_avro_schema",
     "strings",
 )
diff --git a/src/akimbo/mixin.py b/src/akimbo/mixin.py
@@ -250,7 +250,35 @@ def unmerge(self):
         out = {k: self.to_output(arr[k]) for k in arr.fields}
         return self.dataframe_type(out)
 
-    def join(self, other, key: str, colname="match", sort=False, rkey=None):
+    def join(
+        self,
+        other,
+        key: str,
+        colname: str = "match",
+        sort: bool = False,
+        rkey: str | None = None,
+        numba: bool = True,
+    ):
+        """DB ORM-style left join to other dataframe/series with nesting but no copy
+
+        Related records of the ``other`` table will appear as a list under the new field
+        ``colname`` for all matching keys. This is a speed- and memory-efficient
+        alternative to a pandas-style merge/join, which explodes out the values to a
+        much bigger memory footprint.
+
+        Parameters
+        ----------
+        other: series or table
+        key: name of the field in this table to match on
+        colname: the field that will be added to each record. This field will exist even
+            if there are no matches, in which case the list will be empty.
+        sort: if False, assumes that the key is sorted in both tables. If True, an
+            argsort is performed first, and the match is done by indexing. This may be
+            significantly slower.
+        rkey: the name of the field to match on in the ``other`` table, if it differs
+            from ``key``.
+        numba: the matching algorithm will go much faster using numba. However, you can
+            set this to False if you do not have numba installed.
+        """
         from akimbo.io import join
 
         out = join(

diff --git a/src/akimbo/pandas.py b/src/akimbo/pandas.py
@@ -11,7 +11,7 @@
 @pd.api.extensions.register_series_accessor("ak")
 @pd.api.extensions.register_dataframe_accessor("ak")
 class PandasAwkwardAccessor(Accessor):
-    """Perhaps awkward operations on pandas data
+    """Perform awkward operations on pandas data
 
     Nested structures are handled using arrow as the
     storage backend. If you use pandas object columns

diff --git a/tests/test_io.py b/tests/test_io.py
@@ -46,7 +46,7 @@ def test_read_parquet(m):  # noqa (m is a fixture)
     df.to_parquet(fn)
 
     out = akimbo.read_parquet(fn)
-    meta = akimbo.metadata_from_parquet(fn)
+    meta = akimbo.get_parquet_schema(fn)
     assert meta["columns"] == ["a.list.element"]  # parquet column naming convention
     assert out.columns == ["a"]
     assert out.a.to_list() == data