Skip to content

Commit

Permalink
Add more docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
martindurant committed Jul 26, 2024
1 parent d1d7e26 commit 6cd2e82
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 11 deletions.
4 changes: 4 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ Top Level Functions

read_parquet
read_json
read_avro
get_parquet_schema
get_json_schema
get_avro_schema

Accessor
~~~~~~~~
Expand Down
22 changes: 14 additions & 8 deletions src/akimbo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
from __future__ import annotations

from awkward import ( # re-export
behavior,
metadata_from_parquet,
mixin_class,
mixin_class_method,
)
from awkward import behavior
from awkward import metadata_from_parquet as get_parquet_schema # re-export
from awkward import mixin_class, mixin_class_method

import akimbo.datetimes as datetimes
import akimbo.mixin as mixin
import akimbo.strings as strings
from akimbo.io import join, read_avro, read_json, read_parquet
from akimbo.io import (
get_avro_schema,
get_json_schema,
join,
read_avro,
read_json,
read_parquet,
)
from akimbo.version import version as __version__ # noqa

__all__ = (
Expand All @@ -23,6 +27,8 @@
"behavior",
"mixin_class",
"mixin_class_method",
"metadata_from_parquet",
"get_parquet_schema",
"get_json_schema",
"get_avro_schema",
"strings",
)
30 changes: 29 additions & 1 deletion src/akimbo/mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,35 @@ def unmerge(self):
out = {k: self.to_output(arr[k]) for k in arr.fields}
return self.dataframe_type(out)

def join(self, other, key: str, colname="match", sort=False, rkey=None):
def join(
self,
other,
key: str,
colname: str = "match",
sort: bool = False,
rkey: str | None = None,
numba: bool = True,
):
"""DB ORM-style left join to other dataframe/series with nesting but no copy
Related records of the ``other`` table will appear as a list under the new field
``colname`` for all matching keys. This is a speed- and memory-efficient alternative
to doing a pandas-style merge/join, which explodes out the values to a much
bigger memory footprint.
Parameters
----------
other: series or table
key: name of the field in this table to match on
colname: the field that will be added to each record. This field will exist even
if there are no matches, in which case the list will be empty.
sort: if False, assumes that the key is sorted in both tables. If True, an
argsort is performed first, and the match is done by indexing. This may be
significantly slower.
rkey: the name of the field to match on in the ``other`` table, if it is different from ``key``.
numba: the matching algorithm will go much faster using numba. However, you can
set this to False if you do not have numba installed.
"""
from akimbo.io import join

out = join(
Expand Down
2 changes: 1 addition & 1 deletion src/akimbo/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
@pd.api.extensions.register_series_accessor("ak")
@pd.api.extensions.register_dataframe_accessor("ak")
class PandasAwkwardAccessor(Accessor):
"""Perhaps awkward operations on pandas data
"""Perform awkward operations on pandas data
Nested structures are handled using arrow as the
storage backend. If you use pandas object columns
Expand Down
2 changes: 1 addition & 1 deletion tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_read_parquet(m): # noqa (m is a fixture)
df.to_parquet(fn)

out = akimbo.read_parquet(fn)
meta = akimbo.metadata_from_parquet(fn)
meta = akimbo.get_parquet_schema(fn)
assert meta["columns"] == ["a.list.element"] # parquet column naming convention
assert out.columns == ["a"]
assert out.a.to_list() == data
Expand Down

0 comments on commit 6cd2e82

Please sign in to comment.