From 07a59a9c0ba9bd5929799986641bf2d5f1c23fb3 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Mar 2024 21:45:05 +0000 Subject: [PATCH] update readme --- README.md | 81 +++++++++++++++---------------------------- f.py | 36 +++++-------------- narwhals/dataframe.py | 26 ++++---------- narwhals/group_by.py | 10 ++---- narwhals/series.py | 6 ++-- narwhals/translate.py | 8 ++--- narwhals/typing.py | 3 +- 7 files changed, 53 insertions(+), 117 deletions(-) diff --git a/README.md b/README.md index 306116aca..8d37e7ba0 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Seamlessly support both, without depending on either! - ✅ **Just use** a subset of **the Polars API**, no need to learn anything new - ✅ **No dependencies** (not even Polars), keep your library lightweight -- ✅ Support both **lazy** and eager execution +- ✅ Separate **lazy** and eager APIs - ✅ Use Polars **Expressions** **Note: this is work-in-progress, and a bit of an experiment, don't take it too seriously**. @@ -29,16 +29,16 @@ Or just vendor it, it's only a bunch of pure-Python files. There are three steps to writing dataframe-agnostic code using Narwhals: -1. use `narwhals.DataFrame` to wrap a pandas or Polars DataFrame to a Narwhals DataFrame -2. use the subset of the Polars API supported by Narwhals. Some methods are only available - if you initialised `narwhals.DataFrame` with `features=['eager']`, or `features=['lazy']` +1. use `narwhals.LazyFrame` or `narwhals.DataFrame` to wrap a pandas or Polars + DataFrame/LazyFrame in a Narwhals class +2. use the subset of the Polars API supported by Narwhals. Just like in Polars, + some methods (e.g. `to_numpy`) are only available for `DataFrame`, not `LazyFrame` 3. use `narwhals.to_native` to return an object to the user in its original dataframe flavour. For example: - - if you started with a pandas DataFrame, you'll get a pandas DataFrame back - - if you started with a Polars DataFrame, you'll get a Polars DataFrame back - - if you started with a Polars LazyFrame, you'll get a Polars LazyFrame back (unless - you called `.collect`!) + - if you started with pandas, you'll get pandas back + - if you started with Polars, you'll get Polars back + - if you started with Polars, you'll get Polars back ## Example @@ -56,29 +56,24 @@ def my_agnostic_function( suppliers_native, parts_native, ): - suppliers = nw.DataFrame(suppliers_native) - parts = nw.DataFrame(parts_native) + suppliers = nw.LazyFrame(suppliers_native) + parts = nw.LazyFrame(parts_native) result = ( suppliers.join(parts, left_on="city", right_on="city") - .filter( - nw.col("color").is_in(["Red", "Green"]), - nw.col("weight") > 14, - ) - .group_by("s", "p") + .filter(nw.col("weight") > 10) + .group_by("s") .agg( weight_mean=nw.col("weight").mean(), weight_max=nw.col("weight").max(), ) - ).with_columns(nw.col("weight_max").cast(nw.Int64)) + ) return nw.to_native(result) - ``` You can pass in a pandas or Polars dataframe, the output will be the same! Let's try it out: ```python - suppliers = { "s": ["S1", "S2", "S3", "S4", "S5"], "sname": ["Smith", "Jones", "Blake", "Clark", "Adams"], @@ -101,13 +96,6 @@ print( ) ) print("\nPolars output:") -print( - my_agnostic_function( - pl.DataFrame(suppliers), - pl.DataFrame(parts), - ) -) -print("\nPolars lazy output:") print( my_agnostic_function( pl.LazyFrame(suppliers), @@ -118,37 +106,24 @@ print( ``` pandas output: - s p weight_mean -0 S1 P6 19.0 -1 S2 P2 17.0 -2 S3 P2 17.0 -3 S4 P6 19.0 + s weight_mean weight_max +0 S1 15.0 19.0 +1 S2 14.5 17.0 +2 S3 14.5 17.0 +3 S4 15.0 19.0 Polars output: shape: (4, 3) -┌─────┬─────┬─────────────┐ -│ s ┆ p ┆ weight_mean │ -│ --- ┆ --- ┆ --- │ -│ str ┆ str ┆ f64 │ -╞═════╪═════╪═════════════╡ -│ S1 ┆ P6 ┆ 19.0 │ -│ S3 ┆ P2 ┆ 17.0 │ -│ S4 ┆ P6 ┆ 19.0 │ -│ S2 ┆ P2 ┆ 17.0 │ -└─────┴─────┴─────────────┘ - -Polars lazy output: -shape: (4, 3) -┌─────┬─────┬─────────────┐ -│ s ┆ p ┆ weight_mean │ -│ --- ┆ --- ┆ --- │ -│ str ┆ str ┆ f64 │ -╞═════╪═════╪═════════════╡ -│ S1 ┆ P6 ┆ 19.0 │ -│ S3 ┆ P2 ┆ 17.0 │ -│ S4 ┆ P6 ┆ 19.0 │ -│ S2 ┆ P2 ┆ 17.0 │ -└─────┴─────┴─────────────┘ +┌─────┬─────────────┬────────────┐ +│ s ┆ weight_mean ┆ weight_max │ +│ --- ┆ --- ┆ --- │ +│ str ┆ f64 ┆ f64 │ +╞═════╪═════════════╪════════════╡ +│ S2 ┆ 14.5 ┆ 17.0 │ +│ S3 ┆ 14.5 ┆ 17.0 │ +│ S4 ┆ 15.0 ┆ 19.0 │ +│ S1 ┆ 15.0 ┆ 19.0 │ +└─────┴─────────────┴────────────┘ ``` Magic! 🪄 diff --git a/f.py b/f.py index 396079dc1..a2f4835b4 100644 --- a/f.py +++ b/f.py @@ -1,32 +1,27 @@ # ruff: noqa -from typing import Any, TYPE_CHECKING, TypeVar +# type: ignore import pandas as pd import polars as pl import narwhals as nw -T = TypeVar("T") - def my_agnostic_function( - suppliers_native: T, - parts_native: T, -) -> T: - suppliers = nw.DataFrame(suppliers_native) - parts = nw.DataFrame(parts_native) + suppliers_native, + parts_native, +): + suppliers = nw.LazyFrame(suppliers_native) + parts = nw.LazyFrame(parts_native) result = ( suppliers.join(parts, left_on="city", right_on="city") - .filter( - nw.col("color").is_in(["Red", "Green"]), - nw.col("weight") > 14, - ) - .group_by("s", "p") + .filter(nw.col("weight") > 10) + .group_by("s") .agg( weight_mean=nw.col("weight").mean(), weight_max=nw.col("weight").max(), ) - ).with_columns(nw.col("weight_max").cast(nw.Int64)) + ) return nw.to_native(result) @@ -52,19 +47,6 @@ def my_agnostic_function( ) ) print("\nPolars output:") -print( - my_agnostic_function( - pl.DataFrame(suppliers), - pl.DataFrame(parts), - ) -) -print( - my_agnostic_function( - pl.DataFrame(suppliers), - pl.DataFrame(parts), - ) -) -print("\nPolars lazy output:") print( my_agnostic_function( pl.LazyFrame(suppliers), diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 3abf15169..3cce941d1 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -2,7 +2,6 @@ from typing import TYPE_CHECKING from typing import Any -from typing import Generic from typing import Iterable from typing import Literal from typing import Sequence @@ -21,22 +20,9 @@ from narwhals.series import Series from narwhals.typing import IntoExpr from narwhals.typing import T -from narwhals.typing import T -def _validate_features(df: Any, features: set[str]) -> None: - if (pl := get_polars()) is not None and isinstance(df, pl.DataFrame): - df_features = {"eager"} - elif (pl := get_polars()) is not None and isinstance(df, pl.LazyFrame): - df_features = {"lazy"} - else: - df_features = df._features - if diff := {f for f in features if f not in df_features}: - msg = f"Features {diff} not supported by {type(df)} DataFrame" - raise TypeError(msg) - - -class BaseFrame(Generic[T]): +class BaseFrame: _dataframe: Any _implementation: str @@ -122,7 +108,7 @@ def filter(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> Self: self._dataframe.filter(*predicates), ) - def group_by(self, *keys: str | Iterable[str]) -> GroupBy[T]: + def group_by(self, *keys: str | Iterable[str]) -> GroupBy: from narwhals.group_by import GroupBy # todo: groupby and lazygroupby @@ -156,7 +142,7 @@ def join( ) -class DataFrame(BaseFrame[T]): +class DataFrame(BaseFrame): def __init__( self, df: T, @@ -194,7 +180,7 @@ def to_numpy(self) -> Any: def shape(self) -> tuple[int, int]: return self._dataframe.shape # type: ignore[no-any-return] - def __getitem__(self, col_name: str) -> Series[Any]: + def __getitem__(self, col_name: str) -> Series: from narwhals.series import Series return Series(self._dataframe[col_name], implementation=self._implementation) @@ -203,7 +189,7 @@ def to_dict(self, *, as_series: bool = True) -> dict[str, Any]: return self._dataframe.to_dict(as_series=as_series) # type: ignore[no-any-return] -class LazyFrame(BaseFrame[T]): +class LazyFrame(BaseFrame): def __init__( self, df: T, @@ -229,7 +215,7 @@ def __init__( msg = f"Expected pandas-like dataframe, Polars dataframe, or Polars lazyframe, got: {type(df)}" raise TypeError(msg) - def collect(self) -> DataFrame[Any]: + def collect(self) -> DataFrame: return DataFrame( self._dataframe.collect(), implementation=self._implementation, diff --git a/narwhals/group_by.py b/narwhals/group_by.py index 4f1630cd1..6f38c267d 100644 --- a/narwhals/group_by.py +++ b/narwhals/group_by.py @@ -1,28 +1,24 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Generic from typing import Iterable if TYPE_CHECKING: from narwhals.dataframe import DataFrame from narwhals.dataframe import LazyFrame from narwhals.typing import IntoExpr -from narwhals.typing import T # todo: make groupby and lazygroupby -class GroupBy(Generic[T]): - def __init__( - self, df: DataFrame[T] | LazyFrame[T], *keys: str | Iterable[str] - ) -> None: +class GroupBy: + def __init__(self, df: DataFrame | LazyFrame, *keys: str | Iterable[str]) -> None: self._df = df self._keys = keys def agg( self, *aggs: IntoExpr | Iterable[IntoExpr], **named_aggs: IntoExpr - ) -> DataFrame[T] | LazyFrame[T]: + ) -> DataFrame | LazyFrame: aggs, named_aggs = self._df._flatten_and_extract(*aggs, **named_aggs) return self._df.__class__( self._df._dataframe.group_by(*self._keys).agg(*aggs, **named_aggs), diff --git a/narwhals/series.py b/narwhals/series.py index 7303e1653..951f3366d 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2,20 +2,18 @@ from typing import TYPE_CHECKING from typing import Any -from typing import Generic from narwhals.translate import get_pandas from narwhals.translate import get_polars if TYPE_CHECKING: from typing_extensions import Self -from narwhals.typing import T -class Series(Generic[T]): +class Series: def __init__( self, - series: T, + series: Any, *, implementation: str | None = None, ) -> None: diff --git a/narwhals/translate.py b/narwhals/translate.py index 35b4ba5df..b00491ba0 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import TYPE_CHECKING +from typing import Any from narwhals.dependencies import get_modin from narwhals.dependencies import get_pandas @@ -9,21 +10,20 @@ if TYPE_CHECKING: from narwhals.dataframe import BaseFrame from narwhals.series import Series - from narwhals.typing import T -def to_native(obj: BaseFrame[T] | Series[T]) -> T: +def to_native(obj: BaseFrame | Series) -> Any: from narwhals.dataframe import BaseFrame from narwhals.series import Series if isinstance(obj, BaseFrame): - return ( # type: ignore[no-any-return] + return ( obj._dataframe if obj._implementation == "polars" else obj._dataframe._dataframe ) if isinstance(obj, Series): - return obj._series if obj._implementation == "polars" else obj._series._series # type: ignore[no-any-return] + return obj._series if obj._implementation == "polars" else obj._series._series msg = f"Expected Narwhals object, got {type(obj)}." raise TypeError(msg) diff --git a/narwhals/typing.py b/narwhals/typing.py index 14671f327..2504e85a3 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -1,5 +1,4 @@ from typing import TYPE_CHECKING -from typing import Any from typing import TypeAlias from typing import TypeVar @@ -7,7 +6,7 @@ from narwhals.expression import Expr from narwhals.series import Series - IntoExpr: TypeAlias = Expr | str | int | float | Series[Any] + IntoExpr: TypeAlias = Expr | str | int | float | Series NativeDataFrame = TypeVar("NativeDataFrame") NativeSeries = TypeVar("NativeSeries")