diff --git a/demo.py b/demo.py index 3ddf7464b..24f009e8b 100644 --- a/demo.py +++ b/demo.py @@ -1,13 +1,14 @@ # ruff: noqa +# type: ignore from typing import Any import polars as pl -import modin.pandas as mpd +# import modin.pandas as mpd import narwhals as nw -def func(df_raw: nw.typing.T) -> nw.typing.T: - df: nw.DataFrame[nw.typing.T] = nw.DataFrame(df_raw) +def func(df_raw): + df = nw.DataFrame(df_raw) res = df.with_columns( d=nw.col("a") + 1, e=nw.col("a") + nw.col("b"), @@ -24,8 +25,8 @@ def func(df_raw: nw.typing.T) -> nw.typing.T: df = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) print(func(df)) -df = mpd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) -print(func(df)) +# df = mpd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) +# print(func(df)) df = pl.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) print(func(df)) df = pl.LazyFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 4c8255d73..a221376c3 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -4,6 +4,7 @@ from narwhals.containers import is_polars from narwhals.containers import is_series from narwhals.dataframe import DataFrame +from narwhals.dataframe import LazyFrame from narwhals.dtypes import * # noqa: F403 from narwhals.expression import all from narwhals.expression import col @@ -34,5 +35,6 @@ "sum", "sum_horizontal", "DataFrame", + "LazyFrame", "Series", ] diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 524be9986..3abf15169 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -36,40 +36,14 @@ def _validate_features(df: Any, features: set[str]) -> None: raise TypeError(msg) -class DataFrame(Generic[T]): - def __init__( - self, - df: T, - *, - features: Iterable[str] | None = None, - implementation: str | None = None, - ) -> None: - self._features: set[str] = set(features) if features is not None else set() - if implementation is not None: - self._dataframe: Any = df - self._implementation = implementation - return - if (pl := get_polars()) is not None and isinstance( - df, (pl.DataFrame, pl.LazyFrame) - ): - self._dataframe = df - self._implementation = "polars" - elif (pd := get_pandas()) is not None and isinstance(df, pd.DataFrame): - self._dataframe = PandasDataFrame(df, implementation="pandas") - self._implementation = "pandas" - elif (mpd := get_modin()) is not None and isinstance(df, mpd.DataFrame): - self._dataframe = PandasDataFrame(df, implementation="modin") - self._implementation = "modin" - else: - msg = f"Expected pandas-like dataframe, Polars dataframe, or Polars lazyframe, got: {type(df)}" - raise TypeError(msg) - _validate_features(self._dataframe, self._features) +class BaseFrame(Generic[T]): + _dataframe: Any + _implementation: str def _from_dataframe(self, df: Any) -> Self: # construct, preserving properties - return self.__class__( + return self.__class__( # type: ignore[call-arg] df, - features=self._features, implementation=self._implementation, ) @@ -86,7 +60,7 @@ def _extract_native(self, arg: Any) -> Any: if self._implementation != "polars": return arg - if isinstance(arg, DataFrame): + if isinstance(arg, BaseFrame): return arg._dataframe if isinstance(arg, Series): return arg._series @@ -121,23 +95,6 @@ def schema(self) -> dict[str, DType]: def columns(self) -> list[str]: return self._dataframe.columns # type: ignore[no-any-return] - @property - def shape(self) -> tuple[int, int]: - if "eager" not in self._features: - raise RuntimeError( - "`DataFrame.shape` can only be called when feature 'eager' is enabled" - ) - return self._dataframe.shape # type: ignore[no-any-return] - - def __getitem__(self, col_name: str) -> Series[Any]: - from narwhals.series import Series - - if "eager" not in self._features: - raise RuntimeError( - "`DataFrame.__getitem__` can only be called when feature 'eager' is enabled" - ) - return Series(self._dataframe[col_name], implementation=self._implementation) - def with_columns( self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr ) -> Self: @@ -168,7 +125,8 @@ def filter(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> Self: def group_by(self, *keys: str | Iterable[str]) -> GroupBy[T]: from narwhals.group_by import GroupBy - return GroupBy(self, *keys) + # todo: groupby and lazygroupby + return GroupBy(self, *keys) # type: ignore[arg-type] def sort( self, @@ -180,22 +138,6 @@ def sort( self._dataframe.sort(by, *more_by, descending=descending) ) - def collect(self) -> Self: - if "lazy" not in self._features: - raise RuntimeError( - "`DataFrame.collect` can only be called when feature 'lazy' is enabled" - ) - features = {f for f in self._features if f != "lazy"} - features.add("eager") - return self.__class__( - self._dataframe.collect(), - implementation=self._implementation, - features=features, - ) - - def to_dict(self, *, as_series: bool = True) -> dict[str, Any]: - return self._dataframe.to_dict(as_series=as_series) # type: ignore[no-any-return] - def join( self, other: Self, @@ -213,16 +155,82 @@ def join( ) ) - def to_pandas(self) -> Any: - if "eager" not in self._features: - raise RuntimeError( - "`DataFrame.to_pandas` can only be called when feature 'eager' is enabled" + +class DataFrame(BaseFrame[T]): + def __init__( + self, + df: T, + *, + implementation: str | None = None, + ) -> None: + if implementation is not None: + self._dataframe: Any = df + self._implementation = implementation + return + if (pl := get_polars()) is not None and isinstance(df, pl.DataFrame): + self._dataframe = df + self._implementation = "polars" + elif (pl := get_polars()) is not None and isinstance(df, pl.LazyFrame): + raise TypeError( + "Can't instantiate DataFrame from Polars LazyFrame. Call `collect()` first, or use `narwhals.LazyFrame` if you don't specifically require eager execution." ) + elif (pd := get_pandas()) is not None and isinstance(df, pd.DataFrame): + self._dataframe = PandasDataFrame(df, implementation="pandas") + self._implementation = "pandas" + elif (mpd := get_modin()) is not None and isinstance(df, mpd.DataFrame): + self._dataframe = PandasDataFrame(df, implementation="modin") + self._implementation = "modin" + else: + msg = f"Expected pandas-like dataframe, Polars dataframe, or Polars lazyframe, got: {type(df)}" + raise TypeError(msg) + + def to_pandas(self) -> Any: return self._dataframe.to_pandas() def to_numpy(self) -> Any: - if "eager" not in self._features: - raise RuntimeError( - "`DataFrame.to_numpy` can only be called when feature 'eager' is enabled" - ) return self._dataframe.to_numpy() + + @property + def shape(self) -> tuple[int, int]: + return self._dataframe.shape # type: ignore[no-any-return] + + def __getitem__(self, col_name: str) -> Series[Any]: + from narwhals.series import Series + + return Series(self._dataframe[col_name], implementation=self._implementation) + + def to_dict(self, *, as_series: bool = True) -> dict[str, Any]: + return self._dataframe.to_dict(as_series=as_series) # type: ignore[no-any-return] + + +class LazyFrame(BaseFrame[T]): + def __init__( + self, + df: T, + *, + implementation: str | None = None, + ) -> None: + if implementation is not None: + self._dataframe: Any = df + self._implementation = implementation + return + if (pl := get_polars()) is not None and isinstance( + df, (pl.DataFrame, pl.LazyFrame) + ): + self._dataframe = df.lazy() + self._implementation = "polars" + elif (pd := get_pandas()) is not None and isinstance(df, pd.DataFrame): + self._dataframe = PandasDataFrame(df, implementation="pandas") + self._implementation = "pandas" + elif (mpd := get_modin()) is not None and isinstance(df, mpd.DataFrame): + self._dataframe = PandasDataFrame(df, implementation="modin") + self._implementation = "modin" + else: + msg = f"Expected pandas-like dataframe, Polars dataframe, or Polars lazyframe, got: {type(df)}" + raise TypeError(msg) + + def collect(self) -> DataFrame[Any]: + return DataFrame( + self._dataframe.collect(), + implementation=self._implementation, + ) diff --git a/narwhals/group_by.py b/narwhals/group_by.py index 664beeb08..4f1630cd1 100644 --- a/narwhals/group_by.py +++ b/narwhals/group_by.py @@ -6,21 +6,25 @@ if TYPE_CHECKING: from narwhals.dataframe import DataFrame + from narwhals.dataframe import LazyFrame from narwhals.typing import IntoExpr from narwhals.typing import T +# todo: make groupby and lazygroupby + class GroupBy(Generic[T]): - def __init__(self, df: DataFrame[T], *keys: str | Iterable[str]) -> None: + def __init__( + self, df: DataFrame[T] | LazyFrame[T], *keys: str | Iterable[str] + ) -> None: self._df = df self._keys = keys def agg( self, *aggs: IntoExpr | Iterable[IntoExpr], **named_aggs: IntoExpr - ) -> DataFrame[T]: + ) -> DataFrame[T] | LazyFrame[T]: aggs, named_aggs = self._df._flatten_and_extract(*aggs, **named_aggs) return self._df.__class__( self._df._dataframe.group_by(*self._keys).agg(*aggs, **named_aggs), implementation=self._df._implementation, - features=self._df._features, ) diff --git a/narwhals/translate.py b/narwhals/translate.py index 20d028f57..35b4ba5df 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -7,16 +7,16 @@ from narwhals.dependencies import get_polars if TYPE_CHECKING: - from narwhals.dataframe import DataFrame + from narwhals.dataframe import BaseFrame from narwhals.series import Series from narwhals.typing import T -def to_native(obj: DataFrame[T] | Series[T]) -> T: - from narwhals.dataframe import DataFrame +def to_native(obj: BaseFrame[T] | Series[T]) -> T: + from narwhals.dataframe import BaseFrame from narwhals.series import Series - if isinstance(obj, DataFrame): + if isinstance(obj, BaseFrame): return ( # type: ignore[no-any-return] obj._dataframe if obj._implementation == "polars" diff --git a/tests/test_common.py b/tests/test_common.py index d200b4215..e43a45dbf 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -20,7 +20,7 @@ [df_pandas, df_polars, df_lazy], ) def test_sort(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.sort("a", "b") result_native = nw.to_native(result) expected = { @@ -36,7 +36,7 @@ def test_sort(df_raw: Any) -> None: [df_pandas, df_polars, df_lazy], ) def test_filter(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.filter(nw.col("a") > 1) result_native = nw.to_native(result) expected = {"a": [3, 2], "b": [4, 6], "z": [8.0, 9.0]} @@ -48,7 +48,7 @@ def test_filter(df_raw: Any) -> None: [df_pandas, df_polars, df_lazy], ) def test_add(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.with_columns( c=nw.col("a") + nw.col("b"), d=nw.col("a") - nw.col("a").mean(), @@ -69,7 +69,7 @@ def test_add(df_raw: Any) -> None: [df_pandas, df_polars, df_lazy], ) def test_double(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.with_columns(nw.all() * 2) result_native = nw.to_native(result) expected = {"a": [2, 6, 4], "b": [8, 8, 12], "z": [14.0, 16.0, 18.0]} @@ -78,7 +78,7 @@ def test_double(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars, df_lazy]) def test_sumh(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.with_columns(horizonal_sum=nw.sum_horizontal(nw.col("a"), nw.col("b"))) result_native = nw.to_native(result) expected = { @@ -92,7 +92,7 @@ def test_sumh(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars, df_lazy]) def test_sumh_literal(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.with_columns(horizonal_sum=nw.sum_horizontal("a", nw.col("b"))) result_native = nw.to_native(result) expected = { @@ -106,7 +106,7 @@ def test_sumh_literal(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars, df_lazy]) def test_sum_all(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.select(nw.all().sum()) result_native = nw.to_native(result) expected = {"a": [6], "b": [14], "z": [24.0]} @@ -115,7 +115,7 @@ def test_sum_all(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars, df_lazy]) def test_double_selected(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.select(nw.col("a", "b") * 2) result_native = nw.to_native(result) expected = {"a": [2, 6, 4], "b": [8, 8, 12]} @@ -124,7 +124,7 @@ def test_double_selected(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars, df_lazy]) def test_rename(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.rename({"a": "x", "b": "y"}) result_native = nw.to_native(result) expected = {"x": [1, 3, 2], "y": [4, 4, 6], "z": [7.0, 8, 9]} @@ -133,7 +133,7 @@ def test_rename(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars, df_lazy]) def test_join(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) df_right = df.rename({"z": "z_right"}) result = df.join(df_right, left_on=["a", "b"], right_on=["a", "b"], how="inner") result_native = nw.to_native(result) @@ -143,7 +143,7 @@ def test_join(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars, df_lazy]) def test_schema(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.schema expected = {"a": nw.dtypes.Int64, "b": nw.dtypes.Int64, "z": nw.dtypes.Float64} assert result == expected @@ -151,7 +151,7 @@ def test_schema(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars, df_lazy]) def test_columns(df_raw: Any) -> None: - df = nw.DataFrame(df_raw) + df = nw.LazyFrame(df_raw) result = df.columns expected = ["a", "b", "z"] assert len(result) == len(expected) @@ -164,4 +164,4 @@ def test_accepted_dataframes() -> None: TypeError, match="Expected pandas-like dataframe, Polars dataframe, or Polars lazyframe, got: ", ): - nw.DataFrame(array) + nw.LazyFrame(array) diff --git a/tests/tpch_q1_test.py b/tests/tpch_q1_test.py index 21d834356..c699f5797 100644 --- a/tests/tpch_q1_test.py +++ b/tests/tpch_q1_test.py @@ -21,7 +21,7 @@ ) def test_q1(df_raw: Any) -> None: var_1 = datetime(1998, 9, 2) - df = nw.DataFrame(df_raw, features=["lazy"]) + df = nw.LazyFrame(df_raw) query_result = ( df.filter(nw.col("l_shipdate") <= var_1) .group_by(["l_returnflag", "l_linestatus"]) @@ -82,7 +82,7 @@ def test_q1(df_raw: Any) -> None: @mock.patch.dict(os.environ, {"NARWHALS_FORCE_GENERIC": "1"}) def test_q1_w_pandas_agg_generic_path(df_raw: Any) -> None: var_1 = datetime(1998, 9, 2) - df = nw.DataFrame(df_raw, features=["lazy"]) + df = nw.LazyFrame(df_raw) query_result = ( df.filter(nw.col("l_shipdate") <= var_1) .group_by(["l_returnflag", "l_linestatus"])