diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi index bb8a1b8cd5..242e8c9644 100644 --- a/daft/daft/__init__.pyi +++ b/daft/daft/__init__.pyi @@ -1688,6 +1688,7 @@ class LogicalPlanBuilder: kwargs: dict[str, Any] | None = None, ) -> LogicalPlanBuilder: ... def schema(self) -> PySchema: ... + def describe(self, to_describe: list[PyExpr]) -> LogicalPlanBuilder: ... def optimize(self) -> LogicalPlanBuilder: ... def to_physical_plan_scheduler(self, cfg: PyDaftExecutionConfig) -> PhysicalPlanScheduler: ... def to_adaptive_physical_plan_scheduler(self, cfg: PyDaftExecutionConfig) -> AdaptivePhysicalPlanScheduler: ... diff --git a/daft/dataframe/dataframe.py b/daft/dataframe/dataframe.py index 2735549f2b..8283e638b0 100644 --- a/daft/dataframe/dataframe.py +++ b/daft/dataframe/dataframe.py @@ -233,6 +233,47 @@ def schema(self) -> Schema: """ return self.__builder.schema() + @DataframePublicAPI + def describe(self, columns: ManyColumnsInputType = []) -> "DataFrame": + """Returns column statistics. + + Currently returns the number of rows, nulls, approximate distinct, min, and max of the specified columns. This dataframe method is intended to aid data exploration and the API is subject to change. + + Example: + >>> import daft + >>> df = daft.from_pydict({"a": [1, 2, 3], "b": [None, "a", "b"]}) + >>> df.describe("a").show() + ╭─────────┬─────────┬───────────────────┬───────┬───────╮ + │ a_count ┆ a_nulls ┆ a_approx_distinct ┆ a_min ┆ a_max │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ UInt64 ┆ UInt64 ┆ UInt64 ┆ Int64 ┆ Int64 │ + ╞═════════╪═════════╪═══════════════════╪═══════╪═══════╡ + │ 3 ┆ 0 ┆ 3 ┆ 1 ┆ 3 │ + ╰─────────┴─────────┴───────────────────┴───────┴───────╯ + + (Showing first 1 of 1 rows) + + >>> df.describe().show() + ╭─────────┬─────────┬───────────────────┬───────┬───────┬─────────┬─────────┬───────────────────┬───────┬───────╮ + │ a_count ┆ a_nulls ┆ a_approx_distinct ┆ a_min ┆ a_max ┆ b_count ┆ b_nulls ┆ b_approx_distinct ┆ b_min ┆ b_max │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ UInt64 ┆ UInt64 ┆ UInt64 ┆ Int64 ┆ Int64 ┆ UInt64 ┆ UInt64 ┆ UInt64 ┆ Utf8 ┆ Utf8 │ + ╞═════════╪═════════╪═══════════════════╪═══════╪═══════╪═════════╪═════════╪═══════════════════╪═══════╪═══════╡ + │ 3 ┆ 0 ┆ 3 ┆ 1 ┆ 3 ┆ 3 ┆ 1 ┆ 2 ┆ a ┆ b │ + ╰─────────┴─────────┴───────────────────┴───────┴───────┴─────────┴─────────┴───────────────────┴───────┴───────╯ + + (Showing first 1 of 1 rows) + + Args: + columns (ManyColumnsInputType): Columns to describe. If not specified, all columns will be described. + + Returns: + DataFrame: A dataframe with the number of rows, nulls, approximate distinct, min, and max for each column. Column names + will be the original column names with `_count`, `_nulls`, `_approx_distinct`, `_min`, and `_max` appended. + """ + builder = self.__builder.describe(self._column_inputs_to_expressions(columns)) + return DataFrame(builder) + @property def column_names(self) -> List[str]: """Returns column names of DataFrame as a list of strings. diff --git a/daft/logical/builder.py b/daft/logical/builder.py index c1a47b054b..eb18267d5b 100644 --- a/daft/logical/builder.py +++ b/daft/logical/builder.py @@ -89,6 +89,15 @@ def schema(self) -> Schema: pyschema = self._builder.schema() return Schema._from_pyschema(pyschema) + def describe( + self, + to_describe: list[Expression], + ) -> LogicalPlanBuilder: + """Summary stats for each column of the current logical plan.""" + to_describe_pyexprs = [expr._expr for expr in to_describe] + builder = self._builder.describe(to_describe_pyexprs) + return LogicalPlanBuilder(builder) + def pretty_print(self, simple: bool = False, format: str = "ascii") -> str: """Pretty prints the current underlying logical plan.""" from daft.dataframe.display import MermaidOptions diff --git a/docs/source/api_docs/dataframe.rst b/docs/source/api_docs/dataframe.rst index 14a4e9fa20..5a4c1cba78 100644 --- a/docs/source/api_docs/dataframe.rst +++ b/docs/source/api_docs/dataframe.rst @@ -193,3 +193,12 @@ Schema and Lineage DataFrame.explain DataFrame.schema DataFrame.column_names + +Statistics +########## + +.. autosummary:: + :nosignatures: + :toctree: doc_gen/dataframe_methods + + DataFrame.describe diff --git a/src/daft-logical-plan/src/builder/mod.rs b/src/daft-logical-plan/src/builder/mod.rs index 244a42f933..b9569073e3 100644 --- a/src/daft-logical-plan/src/builder/mod.rs +++ b/src/daft-logical-plan/src/builder/mod.rs @@ -13,7 +13,10 @@ use common_error::{DaftError, DaftResult}; use common_file_formats::FileFormat; use common_io_config::IOConfig; use common_scan_info::{PhysicalScanInfo, Pushdowns, ScanOperatorRef}; -use daft_core::join::{JoinStrategy, JoinType}; +use daft_core::{ + join::{JoinStrategy, JoinType}, + prelude::CountMode, +}; use daft_dsl::{col, ExprRef}; use daft_schema::schema::{Schema, SchemaRef}; use indexmap::IndexSet; @@ -735,6 +738,45 @@ impl LogicalPlanBuilder { self.plan.schema() } + pub fn describe(&self, to_describe: Vec) -> DaftResult { + let mut agg_exprs = Vec::new(); + // If no columns are specified, describe all columns. + let to_describe = if to_describe.is_empty() { + let schema = self.schema(); + schema + .fields + .iter() + .map(|(name, _)| col(name.as_str())) + .collect() + } else { + let expr_resolver = ExprResolver::default(); + let (to_describe, _) = expr_resolver.resolve(to_describe, &self.schema())?; + to_describe + }; + // For each column, aggregate the count, nulls, approx distinct, min, and max. + for expr in &to_describe { + let name = expr.name(); + agg_exprs.push( + expr.clone() + .count(CountMode::All) + .alias(format!("{}_count", name).as_str()), + ); + agg_exprs.push( + expr.clone() + .count(CountMode::Null) + .alias(format!("{}_nulls", name).as_str()), + ); + agg_exprs.push( + expr.clone() + .approx_count_distinct() + .alias(format!("{}_approx_distinct", name).as_str()), + ); + agg_exprs.push(expr.clone().min().alias(format!("{}_min", name).as_str())); + agg_exprs.push(expr.clone().max().alias(format!("{}_max", name).as_str())); + } + self.aggregate(agg_exprs, vec![]) + } + pub fn repr_ascii(&self, simple: bool) -> String { self.plan.repr_ascii(simple) } @@ -1105,6 +1147,14 @@ impl PyLogicalPlanBuilder { Ok(self.builder.schema().into()) } + pub fn describe(&self, to_describe: Vec) -> PyResult { + let to_describe_exprs = to_describe + .iter() + .map(|e| e.clone().into()) + .collect::>(); + Ok(self.builder.describe(to_describe_exprs)?.into()) + } + /// Optimize the underlying logical plan, returning a new plan builder containing the optimized plan. pub fn optimize(&self, py: Python) -> PyResult { py.allow_threads(|| Ok(self.builder.optimize()?.into())) diff --git a/tests/dataframe/test_describe.py b/tests/dataframe/test_describe.py new file mode 100644 index 0000000000..03e47f31e8 --- /dev/null +++ b/tests/dataframe/test_describe.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +import pytest + +import daft + + +def test_describe_dataframe_missing_col() -> None: + df = daft.from_pydict({"a": [1, 2, 3], "b": [None, "a", "b"]}) + + with pytest.raises(ValueError): + df = df.describe(["foo", "b"]) + + with pytest.raises(ValueError): + df = df.describe("foo") + + +def test_describe_dataframe(make_df, valid_data: list[dict[str, float]]) -> None: + df = daft.from_pydict({"a": [1, 2, 3], "b": [None, "a", "b"]}) + expected = { + "a_count": [3], + "a_nulls": [0], + "a_approx_distinct": [3], + "a_min": [1], + "a_max": [3], + "b_count": [3], + "b_nulls": [1], + "b_approx_distinct": [2], + "b_min": ["a"], + "b_max": ["b"], + } + + df_all_cols = df.describe(["a", "b"]) + assert df_all_cols.collect().to_pydict() == expected + + df_none_specified = df.describe() + assert df_none_specified.collect().to_pydict() == expected + + expected_one_col = { + "a_count": [3], + "a_nulls": [0], + "a_approx_distinct": [3], + "a_min": [1], + "a_max": [3], + } + + df_one_col = df.describe("a") + assert df_one_col.collect().to_pydict() == expected_one_col