Implement

Eventual-Inc · Jan 18, 2025 · bf07945 · bf07945
1 parent 5549d16
commit bf07945
Show file tree

Hide file tree

Showing 6 changed files with 159 additions and 1 deletion.
diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi
@@ -1688,6 +1688,7 @@ class LogicalPlanBuilder:
         kwargs: dict[str, Any] | None = None,
     ) -> LogicalPlanBuilder: ...
     def schema(self) -> PySchema: ...
+    def describe(self, to_describe: list[PyExpr]) -> LogicalPlanBuilder: ...
     def optimize(self) -> LogicalPlanBuilder: ...
     def to_physical_plan_scheduler(self, cfg: PyDaftExecutionConfig) -> PhysicalPlanScheduler: ...
     def to_adaptive_physical_plan_scheduler(self, cfg: PyDaftExecutionConfig) -> AdaptivePhysicalPlanScheduler: ...

diff --git a/daft/dataframe/dataframe.py b/daft/dataframe/dataframe.py
@@ -233,6 +233,47 @@ def schema(self) -> Schema:
         """
         return self.__builder.schema()
 
+    @DataframePublicAPI
+    def describe(self, columns: ManyColumnsInputType = []) -> "DataFrame":
+        """Returns column statistics.
+
+        Currently returns the number of rows, nulls, approximate distinct, min, and max of the specified columns. This dataframe method is intended to aid data exploration and the API is subject to change.
+
+        Example:
+        >>> import daft
+        >>> df = daft.from_pydict({"a": [1, 2, 3], "b": [None, "a", "b"]})
+        >>> df.describe("a").show()
+        ╭─────────┬─────────┬───────────────────┬───────┬───────╮
+        │ a_count ┆ a_nulls ┆ a_approx_distinct ┆ a_min ┆ a_max │
+        │ ---     ┆ ---     ┆ ---               ┆ ---   ┆ ---   │
+        │ UInt64  ┆ UInt64  ┆ UInt64            ┆ Int64 ┆ Int64 │
+        ╞═════════╪═════════╪═══════════════════╪═══════╪═══════╡
+        │ 3       ┆ 0       ┆ 3                 ┆ 1     ┆ 3     │
+        ╰─────────┴─────────┴───────────────────┴───────┴───────╯
+        <BLANKLINE>
+        (Showing first 1 of 1 rows)
+
+        >>> df.describe().show()
+        ╭─────────┬─────────┬───────────────────┬───────┬───────┬─────────┬─────────┬───────────────────┬───────┬───────╮
+        │ a_count ┆ a_nulls ┆ a_approx_distinct ┆ a_min ┆ a_max ┆ b_count ┆ b_nulls ┆ b_approx_distinct ┆ b_min ┆ b_max │
+        │ ---     ┆ ---     ┆ ---               ┆ ---   ┆ ---   ┆ ---     ┆ ---     ┆ ---               ┆ ---   ┆ ---   │
+        │ UInt64  ┆ UInt64  ┆ UInt64            ┆ Int64 ┆ Int64 ┆ UInt64  ┆ UInt64  ┆ UInt64            ┆ Utf8  ┆ Utf8  │
+        ╞═════════╪═════════╪═══════════════════╪═══════╪═══════╪═════════╪═════════╪═══════════════════╪═══════╪═══════╡
+        │ 3       ┆ 0       ┆ 3                 ┆ 1     ┆ 3     ┆ 3       ┆ 1       ┆ 2                 ┆ a     ┆ b     │
+        ╰─────────┴─────────┴───────────────────┴───────┴───────┴─────────┴─────────┴───────────────────┴───────┴───────╯
+        <BLANKLINE>
+        (Showing first 1 of 1 rows)
+
+        Args:
+            columns (ManyColumnsInputType): Columns to describe. If not specified, all columns will be described.
+
+        Returns:
+            DataFrame: A dataframe with the number of rows, nulls, approximate distinct, min, and max for each column. Column names
+                will be the original column names with `_count`, `_nulls`, `_approx_distinct`, `_min`, and `_max` appended.
+        """
+        builder = self.__builder.describe(self._column_inputs_to_expressions(columns))
+        return DataFrame(builder)
+
     @property
     def column_names(self) -> List[str]:
         """Returns column names of DataFrame as a list of strings.

diff --git a/daft/logical/builder.py b/daft/logical/builder.py
@@ -89,6 +89,15 @@ def schema(self) -> Schema:
         pyschema = self._builder.schema()
         return Schema._from_pyschema(pyschema)
 
+    def describe(
+        self,
+        to_describe: list[Expression],
+    ) -> LogicalPlanBuilder:
+        """Summary stats for each column of the current logical plan."""
+        to_describe_pyexprs = [expr._expr for expr in to_describe]
+        builder = self._builder.describe(to_describe_pyexprs)
+        return LogicalPlanBuilder(builder)
+
     def pretty_print(self, simple: bool = False, format: str = "ascii") -> str:
         """Pretty prints the current underlying logical plan."""
         from daft.dataframe.display import MermaidOptions

diff --git a/docs/source/api_docs/dataframe.rst b/docs/source/api_docs/dataframe.rst
@@ -193,3 +193,12 @@ Schema and Lineage
     DataFrame.explain
     DataFrame.schema
     DataFrame.column_names
+
+Statistics
+##########
+
+.. autosummary::
+    :nosignatures:
+    :toctree: doc_gen/dataframe_methods
+
+    DataFrame.describe
diff --git a/src/daft-logical-plan/src/builder/mod.rs b/src/daft-logical-plan/src/builder/mod.rs
@@ -13,7 +13,10 @@ use common_error::{DaftError, DaftResult};
 use common_file_formats::FileFormat;
 use common_io_config::IOConfig;
 use common_scan_info::{PhysicalScanInfo, Pushdowns, ScanOperatorRef};
-use daft_core::join::{JoinStrategy, JoinType};
+use daft_core::{
+    join::{JoinStrategy, JoinType},
+    prelude::CountMode,
+};
 use daft_dsl::{col, ExprRef};
 use daft_schema::schema::{Schema, SchemaRef};
 use indexmap::IndexSet;
@@ -735,6 +738,45 @@ impl LogicalPlanBuilder {
         self.plan.schema()
     }
 
+    pub fn describe(&self, to_describe: Vec<ExprRef>) -> DaftResult<Self> {
+        let mut agg_exprs = Vec::new();
+        // If no columns are specified, describe all columns.
+        let to_describe = if to_describe.is_empty() {
+            let schema = self.schema();
+            schema
+                .fields
+                .iter()
+                .map(|(name, _)| col(name.as_str()))
+                .collect()
+        } else {
+            let expr_resolver = ExprResolver::default();
+            let (to_describe, _) = expr_resolver.resolve(to_describe, &self.schema())?;
+            to_describe
+        };
+        // For each column, aggregate the count, nulls, approx distinct, min, and max.
+        for expr in &to_describe {
+            let name = expr.name();
+            agg_exprs.push(
+                expr.clone()
+                    .count(CountMode::All)
+                    .alias(format!("{}_count", name).as_str()),
+            );
+            agg_exprs.push(
+                expr.clone()
+                    .count(CountMode::Null)
+                    .alias(format!("{}_nulls", name).as_str()),
+            );
+            agg_exprs.push(
+                expr.clone()
+                    .approx_count_distinct()
+                    .alias(format!("{}_approx_distinct", name).as_str()),
+            );
+            agg_exprs.push(expr.clone().min().alias(format!("{}_min", name).as_str()));
+            agg_exprs.push(expr.clone().max().alias(format!("{}_max", name).as_str()));
+        }
+        self.aggregate(agg_exprs, vec![])
+    }
+
     pub fn repr_ascii(&self, simple: bool) -> String {
         self.plan.repr_ascii(simple)
     }
@@ -1105,6 +1147,14 @@ impl PyLogicalPlanBuilder {
         Ok(self.builder.schema().into())
     }
 
+    pub fn describe(&self, to_describe: Vec<PyExpr>) -> PyResult<Self> {
+        let to_describe_exprs = to_describe
+            .iter()
+            .map(|e| e.clone().into())
+            .collect::<Vec<ExprRef>>();
+        Ok(self.builder.describe(to_describe_exprs)?.into())
+    }
+
     /// Optimize the underlying logical plan, returning a new plan builder containing the optimized plan.
     pub fn optimize(&self, py: Python) -> PyResult<Self> {
         py.allow_threads(|| Ok(self.builder.optimize()?.into()))

diff --git a/tests/dataframe/test_describe.py b/tests/dataframe/test_describe.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+import pytest
+
+import daft
+
+
+def test_describe_dataframe_missing_col() -> None:
+    df = daft.from_pydict({"a": [1, 2, 3], "b": [None, "a", "b"]})
+
+    with pytest.raises(ValueError):
+        df = df.describe(["foo", "b"])
+
+    with pytest.raises(ValueError):
+        df = df.describe("foo")
+
+
+def test_describe_dataframe(make_df, valid_data: list[dict[str, float]]) -> None:
+    df = daft.from_pydict({"a": [1, 2, 3], "b": [None, "a", "b"]})
+    expected = {
+        "a_count": [3],
+        "a_nulls": [0],
+        "a_approx_distinct": [3],
+        "a_min": [1],
+        "a_max": [3],
+        "b_count": [3],
+        "b_nulls": [1],
+        "b_approx_distinct": [2],
+        "b_min": ["a"],
+        "b_max": ["b"],
+    }
+
+    df_all_cols = df.describe(["a", "b"])
+    assert df_all_cols.collect().to_pydict() == expected
+
+    df_none_specified = df.describe()
+    assert df_none_specified.collect().to_pydict() == expected
+
+    expected_one_col = {
+        "a_count": [3],
+        "a_nulls": [0],
+        "a_approx_distinct": [3],
+        "a_min": [1],
+        "a_max": [3],
+    }
+
+    df_one_col = df.describe("a")
+    assert df_one_col.collect().to_pydict() == expected_one_col