Skip to content

Commit

Permalink
Implement
Browse files Browse the repository at this point in the history
  • Loading branch information
desmondcheongzx committed Jan 18, 2025
1 parent 5549d16 commit bf07945
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 1 deletion.
1 change: 1 addition & 0 deletions daft/daft/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1688,6 +1688,7 @@ class LogicalPlanBuilder:
kwargs: dict[str, Any] | None = None,
) -> LogicalPlanBuilder: ...
# Returns the schema of the plan held by this builder.
def schema(self) -> PySchema: ...
# Returns a new builder whose plan aggregates summary statistics for the
# expressions in `to_describe`.
# NOTE(review): the Rust builder treats an empty list as "describe every
# column" — confirm this stub is bound to that implementation.
def describe(self, to_describe: list[PyExpr]) -> LogicalPlanBuilder: ...
# Returns a new builder containing the optimized plan.
def optimize(self) -> LogicalPlanBuilder: ...
def to_physical_plan_scheduler(self, cfg: PyDaftExecutionConfig) -> PhysicalPlanScheduler: ...
def to_adaptive_physical_plan_scheduler(self, cfg: PyDaftExecutionConfig) -> AdaptivePhysicalPlanScheduler: ...
Expand Down
41 changes: 41 additions & 0 deletions daft/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,47 @@ def schema(self) -> Schema:
"""
return self.__builder.schema()

@DataframePublicAPI
def describe(self, columns: ManyColumnsInputType = []) -> "DataFrame":
    """Computes summary statistics for columns of this DataFrame.

    For every described column the result contains the number of rows, the
    number of nulls, the approximate number of distinct values, the minimum,
    and the maximum. This method is intended to aid data exploration and its
    API is subject to change.

    Example:
        >>> import daft
        >>> df = daft.from_pydict({"a": [1, 2, 3], "b": [None, "a", "b"]})
        >>> df.describe("a").show()
        ╭─────────┬─────────┬───────────────────┬───────┬───────╮
        │ a_count ┆ a_nulls ┆ a_approx_distinct ┆ a_min ┆ a_max │
        │ ---     ┆ ---     ┆ ---               ┆ ---   ┆ ---   │
        │ UInt64  ┆ UInt64  ┆ UInt64            ┆ Int64 ┆ Int64 │
        ╞═════════╪═════════╪═══════════════════╪═══════╪═══════╡
        │ 3       ┆ 0       ┆ 3                 ┆ 1     ┆ 3     │
        ╰─────────┴─────────┴───────────────────┴───────┴───────╯
        <BLANKLINE>
        (Showing first 1 of 1 rows)
        >>> df.describe().show()
        ╭─────────┬─────────┬───────────────────┬───────┬───────┬─────────┬─────────┬───────────────────┬───────┬───────╮
        │ a_count ┆ a_nulls ┆ a_approx_distinct ┆ a_min ┆ a_max ┆ b_count ┆ b_nulls ┆ b_approx_distinct ┆ b_min ┆ b_max │
        │ ---     ┆ ---     ┆ ---               ┆ ---   ┆ ---   ┆ ---     ┆ ---     ┆ ---               ┆ ---   ┆ ---   │
        │ UInt64  ┆ UInt64  ┆ UInt64            ┆ Int64 ┆ Int64 ┆ UInt64  ┆ UInt64  ┆ UInt64            ┆ Utf8  ┆ Utf8  │
        ╞═════════╪═════════╪═══════════════════╪═══════╪═══════╪═════════╪═════════╪═══════════════════╪═══════╪═══════╡
        │ 3       ┆ 0       ┆ 3                 ┆ 1     ┆ 3     ┆ 3       ┆ 1       ┆ 2                 ┆ a     ┆ b     │
        ╰─────────┴─────────┴───────────────────┴───────┴───────┴─────────┴─────────┴───────────────────┴───────┴───────╯
        <BLANKLINE>
        (Showing first 1 of 1 rows)

    Args:
        columns (ManyColumnsInputType): Columns to describe. If not specified,
            all columns will be described.

    Returns:
        DataFrame: A dataframe holding the number of rows, nulls, approximate
            distinct values, min, and max for each described column. Output
            column names are the original column names with `_count`,
            `_nulls`, `_approx_distinct`, `_min`, and `_max` appended.
    """
    return DataFrame(
        self.__builder.describe(self._column_inputs_to_expressions(columns))
    )

@property
def column_names(self) -> List[str]:
"""Returns column names of DataFrame as a list of strings.
Expand Down
9 changes: 9 additions & 0 deletions daft/logical/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,15 @@ def schema(self) -> Schema:
pyschema = self._builder.schema()
return Schema._from_pyschema(pyschema)

def describe(self, to_describe: list[Expression]) -> LogicalPlanBuilder:
    """Builds a plan that computes summary statistics for the given expressions.

    Args:
        to_describe: Expressions to summarize. Each one is unwrapped to its
            underlying `_expr` before being handed to the wrapped builder.

    Returns:
        LogicalPlanBuilder: A new builder wrapping the plan returned by the
            underlying builder's `describe`.
    """
    pyexprs = [expression._expr for expression in to_describe]
    return LogicalPlanBuilder(self._builder.describe(pyexprs))

def pretty_print(self, simple: bool = False, format: str = "ascii") -> str:
"""Pretty prints the current underlying logical plan."""
from daft.dataframe.display import MermaidOptions
Expand Down
9 changes: 9 additions & 0 deletions docs/source/api_docs/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,3 +193,12 @@ Schema and Lineage
DataFrame.explain
DataFrame.schema
DataFrame.column_names

Statistics
##########

.. autosummary::
:nosignatures:
:toctree: doc_gen/dataframe_methods

DataFrame.describe
52 changes: 51 additions & 1 deletion src/daft-logical-plan/src/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ use common_error::{DaftError, DaftResult};
use common_file_formats::FileFormat;
use common_io_config::IOConfig;
use common_scan_info::{PhysicalScanInfo, Pushdowns, ScanOperatorRef};
use daft_core::join::{JoinStrategy, JoinType};
use daft_core::{
join::{JoinStrategy, JoinType},
prelude::CountMode,
};
use daft_dsl::{col, ExprRef};
use daft_schema::schema::{Schema, SchemaRef};
use indexmap::IndexSet;
Expand Down Expand Up @@ -735,6 +738,45 @@ impl LogicalPlanBuilder {
self.plan.schema()
}

/// Builds a plan that aggregates summary statistics for the given expressions.
///
/// When `to_describe` is empty, every field of the current schema is
/// described. Otherwise the expressions are first resolved against the
/// current schema. Each described expression contributes five aggregations,
/// aliased `<name>_count` (`CountMode::All`), `<name>_nulls`
/// (`CountMode::Null`), `<name>_approx_distinct`, `<name>_min`, and
/// `<name>_max`, where `<name>` is the expression's name. The aggregation has
/// no group-by keys.
///
/// # Errors
///
/// Propagates any error from expression resolution or from `aggregate`.
pub fn describe(&self, to_describe: Vec<ExprRef>) -> DaftResult<Self> {
    let targets = if to_describe.is_empty() {
        // Nothing was requested explicitly: fall back to every schema field.
        self.schema()
            .fields
            .iter()
            .map(|(field_name, _)| col(field_name.as_str()))
            .collect()
    } else {
        let (resolved, _) = ExprResolver::default().resolve(to_describe, &self.schema())?;
        resolved
    };

    // Five statistics per target, emitted in a fixed order so the output
    // columns for one input stay adjacent.
    let agg_exprs = targets
        .iter()
        .flat_map(|target| {
            let name = target.name();
            let labelled = |suffix: &str| format!("{}_{}", name, suffix);
            [
                target
                    .clone()
                    .count(CountMode::All)
                    .alias(labelled("count").as_str()),
                target
                    .clone()
                    .count(CountMode::Null)
                    .alias(labelled("nulls").as_str()),
                target
                    .clone()
                    .approx_count_distinct()
                    .alias(labelled("approx_distinct").as_str()),
                target.clone().min().alias(labelled("min").as_str()),
                target.clone().max().alias(labelled("max").as_str()),
            ]
        })
        .collect::<Vec<_>>();

    self.aggregate(agg_exprs, vec![])
}

/// Returns the string produced by the underlying plan's `repr_ascii`,
/// forwarding the `simple` flag unchanged.
pub fn repr_ascii(&self, simple: bool) -> String {
self.plan.repr_ascii(simple)
}
Expand Down Expand Up @@ -1105,6 +1147,14 @@ impl PyLogicalPlanBuilder {
Ok(self.builder.schema().into())
}

/// Python-facing entry point for `describe`.
///
/// Converts each `PyExpr` into an `ExprRef`, delegates to the wrapped
/// builder's `describe`, and wraps the resulting builder.
///
/// # Errors
///
/// Returns the wrapped builder's error, converted into a Python exception.
pub fn describe(&self, to_describe: Vec<PyExpr>) -> PyResult<Self> {
    // `to_describe` is owned, so consume it and convert each element by
    // value instead of cloning every expression out of a borrowed iterator.
    let to_describe_exprs: Vec<ExprRef> = to_describe.into_iter().map(Into::into).collect();
    Ok(self.builder.describe(to_describe_exprs)?.into())
}

/// Optimize the underlying logical plan, returning a new plan builder containing the optimized plan.
pub fn optimize(&self, py: Python) -> PyResult<Self> {
py.allow_threads(|| Ok(self.builder.optimize()?.into()))
Expand Down
48 changes: 48 additions & 0 deletions tests/dataframe/test_describe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from __future__ import annotations

import pytest

import daft


def test_describe_dataframe_missing_col() -> None:
    """Describing a column that is not in the DataFrame raises ``ValueError``."""
    df = daft.from_pydict({"a": [1, 2, 3], "b": [None, "a", "b"]})

    # Exercise both the list form and the single-name form of the argument.
    for missing_columns in (["foo", "b"], "foo"):
        with pytest.raises(ValueError):
            df.describe(missing_columns)


def test_describe_dataframe(make_df, valid_data: list[dict[str, float]]) -> None:
    """Checks describe() output for explicit columns, no columns, and a single column."""
    df = daft.from_pydict({"a": [1, 2, 3], "b": [None, "a", "b"]})

    stats_a = {
        "a_count": [3],
        "a_nulls": [0],
        "a_approx_distinct": [3],
        "a_min": [1],
        "a_max": [3],
    }
    stats_b = {
        "b_count": [3],
        "b_nulls": [1],
        "b_approx_distinct": [2],
        "b_min": ["a"],
        "b_max": ["b"],
    }
    stats_all = {**stats_a, **stats_b}

    # Naming every column explicitly and naming none must give the same result.
    assert df.describe(["a", "b"]).collect().to_pydict() == stats_all
    assert df.describe().collect().to_pydict() == stats_all

    # A single column name passed as a bare string yields only that column's stats.
    assert df.describe("a").collect().to_pydict() == stats_a

0 comments on commit bf07945

Please sign in to comment.