From 71f24f18d158f8e71955d2246264adc244575546 Mon Sep 17 00:00:00 2001 From: Nishant Bhakar Date: Tue, 28 Jan 2025 15:33:50 -0800 Subject: [PATCH] refactor: rename remaining unique -> distinct instances --- daft/daft/__init__.pyi | 2 +- daft/expressions/expressions.py | 4 ++-- src/daft-core/src/series/ops/agg.rs | 2 +- src/daft-core/src/series/ops/list.rs | 2 +- .../src/list/{unique.rs => distinct.rs} | 10 +++++----- .../src/list/{unique_count.rs => distinct_count.rs} | 12 ++++++------ src/daft-functions/src/list/mod.rs | 8 ++++---- src/daft-functions/src/python/list.rs | 6 +++--- src/daft-functions/src/python/mod.rs | 4 ++-- .../src/physical_planner/translate.rs | 7 ++++--- 10 files changed, 29 insertions(+), 28 deletions(-) rename src/daft-functions/src/list/{unique.rs => distinct.rs} (95%) rename src/daft-functions/src/list/{unique_count.rs => distinct_count.rs} (80%) diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi index 7e572573d9..862fbc93c4 100644 --- a/daft/daft/__init__.pyi +++ b/daft/daft/__init__.pyi @@ -1161,7 +1161,7 @@ def dt_truncate(expr: PyExpr, interval: str, relative_to: PyExpr) -> PyExpr: ... # --- def explode(expr: PyExpr) -> PyExpr: ... def list_sort(expr: PyExpr, desc: PyExpr, nulls_first: PyExpr) -> PyExpr: ... -def list_unique(expr: PyExpr, ignore_nulls: bool) -> PyExpr: ... +def list_distinct(expr: PyExpr, ignore_nulls: bool) -> PyExpr: ... def list_value_counts(expr: PyExpr) -> PyExpr: ... def list_join(expr: PyExpr, delimiter: PyExpr) -> PyExpr: ... def list_count(expr: PyExpr, mode: CountMode) -> PyExpr: ... diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py index 157c2c452f..76014089e3 100644 --- a/daft/expressions/expressions.py +++ b/daft/expressions/expressions.py @@ -25,8 +25,8 @@ from daft.daft import date_lit as _date_lit from daft.daft import decimal_lit as _decimal_lit from daft.daft import duration_lit as _duration_lit +from daft.daft import list_distinct as _list_distinct from daft.daft import list_sort as _list_sort -from daft.daft import list_unique as _list_unique from daft.daft import lit as _lit from daft.daft import series_lit as _series_lit from daft.daft import time_lit as _time_lit @@ -3264,7 +3264,7 @@ def distinct(self, ignore_nulls: bool = True) -> Expression: Returns: Expression: An expression with lists containing only unique elements """ - return Expression._from_pyexpr(_list_unique(self._expr, ignore_nulls)) + return Expression._from_pyexpr(_list_distinct(self._expr, ignore_nulls)) class ExpressionStructNamespace(ExpressionNamespace): diff --git a/src/daft-core/src/series/ops/agg.rs b/src/daft-core/src/series/ops/agg.rs index 67770cf890..93b5ccff1c 100644 --- a/src/daft-core/src/series/ops/agg.rs +++ b/src/daft-core/src/series/ops/agg.rs @@ -28,7 +28,7 @@ impl Series { } pub fn count_distinct(&self, groups: Option<&GroupIndices>) -> DaftResult { - let series = self.agg_list(groups)?.list_unique_count()?; + let series = self.agg_list(groups)?.list_distinct_count()?; Ok(series) } diff --git a/src/daft-core/src/series/ops/list.rs b/src/daft-core/src/series/ops/list.rs index 30b21e15ef..06b9680a4b 100644 --- a/src/daft-core/src/series/ops/list.rs +++ b/src/daft-core/src/series/ops/list.rs @@ -187,7 +187,7 @@ impl Series { /// ```txt /// [[1, 2, 3], [1, 1, 1], [NULL, NULL, 5]] -> [3, 1, 1] /// ``` - pub fn list_unique_count(&self) -> DaftResult { + pub fn list_distinct_count(&self) -> DaftResult { let field = Field::new(self.name(), DataType::UInt64); match self.data_type() { DataType::List(..) => { diff --git a/src/daft-functions/src/list/unique.rs b/src/daft-functions/src/list/distinct.rs similarity index 95% rename from src/daft-functions/src/list/unique.rs rename to src/daft-functions/src/list/distinct.rs index 8088b8015d..b94deed04c 100644 --- a/src/daft-functions/src/list/unique.rs +++ b/src/daft-functions/src/list/distinct.rs @@ -15,18 +15,18 @@ use daft_dsl::{ use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub struct ListUnique { +pub struct ListDistinct { ignore_nulls: bool, } #[typetag::serde] -impl ScalarUDF for ListUnique { +impl ScalarUDF for ListDistinct { fn as_any(&self) -> &dyn Any { self } fn name(&self) -> &'static str { - "list_unique" + "list_distinct" } fn to_field(&self, inputs: &[ExprRef], schema: &Schema) -> DaftResult { @@ -136,6 +136,6 @@ impl ScalarUDF for ListUnique { /// /// When ignore_nulls is true (default), nulls are excluded from the result. /// When ignore_nulls is false, nulls are included in the result. -pub fn list_unique(expr: ExprRef, ignore_nulls: bool) -> ExprRef { - ScalarFunction::new(ListUnique { ignore_nulls }, vec![expr]).into() +pub fn list_distinct(expr: ExprRef, ignore_nulls: bool) -> ExprRef { + ScalarFunction::new(ListDistinct { ignore_nulls }, vec![expr]).into() } diff --git a/src/daft-functions/src/list/unique_count.rs b/src/daft-functions/src/list/distinct_count.rs similarity index 80% rename from src/daft-functions/src/list/unique_count.rs rename to src/daft-functions/src/list/distinct_count.rs index 0d15e9b94f..b71c1ca32a 100644 --- a/src/daft-functions/src/list/unique_count.rs +++ b/src/daft-functions/src/list/distinct_count.rs @@ -12,16 +12,16 @@ use daft_dsl::{ use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub struct ListUniqueCount; +pub struct ListDistinctCount; #[typetag::serde] -impl ScalarUDF for ListUniqueCount { +impl ScalarUDF for ListDistinctCount { fn as_any(&self) -> &dyn Any { self } fn name(&self) -> &'static str { - "list_unique_count" + "list_distinct_count" } fn to_field(&self, inputs: &[ExprRef], schema: &Schema) -> DaftResult { @@ -39,7 +39,7 @@ impl ScalarUDF for ListUniqueCount { fn evaluate(&self, inputs: &[Series]) -> DaftResult { match inputs { - [input] => input.list_unique_count(), + [input] => input.list_distinct_count(), _ => Err(DaftError::SchemaMismatch(format!( "Expected 1 input arg, got {}", inputs.len() @@ -49,6 +49,6 @@ impl ScalarUDF for ListUniqueCount { } #[must_use] -pub fn list_unique_count(expr: ExprRef) -> ExprRef { - ScalarFunction::new(ListUniqueCount, vec![expr]).into() +pub fn list_distinct_count(expr: ExprRef) -> ExprRef { + ScalarFunction::new(ListDistinctCount, vec![expr]).into() } diff --git a/src/daft-functions/src/list/mod.rs b/src/daft-functions/src/list/mod.rs index 455b460324..f717b9e268 100644 --- a/src/daft-functions/src/list/mod.rs +++ b/src/daft-functions/src/list/mod.rs @@ -1,5 +1,7 @@ mod chunk; mod count; +mod distinct; +mod distinct_count; mod explode; mod get; mod join; @@ -10,12 +12,12 @@ mod min; mod slice; mod sort; mod sum; -mod unique; -mod unique_count; mod value_counts; pub use chunk::{list_chunk as chunk, ListChunk}; pub use count::{list_count as count, ListCount}; +pub use distinct::{list_distinct as distinct, ListDistinct}; +pub use distinct_count::{list_distinct_count as distinct_count, ListDistinctCount}; pub use explode::{explode, Explode}; pub use get::{list_get as get, ListGet}; pub use join::{list_join as join, ListJoin}; @@ -26,6 +28,4 @@ pub use min::{list_min as min, ListMin}; pub use slice::{list_slice as slice, ListSlice}; pub use sort::{list_sort as sort, ListSort}; pub use sum::{list_sum as sum, ListSum}; -pub use unique::{list_unique as unique, ListUnique}; -pub use unique_count::{list_unique_count as unique_count, ListUniqueCount}; pub use value_counts::list_value_counts as value_counts; diff --git a/src/daft-functions/src/python/list.rs b/src/daft-functions/src/python/list.rs index 57b82c6331..be5e3e3e75 100644 --- a/src/daft-functions/src/python/list.rs +++ b/src/daft-functions/src/python/list.rs @@ -3,7 +3,7 @@ use daft_dsl::python::PyExpr; use pyo3::{pyfunction, PyResult}; simple_python_wrapper!(list_chunk, crate::list::chunk, [expr: PyExpr, size: usize]); -simple_python_wrapper!(list_unique_count, crate::list::unique_count, [expr: PyExpr]); +simple_python_wrapper!(list_distinct_count, crate::list::distinct_count, [expr: PyExpr]); simple_python_wrapper!(list_count, crate::list::count, [expr: PyExpr, mode: CountMode]); simple_python_wrapper!(explode, crate::list::explode, [expr: PyExpr]); simple_python_wrapper!(list_get, crate::list::get, [expr: PyExpr, idx: PyExpr, default_value: PyExpr]); @@ -17,8 +17,8 @@ simple_python_wrapper!(list_value_counts, crate::list::value_counts, [expr: PyEx #[pyfunction] #[pyo3(signature = (expr, ignore_nulls=true))] -pub fn list_unique(expr: PyExpr, ignore_nulls: bool) -> PyResult { - Ok(crate::list::unique(expr.into(), ignore_nulls).into()) +pub fn list_distinct(expr: PyExpr, ignore_nulls: bool) -> PyResult { + Ok(crate::list::distinct(expr.into(), ignore_nulls).into()) } #[pyfunction] diff --git a/src/daft-functions/src/python/mod.rs b/src/daft-functions/src/python/mod.rs index abbcf0a463..c30a92d57f 100644 --- a/src/daft-functions/src/python/mod.rs +++ b/src/daft-functions/src/python/mod.rs @@ -65,9 +65,9 @@ pub fn register(parent: &Bound) -> PyResult<()> { add!(list::list_slice); add!(list::list_sort); add!(list::list_sum); - add!(list::list_unique_count); + add!(list::list_distinct_count); add!(list::list_value_counts); - add!(list::list_unique); + add!(list::list_distinct); add!(misc::to_struct); add!(misc::utf8_count_matches); diff --git a/src/daft-physical-plan/src/physical_planner/translate.rs b/src/daft-physical-plan/src/physical_planner/translate.rs index 49329f1600..8f79985a75 100644 --- a/src/daft-physical-plan/src/physical_planner/translate.rs +++ b/src/daft-physical-plan/src/physical_planner/translate.rs @@ -14,7 +14,7 @@ use daft_dsl::{ ApproxPercentileParams, Expr, ExprRef, SketchType, }; use daft_functions::{ - list::{unique, unique_count}, + list::{distinct, distinct_count}, numeric::sqrt, }; use daft_logical_plan::{ @@ -929,7 +929,7 @@ pub fn populate_aggregation_stages( ); // Final projection - let result = unique_count(col(list_concat_id.clone())).alias(output_name); + let result = distinct_count(col(list_concat_id.clone())).alias(output_name); final_exprs.push(result); } AggExpr::Sum(e) => { @@ -1107,7 +1107,8 @@ pub fn populate_aggregation_stages( schema, &mut second_stage_aggs, ); - let result = unique(col(list_concat_id.clone()), *ignore_nulls).alias(output_name); + let result = + distinct(col(list_concat_id.clone()), *ignore_nulls).alias(output_name); final_exprs.push(result); } AggExpr::Concat(e) => {