apache · alamb · Feb 3, 2025 · Jan 25, 2025 · Jan 25, 2025 · Jan 25, 2025
diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs
@@ -19,6 +19,7 @@
 //! and return types of functions in DataFusion.
 
 use std::fmt::Display;
+use std::num::NonZeroUsize;
 
 use crate::type_coercion::aggregates::NUMERICS;
 use arrow::datatypes::{DataType, IntervalUnit, TimeUnit};
@@ -236,9 +237,9 @@ pub enum ArrayFunctionSignature {
     /// The first argument should be non-list or list, and the second argument should be List/LargeList.
     /// The first argument's list dimension should be one dimension less than the second argument's list dimension.
     ElementAndArray,
-    /// Specialized Signature for Array functions of the form (List/LargeList, Index)
-    /// The first argument should be List/LargeList/FixedSizedList, and the second argument should be Int64.
-    ArrayAndIndex,
+    /// Specialized Signature for Array functions of the form (List/LargeList, Index+), i.e. an array and one or more indexes.
+    /// The first argument should be List/LargeList/FixedSizeList, and the next `n` arguments (`n` = the wrapped count) should be Int64.
+    ArrayAndIndexes(NonZeroUsize),
     /// Specialized Signature for Array functions of the form (List/LargeList, Element, Optional Index)
     ArrayAndElementAndOptionalIndex,
     /// Specialized Signature for ArrayEmpty and similar functions
@@ -265,8 +266,12 @@ impl Display for ArrayFunctionSignature {
             ArrayFunctionSignature::ElementAndArray => {
                 write!(f, "element, array")
             }
-            ArrayFunctionSignature::ArrayAndIndex => {
-                write!(f, "array, index")
+            ArrayFunctionSignature::ArrayAndIndexes(count) => {
+                write!(f, "array")?;
+                for _ in 0..count.get() {
+                    write!(f, ", index")?;
+                }
+                Ok(())
             }
             ArrayFunctionSignature::Array => {
                 write!(f, "array")
@@ -600,9 +605,13 @@ impl Signature {
     }
     /// Specialized Signature for ArrayElement and similar functions
     pub fn array_and_index(volatility: Volatility) -> Self {
+        Self::array_and_indexes(volatility, NonZeroUsize::new(1).expect("1 is non-zero"))
+    }
+    /// Specialized Signature for ArraySlice and similar functions: an array followed by `count` Int64 indexes
+    pub fn array_and_indexes(volatility: Volatility, count: NonZeroUsize) -> Self {
         Signature {
             type_signature: TypeSignature::ArraySignature(
-                ArrayFunctionSignature::ArrayAndIndex,
+                ArrayFunctionSignature::ArrayAndIndexes(count),
             ),
             volatility,
         }

diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs
@@ -94,8 +94,8 @@ pub use udaf::{
     aggregate_doc_sections, AggregateUDF, AggregateUDFImpl, ReversedUDAF, StatisticsArgs,
 };
 pub use udf::{
-    scalar_doc_sections, ReturnInfo, ReturnTypeArgs, ScalarFunctionArgs, ScalarUDF,
-    ScalarUDFImpl,
+    scalar_doc_sections, NullHandling, ReturnInfo, ReturnTypeArgs, ScalarFunctionArgs,
+    ScalarUDF, ScalarUDFImpl,
 };
 pub use udwf::{window_doc_sections, ReversedUDWF, WindowUDF, WindowUDFImpl};
 pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits};

diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs
@@ -671,13 +671,20 @@ fn get_valid_types(
             ArrayFunctionSignature::ElementAndArray => {
                 array_append_or_prepend_valid_types(current_types, false)?
             }
-            ArrayFunctionSignature::ArrayAndIndex => {
-                if current_types.len() != 2 {
+            ArrayFunctionSignature::ArrayAndIndexes(count) => {
+                if current_types.len() != count.get() + 1 {
                     return Ok(vec![vec![]]);
                 }
                 array(&current_types[0]).map_or_else(
                     || vec![vec![]],
-                    |array_type| vec![vec![array_type, DataType::Int64]],
+                    |array_type| {
+                        let mut inner = Vec::with_capacity(count.get() + 1);
+                        inner.push(array_type);
+                        for _ in 0..count.get() {
+                            inner.push(DataType::Int64);
+                        }
+                        vec![inner]
+                    },
                 )
             }
             ArrayFunctionSignature::ArrayAndElementAndOptionalIndex => {

diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs
@@ -389,7 +389,7 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
 
     /// Whether the aggregate function is nullable.
     ///
-    /// Nullable means that that the function could return `null` for any inputs.
+    /// Nullable means that the function could return `null` for any inputs.
     /// For example, aggregate functions like `COUNT` always return a non null value
     /// but others like `MIN` will return `NULL` if there is nullable input.
     /// Note that if the function is declared as *not* nullable, make sure the [`AggregateUDFImpl::default_value`] is `non-null`

diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs
@@ -200,6 +200,11 @@ impl ScalarUDF {
         self.inner.return_type_from_args(args)
     }
 
+    /// Returns this function's Null-input behavior, as reported by `self.inner`.
+    pub fn null_handling(&self) -> NullHandling {
+        self.inner.null_handling()
+    }
+
     /// Do the function rewrite
     ///
     /// See [`ScalarUDFImpl::simplify`] for more details.
@@ -417,6 +422,15 @@ impl ReturnInfo {
     }
 }
 
+/// How a scalar function behaves when one or more of its inputs are Null.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub enum NullHandling {
+    /// Null inputs are passed into the function implementation like any other value.
+    PassThrough,
+    /// Any Null input makes the function's result Null.
+    Propagate,
+}
+
 /// Trait for implementing user defined scalar functions.
 ///
 /// This trait exposes the full API for implementing user defined functions and
@@ -589,6 +603,11 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
         true
     }
 
+    /// Null-input behavior of this function. Defaults to [`NullHandling::PassThrough`].
+    fn null_handling(&self) -> NullHandling {
+        NullHandling::PassThrough
+    }
+
     /// Invoke the function on `args`, returning the appropriate result
     ///
     /// Note: This method is deprecated and will be removed in future releases.

diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs
@@ -27,6 +27,7 @@ use arrow::array::MutableArrayData;
 use arrow::array::OffsetSizeTrait;
 use arrow::buffer::OffsetBuffer;
 use arrow::datatypes::DataType;
+use arrow_buffer::NullBufferBuilder;
 use arrow_schema::DataType::{FixedSizeList, LargeList, List};
 use arrow_schema::Field;
 use datafusion_common::cast::as_int64_array;
@@ -35,12 +36,13 @@ use datafusion_common::cast::as_list_array;
 use datafusion_common::{
     exec_err, internal_datafusion_err, plan_err, DataFusionError, Result,
 };
-use datafusion_expr::Expr;
+use datafusion_expr::{ArrayFunctionSignature, Expr, TypeSignature};
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, NullHandling, ScalarUDFImpl, Signature, Volatility,
 };
 use datafusion_macros::user_doc;
 use std::any::Any;
+use std::num::NonZeroUsize;
 use std::sync::Arc;
 
 use crate::utils::make_scalar_function;
@@ -330,7 +332,26 @@ pub(super) struct ArraySlice {
 impl ArraySlice {
     pub fn new() -> Self {
         Self {
-            signature: Signature::variadic_any(Volatility::Immutable),
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::ArraySignature(
+                        ArrayFunctionSignature::ArrayAndIndexes(
+                            NonZeroUsize::new(1).expect("1 is non-zero"), // array + 1 index
+                        ),
+                    ),
+                    TypeSignature::ArraySignature(
+                        ArrayFunctionSignature::ArrayAndIndexes(
+                            NonZeroUsize::new(2).expect("2 is non-zero"), // array + 2 indexes
+                        ),
+                    ),
+                    TypeSignature::ArraySignature(
+                        ArrayFunctionSignature::ArrayAndIndexes(
+                            NonZeroUsize::new(3).expect("3 is non-zero"), // array + 3 indexes
+                        ),
+                    ),
+                ],
+                Volatility::Immutable,
+            ),
             aliases: vec![String::from("list_slice")],
         }
     }
@@ -374,6 +395,10 @@ impl ScalarUDFImpl for ArraySlice {
         Ok(arg_types[0].clone())
     }
 
+    fn null_handling(&self) -> NullHandling {
+        NullHandling::Propagate // any Null argument makes the result Null
+    }
+
     fn invoke_batch(
         &self,
         args: &[ColumnarValue],
@@ -430,8 +455,6 @@ fn array_slice_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
         }
         LargeList(_) => {
             let array = as_large_list_array(&args[0])?;
-            let from_array = as_int64_array(&args[1])?;
-            let to_array = as_int64_array(&args[2])?;
             general_array_slice::<i64>(array, from_array, to_array, stride)
         }
         _ => exec_err!("array_slice does not support type: {:?}", array_data_type),
@@ -451,9 +474,8 @@ where
     let original_data = values.to_data();
     let capacity = Capacities::Array(original_data.len());
 
-    // use_nulls: false, we don't need nulls but empty array for array_slice, so we don't need explicit nulls but adjust offset to indicate nulls.
     let mut mutable =
-        MutableArrayData::with_capacities(vec![&original_data], false, capacity);
+        MutableArrayData::with_capacities(vec![&original_data], true, capacity);
 
     // We have the slice syntax compatible with DuckDB v0.8.1.
     // The rule `adjusted_from_index` and `adjusted_to_index` follows the rule of array_slice in duckdb.
@@ -516,30 +538,33 @@ where
     }
 
     let mut offsets = vec![O::usize_as(0)];
+    let mut null_builder = NullBufferBuilder::new(array.len());
 
     for (row_index, offset_window) in array.offsets().windows(2).enumerate() {
         let start = offset_window[0];
         let end = offset_window[1];
         let len = end - start;
 
-        // len 0 indicate array is null, return empty array in this row.
+        // A null array, `from` or `to` yields a null row (backed by one null child slot).
+        if array.is_null(row_index)
+            || from_array.is_null(row_index)
+            || to_array.is_null(row_index)
+        {
+            mutable.extend_nulls(1);
+            offsets.push(offsets[row_index] + O::usize_as(1));
+            null_builder.append_null();
+            continue;
+        }
+        null_builder.append_non_null();
+
+        // Empty arrays always return an empty array.
         if len == O::usize_as(0) {
             offsets.push(offsets[row_index]);
             continue;
         }
 
-        // If index is null, we consider it as the minimum / maximum index of the array.
-        let from_index = if from_array.is_null(row_index) {
-            Some(O::usize_as(0))
-        } else {
-            adjusted_from_index::<O>(from_array.value(row_index), len)?
-        };
-
-        let to_index = if to_array.is_null(row_index) {
-            Some(len - O::usize_as(1))
-        } else {
-            adjusted_to_index::<O>(to_array.value(row_index), len)?
-        };
+        let from_index = adjusted_from_index::<O>(from_array.value(row_index), len)?;
+        let to_index = adjusted_to_index::<O>(to_array.value(row_index), len)?;
 
         if let (Some(from), Some(to)) = (from_index, to_index) {
             let stride = stride.map(|s| s.value(row_index));
@@ -613,7 +638,7 @@ where
         Arc::new(Field::new_list_field(array.value_type(), true)),
         OffsetBuffer::<O>::new(offsets.into()),
         arrow_array::make_array(data),
-        None,
+        null_builder.finish(),
     )?))
 }
 
@@ -665,6 +690,10 @@ impl ScalarUDFImpl for ArrayPopFront {
         Ok(arg_types[0].clone())
     }
 
+    fn null_handling(&self) -> NullHandling {
+        NullHandling::Propagate // any Null argument makes the result Null
+    }
+
     fn invoke_batch(
         &self,
         args: &[ColumnarValue],
@@ -765,6 +794,10 @@ impl ScalarUDFImpl for ArrayPopBack {
         Ok(arg_types[0].clone())
     }
 
+    fn null_handling(&self) -> NullHandling {
+        NullHandling::Propagate // any Null argument makes the result Null
+    }
+
     fn invoke_batch(
         &self,
         args: &[ColumnarValue],

diff --git a/datafusion/physical-expr/src/scalar_function.rs b/datafusion/physical-expr/src/scalar_function.rs
@@ -45,7 +45,8 @@ use datafusion_expr::interval_arithmetic::Interval;
 use datafusion_expr::sort_properties::ExprProperties;
 use datafusion_expr::type_coercion::functions::data_types_with_scalar_udf;
 use datafusion_expr::{
-    expr_vec_fmt, ColumnarValue, Expr, ReturnTypeArgs, ScalarFunctionArgs, ScalarUDF,
+    expr_vec_fmt, ColumnarValue, Expr, NullHandling, ReturnTypeArgs, ScalarFunctionArgs,
+    ScalarUDF,
 };
 
 /// Physical expression of a scalar function
@@ -186,6 +187,15 @@ impl PhysicalExpr for ScalarFunctionExpr {
             .map(|e| e.evaluate(batch))
             .collect::<Result<Vec<_>>>()?;
 
+        if self.fun.null_handling() == NullHandling::Propagate
+            && args.iter().any(
+                |arg| matches!(arg, ColumnarValue::Scalar(scalar) if scalar.is_null()),
+            )
+        {
+            let null_value = ScalarValue::try_from(&self.return_type)?;
+            return Ok(ColumnarValue::Scalar(null_value));
+        }
+
         let input_empty = args.is_empty();
         let input_all_scalar = args
             .iter()