Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support On-Demand Repartition #14411

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 12 additions & 3 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
PREFER_ROUND_ROBIN=${PREFER_ROUND_ROBIN:-true}
VIRTUAL_ENV=${VIRTUAL_ENV:-$SCRIPT_DIR/venv}

usage() {
Expand Down Expand Up @@ -93,6 +94,7 @@ CARGO_COMMAND command that runs the benchmark binary
DATAFUSION_DIR directory to use (default $DATAFUSION_DIR)
RESULTS_NAME folder where the benchmark files are stored
PREFER_HASH_JOIN Prefer hash join algorithm (default true)
PREFER_ROUND_ROBIN Prefer round robin partitioning (default true)
VENV_PATH Python venv to use for compare and venv commands (default ./venv, override by <your-venv>/bin/activate)
"
exit 1
Expand Down Expand Up @@ -163,6 +165,9 @@ main() {
tpch10)
data_tpch "10"
;;
tpch50)
data_tpch "50"
;;
tpch_mem10)
# same data as for tpch10
data_tpch "10"
Expand Down Expand Up @@ -220,6 +225,7 @@ main() {
echo "RESULTS_DIR: ${RESULTS_DIR}"
echo "CARGO_COMMAND: ${CARGO_COMMAND}"
echo "PREFER_HASH_JOIN: ${PREFER_HASH_JOIN}"
echo "PREFER_ROUND_ROBIN: ${PREFER_ROUND_ROBIN}"
echo "***************************"

# navigate to the appropriate directory
Expand Down Expand Up @@ -252,6 +258,9 @@ main() {
tpch10)
run_tpch "10"
;;
tpch50)
run_tpch "50"
;;
tpch_mem10)
run_tpch_mem "10"
;;
Expand Down Expand Up @@ -378,7 +387,7 @@ run_tpch() {
RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch benchmark..."
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --prefer_round_robin "${PREFER_ROUND_ROBIN}" --format parquet -o "${RESULTS_FILE}"
}

# Runs the tpch in memory
Expand All @@ -394,7 +403,7 @@ run_tpch_mem() {
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch_mem benchmark..."
# -m means in memory
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}"
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --prefer_round_robin "${PREFER_ROUND_ROBIN}" --format parquet -o "${RESULTS_FILE}"
}

# Runs the parquet filter benchmark
Expand Down Expand Up @@ -472,7 +481,7 @@ run_clickbench_partitioned() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (partitioned, 100 files) benchmark..."
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --prefer_round_robin "${PREFER_ROUND_ROBIN}" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
}

# Runs the clickbench "extended" benchmark with a single large parquet file
Expand Down
13 changes: 13 additions & 0 deletions benchmarks/src/clickbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ use datafusion_common::exec_datafusion_err;
use datafusion_common::instant::Instant;
use structopt::StructOpt;

// hack to avoid `default_value is meaningless for bool` errors
type BoolDefaultTrue = bool;

/// Run the clickbench benchmark
///
/// The ClickBench[1] benchmarks are widely cited in the industry and
Expand Down Expand Up @@ -68,6 +71,11 @@ pub struct RunOpt {
/// If present, write results json here
#[structopt(parse(from_os_str), short = "o", long = "output")]
output_path: Option<PathBuf>,

/// If true then round robin repartitioning is used; if false then on demand repartitioning is used.
/// True by default.
#[structopt(short = "r", long = "prefer_round_robin", default_value = "true")]
prefer_round_robin: BoolDefaultTrue,
}

struct AllQueries {
Expand Down Expand Up @@ -124,6 +132,11 @@ impl RunOpt {
parquet_options.binary_as_string = true;
}

config
.options_mut()
.optimizer
.prefer_round_robin_repartition = self.prefer_round_robin;

let ctx = SessionContext::new_with_config(config);
self.register_hits(&ctx).await?;

Expand Down
11 changes: 11 additions & 0 deletions benchmarks/src/tpch/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ pub struct RunOpt {
/// True by default.
#[structopt(short = "j", long = "prefer_hash_join", default_value = "true")]
prefer_hash_join: BoolDefaultTrue,

/// If true then round robin repartitioning is used; if false then on demand repartitioning is used.
/// True by default.
#[structopt(short = "r", long = "prefer_round_robin", default_value = "true")]
prefer_round_robin: BoolDefaultTrue,
}

const TPCH_QUERY_START_ID: usize = 1;
Expand Down Expand Up @@ -121,6 +126,10 @@ impl RunOpt {
.config()
.with_collect_statistics(!self.disable_statistics);
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
config
.options_mut()
.optimizer
.prefer_round_robin_repartition = self.prefer_round_robin;
let ctx = SessionContext::new_with_config(config);

// register tables
Expand Down Expand Up @@ -353,6 +362,7 @@ mod tests {
output_path: None,
disable_statistics: false,
prefer_hash_join: true,
prefer_round_robin: true,
};
opt.register_tables(&ctx).await?;
let queries = get_query_sql(query)?;
Expand Down Expand Up @@ -386,6 +396,7 @@ mod tests {
output_path: None,
disable_statistics: false,
prefer_hash_join: true,
prefer_round_robin: true,
};
opt.register_tables(&ctx).await?;
let queries = get_query_sql(query)?;
Expand Down
4 changes: 4 additions & 0 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,10 @@ config_namespace! {
/// repartitioning to increase parallelism to leverage more CPU cores
pub enable_round_robin_repartition: bool, default = true

/// When set to false, the physical plan optimizer will replace round robin
/// repartitioning with on-demand repartitioning.
pub prefer_round_robin_repartition: bool, default = true

/// When set to true, the optimizer will attempt to perform limit operations
/// during aggregations, if possible
pub enable_topk_aggregation: bool, default = true
Expand Down
108 changes: 84 additions & 24 deletions datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,11 @@ mod sp_repartition_fuzz_tests {
use test_utils::add_empty_batches;

use datafusion_physical_expr_common::sort_expr::LexOrdering;
use datafusion_physical_plan::memory::MemorySourceConfig;
use datafusion_physical_plan::source::DataSourceExec;
use datafusion_physical_plan::{
memory::MemorySourceConfig,
repartition::on_demand_repartition::OnDemandRepartitionExec,
};
use itertools::izip;
use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};

Expand Down Expand Up @@ -296,25 +299,40 @@ mod sp_repartition_fuzz_tests {
// behaviour. We can choose, n_distinct as we like. However,
// we chose it a large number to decrease probability of having same rows in the table.
let n_distinct = 1_000_000;
for (is_first_roundrobin, is_first_sort_preserving) in
[(false, false), (false, true), (true, false), (true, true)]
{
for is_second_roundrobin in [false, true] {
let mut handles = Vec::new();

for seed in seed_start..seed_end {
#[allow(clippy::disallowed_methods)] // spawn allowed only in tests
let job = tokio::spawn(run_sort_preserving_repartition_test(
make_staggered_batches::<true>(n_row, n_distinct, seed as u64),
is_first_roundrobin,
is_first_sort_preserving,
is_second_roundrobin,
));
handles.push(job);
}

for job in handles {
job.await.unwrap();
for use_on_demand_repartition in [false, true] {
for (is_first_roundrobin, is_first_sort_preserving) in
[(false, false), (false, true), (true, false), (true, true)]
{
for is_second_roundrobin in [false, true] {
// On demand repartition only replaces the round robin repartition
if use_on_demand_repartition
&& !is_first_roundrobin
&& !is_second_roundrobin
{
continue;
}
let mut handles = Vec::new();

for seed in seed_start..seed_end {
#[allow(clippy::disallowed_methods)]
// spawn allowed only in tests
let job = tokio::spawn(run_sort_preserving_repartition_test(
make_staggered_batches::<true>(
n_row,
n_distinct,
seed as u64,
),
is_first_roundrobin,
is_first_sort_preserving,
is_second_roundrobin,
use_on_demand_repartition,
));
handles.push(job);
}

for job in handles {
job.await.unwrap();
}
}
}
}
Expand Down Expand Up @@ -343,9 +361,17 @@ mod sp_repartition_fuzz_tests {
// If `true`, second repartition executor after `DataSourceExec` will be in `RoundRobin` mode
// else it will be in `Hash` mode
is_second_roundrobin: bool,
// If `true`, `OnDemandRepartitionExec` will be used instead of `RepartitionExec`
use_on_demand_repartition: bool,
) {
let schema = input1[0].schema();
let session_config = SessionConfig::new().with_batch_size(50);
let mut session_config = SessionConfig::new().with_batch_size(50);
if use_on_demand_repartition {
session_config
.options_mut()
.optimizer
.prefer_round_robin_repartition = false;
}
let ctx = SessionContext::new_with_config(session_config);
let mut sort_keys = LexOrdering::default();
for ordering_col in ["a", "b", "c"] {
Expand All @@ -367,16 +393,32 @@ mod sp_repartition_fuzz_tests {
let hash_exprs = vec![col("c", &schema).unwrap()];

let intermediate = match (is_first_roundrobin, is_first_sort_preserving) {
(true, true) => sort_preserving_repartition_exec_round_robin(running_source),
(true, false) => repartition_exec_round_robin(running_source),
(true, true) => {
if use_on_demand_repartition {
sort_preserving_repartition_exec_on_demand(running_source)
} else {
sort_preserving_repartition_exec_round_robin(running_source)
}
}
(true, false) => {
if use_on_demand_repartition {
repartition_exec_on_demand(running_source)
} else {
repartition_exec_round_robin(running_source)
}
}
(false, true) => {
sort_preserving_repartition_exec_hash(running_source, hash_exprs.clone())
}
(false, false) => repartition_exec_hash(running_source, hash_exprs.clone()),
};

let intermediate = if is_second_roundrobin {
sort_preserving_repartition_exec_round_robin(intermediate)
if use_on_demand_repartition {
sort_preserving_repartition_exec_on_demand(intermediate)
} else {
sort_preserving_repartition_exec_round_robin(intermediate)
}
} else {
sort_preserving_repartition_exec_hash(intermediate, hash_exprs.clone())
};
Expand All @@ -399,6 +441,16 @@ mod sp_repartition_fuzz_tests {
)
}

/// Wraps `input` in an `OnDemandRepartitionExec` with two output partitions,
/// configured to preserve the input's sort order across the repartition.
fn sort_preserving_repartition_exec_on_demand(
    input: Arc<dyn ExecutionPlan>,
) -> Arc<dyn ExecutionPlan> {
    let repartition = OnDemandRepartitionExec::try_new(input, Partitioning::OnDemand(2))
        .unwrap()
        .with_preserve_order();
    Arc::new(repartition)
}

fn repartition_exec_round_robin(
input: Arc<dyn ExecutionPlan>,
) -> Arc<dyn ExecutionPlan> {
Expand All @@ -407,6 +459,14 @@ mod sp_repartition_fuzz_tests {
)
}

/// Wraps `input` in an `OnDemandRepartitionExec` with two output partitions
/// (order is not preserved across the repartition).
fn repartition_exec_on_demand(
    input: Arc<dyn ExecutionPlan>,
) -> Arc<dyn ExecutionPlan> {
    let partitioning = Partitioning::OnDemand(2);
    Arc::new(OnDemandRepartitionExec::try_new(input, partitioning).unwrap())
}

fn sort_preserving_repartition_exec_hash(
input: Arc<dyn ExecutionPlan>,
hash_expr: Vec<Arc<dyn PhysicalExpr>>,
Expand Down
Loading