From fe244f2754bd5e37d0a380aedae4bdbbdee78906 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 25 Jan 2025 02:05:08 +0000 Subject: [PATCH 1/5] Adding script and documentation for regenerating sqlite test files. --- datafusion/sqllogictest/README.md | 7 + .../sqllogictest/regenerate_sqlite_files.sh | 179 ++++++++++++++++++ docs/source/contributor-guide/testing.md | 2 + 3 files changed, 188 insertions(+) create mode 100755 datafusion/sqllogictest/regenerate_sqlite_files.sh diff --git a/datafusion/sqllogictest/README.md b/datafusion/sqllogictest/README.md index 4a7dc09d7dd1..2b4391bb95a4 100644 --- a/datafusion/sqllogictest/README.md +++ b/datafusion/sqllogictest/README.md @@ -250,6 +250,13 @@ database engine. The output is a full script that is a copy of the prototype scr You can update the tests / generate expected output by passing the `--complete` argument. +To regenerate and complete the sqlite test suite's files in datafusion-testing/data/sqlite/ please refer to the +'./regenerate_sqlite_files.sh' file. + +*WARNING*: The regenerate_sqlite_files.sh is experimental and should be understood and run with an abundance of caution. +When run the script will clone a remote repository locally, replace the location of a dependency with a custom git +version, will replace an existing .rs file with one from a github gist and will run various commands locally. + ```shell # Update ddl.slt with output from running cargo test --test sqllogictests -- ddl --complete diff --git a/datafusion/sqllogictest/regenerate_sqlite_files.sh b/datafusion/sqllogictest/regenerate_sqlite_files.sh new file mode 100755 index 000000000000..a88c8efe40c1 --- /dev/null +++ b/datafusion/sqllogictest/regenerate_sqlite_files.sh @@ -0,0 +1,179 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +echo "This script is experimental! Please read the following completely to understand +what this script does before running it. + +This script is designed to regenerate the .slt files in datafusion-testing/data/sqlite/ +from source files obtained from a git repository. To do that the following steps are +performed: + +- Verify required commands are installed +- Clone the remote git repository into /tmp/sqlitetesting +- Delete all existing .slt files in datafusion-testing/data/sqlite/ folder +- Copy the .test files from the cloned git repo into datafusion-testing +- Remove a few directories and files from the copied files (not relevant to DataFusion) +- Rename the .test files to .slt and cleanses the files. 
Cleansing involves: + - dos2unix + - removing all references to mysql, mssql and postgresql + - adds in a new 'control resultmode valuewise' statement at the top of all files + - updates the files to change 'AS REAL' to 'AS FLOAT8' +- Replace the sqllogictest-rs dependency in the Cargo.toml with a version to + a git repository that has been custom written to properly complete the files + with comparison of datafusion results with postgresql +- Replace the sqllogictest.rs file with a customized version from a github gist + that will work with the customized sqllogictest-rs dependency +- Run the sqlite test with completion (takes > 1 hr) +- Update a few results to ignore known issues +- Run sqlite test to verify results +- Perform cleanup to revert changes to the Cargo.toml file/sqllogictest.rs file +- Delete backup files and the /tmp/sqlitetesting directory +" +read -r -p "Do you understand and accept the risk? (yes/no): " acknowledgement + +if [ "${acknowledgement,,}" != "yes" ]; then + exit 0 +else + echo "Ok, Proceeding..." +fi + +if [ ! -x "$(command -v sd)" ]; then + echo "This script required 'sd' to be installed. Install via 'cargo install sd' or using your local package manager" + exit 0 +else + echo "sd command is installed, proceeding..." +fi + +DF_HOME=$(realpath "../../") + +# location where we'll clone sql-logic-test repos +if [ ! -d "/tmp/sqlitetesting" ]; then + mkdir /tmp/sqlitetesting +fi + +if [ ! 
-d "/tmp/sqlitetesting/sql-logic-test" ]; then + echo "Cloning sql-logic-test into /tmp/sqlitetesting/" + cd /tmp/sqlitetesting/ || exit; + git clone https://github.com/hydromatic/sql-logic-test.git +fi + +echo "Removing all existing .slt files from datafusion-testing/data/sqlite/ directory" + +cd "$DF_HOME/datafusion-testing/data/sqlite/" || exit; +find ./ -type f -name "*.slt" -exec rm {} \; + +echo "Copying .test files from sql-logic-test to datafusion-testing/data/sqlite/" + +cp -r /tmp/sqlitetesting/sql-logic-test/src/main/resources/test/* ./ + +echo "Removing 'evidence/*' and 'index/delete/*' directories from datafusion-testing" + +find ./evidence/ -type f -name "*.test" -exec rm {} \; +rm -rf ./index/delete/1 +rm -rf ./index/delete/10 +rm -rf ./index/delete/100 +rm -rf ./index/delete/1000 +rm -rf ./index/delete/10000 +# this file is empty and causes the sqllogictest-rs code to fail +rm ./index/view/10/slt_good_0.test + +echo "Renaming .test files to .slt and cleansing the files ..." 
+ +# add hash-threshold lines into these 3 files as they were missing +sed -i '1s/^/hash-threshold 8\n\n/' ./select1.test +sed -i '1s/^/hash-threshold 8\n\n/' ./select4.test +sed -i '1s/^/hash-threshold 8\n\n/' ./select5.test +# rename +find ./ -type f -name "*.test" -exec rename -f 's/\.test/\.slt/' {} \; +# gotta love windows :/ +find ./ -type f -name "*.slt" -exec dos2unix --quiet {} \; +# add in control resultmode +find ./ -type f -name "*.slt" -exec sd -f i 'hash-threshold 8\n' 'hash-threshold 8\ncontrol resultmode valuewise\n' {} \; +# remove mysql tests and skipif lines +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mysql.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mysql.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mysql.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mysql.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mysql.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mysql.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mysql.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mysql.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mysql.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mysql.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'skipif mysql.+\n' '' {} \; +# remove postgres skipif +find ./ -type f -name "*.slt" -exec sd -f i 'skipif postgresql(.+)\n' '' {} \; +# remove mssql tests +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mssql.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif 
mssql.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mssql.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mssql.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mssql.+\n.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mssql.+\n.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mssql.+\n.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mssql.+\n.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mssql.+\n.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'onlyif mssql.+\n.+\n.+\n\n' '' {} \; +find ./ -type f -name "*.slt" -exec sd -f i 'skipif mssql # not compatible\n' '' {} \; +# change REAL datatype to FLOAT8 +find ./ -type f -name "*.slt" -exec sd -f i 'AS REAL' 'AS FLOAT8' {} \; + +echo "Updating the datafusion/sqllogictest/Cargo.toml file with an updated sqllogictest dependency" + +# update the sqllogictest Cargo.toml with the new repo for sqllogictest-rs (tied to a specific hash) +cd "$DF_HOME" || exit; +sed -i -e 's~^sqllogictest.*~sqllogictest = { git = "https://github.com/Omega359/sqllogictest-rs.git", rev = "1cd933d" }~' datafusion/sqllogictest/Cargo.toml + +echo "Replacing the datafusion/sqllogictest/bin/sqllogictests.rs file with a custom version required for running completion" + +# replace the sqllogictest.rs with a customized version. This is tied to a specific version of the gist +curl --silent https://gist.githubusercontent.com/Omega359/5e6548a096fbe0c36ce14c547776db56/raw/3ccd0ee8049657c496d5068f56baa8408c1f00a3/sqllogictest.rs > datafusion/sqllogictest/bin/sqllogictests.rs + +echo "Running the sqllogictests with sqlite completion. 
This will take approximately an hour to run" + +cargo test --profile release-nonlto --features postgres --test sqllogictests -- --include-sqlite --postgres-runner --complete + +if [ $? -eq 0 ]; then + echo "Applying patches for #13784 (https://github.com/apache/datafusion/issues/13784)" + + sd -f i 'query I rowsort label-2475\n' '# Datafusion - #13784 - https://github.com/apache/datafusion/issues/13784\nskipif Datafusion\nquery I rowsort label-2475\n' datafusion-testing/data/sqlite/random/aggregates/slt_good_102.slt + sd -f i 'query I rowsort label-3738\n' '# Datafusion - #13784 - https://github.com/apache/datafusion/issues/13784\nskipif Datafusion\nquery I rowsort label-3738\n' datafusion-testing/data/sqlite/random/aggregates/slt_good_112.slt + + echo "Running the sqllogictests with sqlite files. This will take approximately 20 minutes to run" + + cargo test --profile release-nonlto --test sqllogictests -- --include-sqlite + + if [ $? -eq 0 ]; then + echo "Sqlite tests completed successfully. The datafusion-testing git submodule is ready to be pushed to a new remote and a PR created in https://github.com/apache/datafusion-testing/" + + else + echo "Completion of sqlite test files failed. Please correct the issues in the .slt files and run the test again using the command 'cargo test --profile release-nonlto --test sqllogictests -- --include-sqlite'" + fi +else + echo "Completion of sqlite test files failed!" 
+fi + +echo "Cleaning up source code changes and temporary files and directories" + +cd "$DF_HOME" || exit; +find ./datafusion-testing/data/sqlite/ -type f -name "*.bak" -exec rm {} \; +git checkout datafusion/sqllogictest/Cargo.toml +git checkout datafusion/sqllogictest/bin/sqllogictests.rs +rm -rf /tmp/sqlitetesting diff --git a/docs/source/contributor-guide/testing.md b/docs/source/contributor-guide/testing.md index b955b09050b3..3da0e0fe594d 100644 --- a/docs/source/contributor-guide/testing.md +++ b/docs/source/contributor-guide/testing.md @@ -56,6 +56,8 @@ DataFusion's SQL implementation is tested using [sqllogictest](https://github.co Like similar systems such as [DuckDB](https://duckdb.org/dev/testing), DataFusion has chosen to trade off a slightly higher barrier to contribution for longer term maintainability. +DataFusion has integrated [sqlite's test suite](https://sqlite.org/sqllogictest/doc/trunk/about.wiki) as a supplemental test suite that is run whenever a PR is merged into DataFusion. To run it manually please refer to the [README](https://github.com/apache/datafusion/blob/main/datafusion/sqllogictest/README.md#running-tests-sqlite) file for instructions. + ## Rust Integration Tests There are several tests of the public interface of the DataFusion library in the [tests](https://github.com/apache/datafusion/tree/main/datafusion/core/tests) directory. From 70a14d85d42c93011dafb1e5d02b650f57a60df9 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 25 Jan 2025 02:12:46 +0000 Subject: [PATCH 2/5] Prettier. --- datafusion/sqllogictest/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/README.md b/datafusion/sqllogictest/README.md index 2b4391bb95a4..257937a65e4b 100644 --- a/datafusion/sqllogictest/README.md +++ b/datafusion/sqllogictest/README.md @@ -250,12 +250,12 @@ database engine. 
The output is a full script that is a copy of the prototype scr You can update the tests / generate expected output by passing the `--complete` argument. -To regenerate and complete the sqlite test suite's files in datafusion-testing/data/sqlite/ please refer to the -'./regenerate_sqlite_files.sh' file. +To regenerate and complete the sqlite test suite's files in datafusion-testing/data/sqlite/ please refer to the +'./regenerate_sqlite_files.sh' file. -*WARNING*: The regenerate_sqlite_files.sh is experimental and should be understood and run with an abundance of caution. -When run the script will clone a remote repository locally, replace the location of a dependency with a custom git -version, will replace an existing .rs file with one from a github gist and will run various commands locally. +_WARNING_: The regenerate_sqlite_files.sh is experimental and should be understood and run with an abundance of caution. +When run the script will clone a remote repository locally, replace the location of a dependency with a custom git +version, will replace an existing .rs file with one from a github gist and will run various commands locally. ```shell # Update ddl.slt with output from running From ffaf4dd7e36847decb9a45d68c835ce19094652d Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Fri, 31 Jan 2025 20:47:06 +0000 Subject: [PATCH 3/5] Script updates, include customized sqllogictests.rs file for regeneration. 
--- .../sqllogictest/regenerate/sqllogictests.rs | 730 ++++++++++++++++++ .../sqllogictest/regenerate_sqlite_files.sh | 39 +- 2 files changed, 761 insertions(+), 8 deletions(-) create mode 100644 datafusion/sqllogictest/regenerate/sqllogictests.rs diff --git a/datafusion/sqllogictest/regenerate/sqllogictests.rs b/datafusion/sqllogictest/regenerate/sqllogictests.rs new file mode 100644 index 000000000000..a2706558814f --- /dev/null +++ b/datafusion/sqllogictest/regenerate/sqllogictests.rs @@ -0,0 +1,730 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use clap::Parser; +use datafusion_common::instant::Instant; +use datafusion_common::utils::get_available_parallelism; +use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError, Result}; +use datafusion_common_runtime::SpawnedTask; +use datafusion_sqllogictest::{DataFusion, TestContext}; +use futures::stream::StreamExt; +use indicatif::{ + HumanDuration, MultiProgress, ProgressBar, ProgressDrawTarget, ProgressStyle, +}; +use itertools::Itertools; +use log::Level::{Info, Warn}; +use log::{info, log_enabled, warn}; +use sqllogictest::{ + parse_file, strict_column_validator, AsyncDB, Condition, Normalizer, Record, + Validator, +}; + +#[cfg(feature = "postgres")] +use crate::postgres_container::{ + initialize_postgres_container, terminate_postgres_container, +}; +use std::ffi::OsStr; +use std::fs; +use std::path::{Path, PathBuf}; + +#[cfg(feature = "postgres")] +mod postgres_container; + +const TEST_DIRECTORY: &str = "test_files/"; +const DATAFUSION_TESTING_TEST_DIRECTORY: &str = "../../datafusion-testing/data/"; +const PG_COMPAT_FILE_PREFIX: &str = "pg_compat_"; +const SQLITE_PREFIX: &str = "sqlite"; + +pub fn main() -> Result<()> { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()? + .block_on(run_tests()) +} + +// Trailing whitespace from lines in SLT will typically be removed, but do not fail if it is not +// If particular test wants to cover trailing whitespace on a value, +// it should project additional non-whitespace column on the right. 
+#[allow(clippy::ptr_arg)] +fn value_normalizer(s: &String) -> String { + s.trim_end().to_string() +} + +fn sqlite_value_validator( + normalizer: Normalizer, + actual: &[Vec], + expected: &[String], +) -> bool { + let normalized_expected = expected.iter().map(normalizer).collect::>(); + let normalized_actual = actual + .iter() + .map(|strs| strs.iter().map(normalizer).join(" ")) + .collect_vec(); + + if log_enabled!(Info) && normalized_actual != normalized_expected { + info!("sqlite validation failed. actual vs expected:"); + for i in 0..normalized_actual.len() { + info!("[{i}] {}", normalized_actual[i]); + info!( + "[{i}] {}", + if normalized_expected.len() >= i { + &normalized_expected[i] + } else { + "No more results" + } + ); + } + } + + normalized_actual == normalized_expected +} + +fn df_value_validator( + normalizer: Normalizer, + actual: &[Vec], + expected: &[String], +) -> bool { + let normalized_expected = expected.iter().map(normalizer).collect::>(); + let normalized_actual = actual + .iter() + .map(|strs| strs.iter().join(" ")) + .map(|str| str.trim_end().to_string()) + .collect_vec(); + + if log_enabled!(Warn) && normalized_actual != normalized_expected { + warn!("df validation failed. 
actual vs expected:"); + for i in 0..normalized_actual.len() { + warn!("[{i}] {}", normalized_actual[i]); + warn!( + "[{i}] {}", + if normalized_expected.len() >= i { + &normalized_expected[i] + } else { + "No more results" + } + ); + } + } + + normalized_actual == normalized_expected +} + +/// Sets up an empty directory at test_files/scratch/ +/// creating it if needed and clearing any file contents if it exists +/// This allows tests for inserting to external tables or copy to +/// persist data to disk and have consistent state when running +/// a new test +fn setup_scratch_dir(name: &Path) -> Result<()> { + // go from copy.slt --> copy + let file_stem = name.file_stem().expect("File should have a stem"); + let path = PathBuf::from("test_files").join("scratch").join(file_stem); + + info!("Creating scratch dir in {path:?}"); + if path.exists() { + fs::remove_dir_all(&path)?; + } + fs::create_dir_all(&path)?; + Ok(()) +} + +async fn run_tests() -> Result<()> { + // Enable logging (e.g. set RUST_LOG=debug to see debug logs) + env_logger::init(); + + let options: Options = Parser::parse(); + if options.list { + // nextest parses stdout, so print messages to stderr + eprintln!("NOTICE: --list option unsupported, quitting"); + // return Ok, not error so that tools like nextest which are listing all + // workspace tests (by running `cargo test ... --list --format terse`) + // do not fail when they encounter this binary. Instead, print nothing + // to stdout and return OK so they can continue listing other tests. 
+ return Ok(()); + } + options.warn_on_ignored(); + + #[cfg(feature = "postgres")] + initialize_postgres_container(&options).await?; + + // Run all tests in parallel, reporting failures at the end + // + // Doing so is safe because each slt file runs with its own + // `SessionContext` and should not have side effects (like + // modifying shared state like `/tmp/`) + let m = MultiProgress::with_draw_target(ProgressDrawTarget::stderr_with_hz(1)); + let m_style = ProgressStyle::with_template( + "[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}", + ) + .unwrap() + .progress_chars("##-"); + + let start = Instant::now(); + + let errors: Vec<_> = futures::stream::iter(read_test_files(&options)?) + .map(|test_file| { + let validator = if options.include_sqlite + && test_file.relative_path.starts_with(SQLITE_PREFIX) + { + sqlite_value_validator + } else { + df_value_validator + }; + + let m_clone = m.clone(); + let m_style_clone = m_style.clone(); + + SpawnedTask::spawn(async move { + match (options.postgres_runner, options.complete) { + (false, false) => { + run_test_file(test_file, validator, m_clone, m_style_clone) + .await? + } + (false, true) => { + run_complete_file(test_file, validator, m_clone, m_style_clone) + .await? + } + (true, false) => { + run_test_file_with_postgres( + test_file, + validator, + m_clone, + m_style_clone, + ) + .await? + } + (true, true) => { + run_complete_file_with_postgres( + test_file, + validator, + m_clone, + m_style_clone, + ) + .await? 
+ } + } + Ok(()) as Result<()> + }) + .join() + }) + // run up to num_cpus streams in parallel + .buffer_unordered(get_available_parallelism()) + .flat_map(|result| { + // Filter out any Ok() leaving only the DataFusionErrors + futures::stream::iter(match result { + // Tokio panic error + Err(e) => Some(DataFusionError::External(Box::new(e))), + Ok(thread_result) => match thread_result { + // Test run error + Err(e) => Some(e), + // success + Ok(_) => None, + }, + }) + }) + .collect() + .await; + + m.println(format!("Completed in {}", HumanDuration(start.elapsed())))?; + + #[cfg(feature = "postgres")] + terminate_postgres_container().await?; + + // report on any errors + if !errors.is_empty() { + for e in &errors { + println!("{e}"); + } + exec_err!("{} failures", errors.len()) + } else { + Ok(()) + } +} + +async fn run_test_file( + test_file: TestFile, + validator: Validator, + mp: MultiProgress, + mp_style: ProgressStyle, +) -> Result<()> { + let TestFile { + path, + relative_path, + } = test_file; + let Some(test_ctx) = TestContext::try_new_for_test_file(&relative_path).await else { + info!("Skipping: {}", path.display()); + return Ok(()); + }; + setup_scratch_dir(&relative_path)?; + + let count: u64 = get_record_count(&path, "Datafusion".to_string()); + let pb = mp.add(ProgressBar::new(count)); + + pb.set_style(mp_style); + pb.set_message(format!("{:?}", &relative_path)); + + let mut runner = sqllogictest::Runner::new(|| async { + Ok(DataFusion::new( + test_ctx.session_ctx().clone(), + relative_path.clone(), + pb.clone(), + )) + }); + runner.add_label("Datafusion"); + runner.with_column_validator(strict_column_validator); + runner.with_normalizer(value_normalizer); + runner.with_validator(validator); + + let res = runner + .run_file_async(path) + .await + .map_err(|e| DataFusionError::External(Box::new(e))); + + pb.finish_and_clear(); + + res +} + +fn get_record_count(path: &PathBuf, label: String) -> u64 { + let records: Vec::ColumnType>> = + 
parse_file(path).unwrap(); + let mut count: u64 = 0; + + records.iter().for_each(|rec| match rec { + Record::Query { conditions, .. } => { + if conditions.is_empty() + || !conditions.contains(&Condition::SkipIf { + label: label.clone(), + }) + || conditions.contains(&Condition::OnlyIf { + label: label.clone(), + }) + { + count += 1; + } + } + Record::Statement { conditions, .. } => { + if conditions.is_empty() + || !conditions.contains(&Condition::SkipIf { + label: label.clone(), + }) + || conditions.contains(&Condition::OnlyIf { + label: label.clone(), + }) + { + count += 1; + } + } + _ => {} + }); + + count +} + +#[cfg(feature = "postgres")] +async fn run_test_file_with_postgres( + test_file: TestFile, + validator: Validator, + mp: MultiProgress, + mp_style: ProgressStyle, +) -> Result<()> { + use datafusion_sqllogictest::Postgres; + let TestFile { + path, + relative_path, + } = test_file; + setup_scratch_dir(&relative_path)?; + + let count: u64 = get_record_count(&path, "postgresql".to_string()); + let pb = mp.add(ProgressBar::new(count)); + + pb.set_style(mp_style); + pb.set_message(format!("{:?}", &relative_path)); + + let mut runner = sqllogictest::Runner::new(|| { + Postgres::connect(relative_path.clone(), pb.clone()) + }); + runner.add_label("postgres"); + runner.with_column_validator(strict_column_validator); + runner.with_normalizer(value_normalizer); + runner.with_validator(validator); + runner + .run_file_async(path) + .await + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + pb.finish_and_clear(); + + Ok(()) +} + +#[cfg(not(feature = "postgres"))] +async fn run_test_file_with_postgres( + _test_file: TestFile, + _validator: Validator, + _mp: MultiProgress, + _mp_style: ProgressStyle, +) -> Result<()> { + use datafusion_common::plan_err; + plan_err!("Can not run with postgres as postgres feature is not enabled") +} + +async fn run_complete_file( + test_file: TestFile, + _validator: Validator, + mp: MultiProgress, + mp_style: ProgressStyle, +) 
-> Result<()> { + let TestFile { + path, + relative_path, + } = test_file; + + info!("Using complete mode to complete: {}", path.display()); + + let Some(_test_ctx) = TestContext::try_new_for_test_file(&relative_path).await else { + info!("Skipping: {}", path.display()); + return Ok(()); + }; + setup_scratch_dir(&relative_path)?; + + let count: u64 = get_record_count(&path, "Datafusion".to_string()); + let pb = mp.add(ProgressBar::new(count)); + + pb.set_style(mp_style); + pb.set_message(format!("{:?}", &relative_path)); + + // let mut runner = sqllogictest::Runner::new(|| async { + // Ok(DataFusion::new( + // test_ctx.session_ctx().clone(), + // relative_path.clone(), + // pb.clone(), + // )) + // }); + // + // let col_separator = " "; + // let res = runner + // .update_test_file( + // path, + // col_separator, + // validator, + // value_normalizer, + // strict_column_validator, + // ) + // .await + // // Can't use e directly because it isn't marked Send, so turn it into a string. + // .map_err(|e| { + // DataFusionError::Execution(format!("Error completing {relative_path:?}: {e}")) + // }); + + pb.finish_and_clear(); + + // res + Ok(()) +} + +#[cfg(feature = "postgres")] +async fn run_complete_file_with_postgres( + test_file: TestFile, + validator: Validator, + mp: MultiProgress, + mp_style: ProgressStyle, +) -> Result<()> { + use datafusion_sqllogictest::Postgres; + let TestFile { + path, + relative_path, + } = test_file; + info!( + "Using complete mode to complete with Postgres runner: {}", + path.display() + ); + setup_scratch_dir(&relative_path)?; + + let count: u64 = get_record_count(&path, "postgresql".to_string()); + let pb = mp.add(ProgressBar::new(count)); + + pb.set_style(mp_style); + pb.set_message(format!("{:?}", &relative_path)); + + let Some(test_ctx) = TestContext::try_new_for_test_file(&relative_path).await else { + info!("Skipping: {}", path.display()); + return Ok(()); + }; + let mut runner = sqllogictest::Runner::new(|| async { + 
Ok(DataFusion::new( + test_ctx.session_ctx().clone(), + relative_path.clone(), + pb.clone(), + )) + }); + runner.add_label("Datafusion"); + runner.with_column_validator(strict_column_validator); + runner.with_normalizer(value_normalizer); + runner.with_validator(validator); + + let mut pg = sqllogictest::Runner::new(|| { + Postgres::connect(relative_path.clone(), pb.clone()) + }); + pg.add_label("postgres"); + pg.with_column_validator(strict_column_validator); + pg.with_normalizer(value_normalizer); + pg.with_validator(validator); + + let col_separator = " "; + let res = runner + .update_test_file( + path, + col_separator, + validator, + value_normalizer, + strict_column_validator, + Some(pg), + ) + .await + // Can't use e directly because it isn't marked Send, so turn it into a string. + .map_err(|e| { + DataFusionError::Execution(format!("Error completing {relative_path:?}: {e}")) + }); + + pb.finish_and_clear(); + + res +} + +#[cfg(not(feature = "postgres"))] +async fn run_complete_file_with_postgres( + _test_file: TestFile, + _validator: Validator, + _mp: MultiProgress, + _mp_style: ProgressStyle, +) -> Result<()> { + use datafusion_common::plan_err; + plan_err!("Can not run with postgres as postgres feature is not enabled") +} + +/// Represents a parsed test file +#[derive(Debug)] +struct TestFile { + /// The absolute path to the file + pub path: PathBuf, + /// The relative path of the file (used for display) + pub relative_path: PathBuf, +} + +impl TestFile { + fn new(path: PathBuf) -> Self { + let p = path.to_string_lossy(); + let relative_path = PathBuf::from(if p.starts_with(TEST_DIRECTORY) { + p.strip_prefix(TEST_DIRECTORY).unwrap() + } else if p.starts_with(DATAFUSION_TESTING_TEST_DIRECTORY) { + p.strip_prefix(DATAFUSION_TESTING_TEST_DIRECTORY).unwrap() + } else { + "" + }); + + Self { + path, + relative_path, + } + } + + fn is_slt_file(&self) -> bool { + self.path.extension() == Some(OsStr::new("slt")) + } + + fn check_sqlite(&self, options: &Options) 
-> bool { + if !self.relative_path.starts_with(SQLITE_PREFIX) { + return true; + } + + options.include_sqlite + } + + fn check_tpch(&self, options: &Options) -> bool { + if !self.relative_path.starts_with("tpch") { + return true; + } + + options.include_tpch + } +} + +fn read_test_files<'a>( + options: &'a Options, +) -> Result + 'a>> { + let mut paths = read_dir_recursive(TEST_DIRECTORY)? + .into_iter() + .map(TestFile::new) + .filter(|f| options.check_test_file(&f.relative_path)) + .filter(|f| f.is_slt_file()) + .filter(|f| f.check_tpch(options)) + .filter(|f| f.check_sqlite(options)) + .filter(|f| options.check_pg_compat_file(f.path.as_path())) + .collect::>(); + if options.include_sqlite { + let mut sqlite_paths = read_dir_recursive(DATAFUSION_TESTING_TEST_DIRECTORY)? + .into_iter() + .map(TestFile::new) + .filter(|f| options.check_test_file(&f.relative_path)) + .filter(|f| f.is_slt_file()) + .filter(|f| f.check_sqlite(options)) + .filter(|f| options.check_pg_compat_file(f.path.as_path())) + .collect::>(); + + paths.append(&mut sqlite_paths) + } + + Ok(Box::new(paths.into_iter())) +} + +fn read_dir_recursive>(path: P) -> Result> { + let mut dst = vec![]; + read_dir_recursive_impl(&mut dst, path.as_ref())?; + Ok(dst) +} + +/// Append all paths recursively to dst +fn read_dir_recursive_impl(dst: &mut Vec, path: &Path) -> Result<()> { + let entries = fs::read_dir(path) + .map_err(|e| exec_datafusion_err!("Error reading directory {path:?}: {e}"))?; + for entry in entries { + let path = entry + .map_err(|e| { + exec_datafusion_err!("Error reading entry in directory {path:?}: {e}") + })? 
+ .path(); + + if path.is_dir() { + read_dir_recursive_impl(dst, &path)?; + } else { + dst.push(path); + } + } + + Ok(()) +} + +/// Parsed command line options +/// +/// This structure attempts to mimic the command line options of the built-in rust test runner +/// accepted by IDEs such as CLion that pass arguments +/// +/// See for more details +#[derive(Parser, Debug)] +#[clap(author, version, about, long_about= None)] +struct Options { + #[clap(long, help = "Auto complete mode to fill out expected results")] + complete: bool, + + #[clap( + long, + env = "PG_COMPAT", + help = "Run Postgres compatibility tests with Postgres runner" + )] + postgres_runner: bool, + + #[clap(long, env = "INCLUDE_SQLITE", help = "Include sqlite files")] + include_sqlite: bool, + + #[clap(long, env = "INCLUDE_TPCH", help = "Include tpch files")] + include_tpch: bool, + + #[clap( + action, + help = "regex like arguments passed to the program which are treated as cargo test filter (substring match on filenames)" + )] + filters: Vec, + + #[clap( + long, + help = "IGNORED (for compatibility with built in rust test runner)" + )] + format: Option, + + #[clap( + short = 'Z', + long, + help = "IGNORED (for compatibility with built in rust test runner)" + )] + z_options: Option, + + #[clap( + long, + help = "IGNORED (for compatibility with built in rust test runner)" + )] + show_output: bool, + + #[clap( + long, + help = "Quits immediately, not listing anything (for compatibility with built-in rust test runner)" + )] + list: bool, + + #[clap( + long, + help = "IGNORED (for compatibility with built-in rust test runner)" + )] + ignored: bool, +} + +impl Options { + /// Because this test can be run as a cargo test, commands like + /// + /// ```shell + /// cargo test foo + /// ``` + /// + /// Will end up passing `foo` as a command line argument. + /// + /// To be compatible with this, treat the command line arguments as a + /// filter and that does a substring match on each input. 
returns
    /// true if this path should be run
    fn check_test_file(&self, relative_path: &Path) -> bool {
        if self.filters.is_empty() {
            return true;
        }

        // otherwise check if any filter matches
        self.filters
            .iter()
            .any(|filter| relative_path.to_string_lossy().contains(filter))
    }

    /// Postgres runner executes only tests in files with specific names or in
    /// specific folders
    fn check_pg_compat_file(&self, path: &Path) -> bool {
        let file_name = path.file_name().unwrap().to_str().unwrap().to_string();
        !self.postgres_runner
            || file_name.starts_with(PG_COMPAT_FILE_PREFIX)
            || (self.include_sqlite && path.to_string_lossy().contains(SQLITE_PREFIX))
    }

    /// Logs warning messages to stderr if any ignored options are passed
    fn warn_on_ignored(&self) {
        if self.format.is_some() {
            eprintln!("WARNING: Ignoring `--format` compatibility option");
        }

        if self.z_options.is_some() {
            eprintln!("WARNING: Ignoring `-Z` compatibility option");
        }

        if self.show_output {
            eprintln!("WARNING: Ignoring `--show-output` compatibility option");
        }
    }
}
diff --git a/datafusion/sqllogictest/regenerate_sqlite_files.sh b/datafusion/sqllogictest/regenerate_sqlite_files.sh
index a88c8efe40c1..25d65dbe03d5 100755
--- a/datafusion/sqllogictest/regenerate_sqlite_files.sh
+++ b/datafusion/sqllogictest/regenerate_sqlite_files.sh
@@ -25,7 +25,7 @@ This script is designed to regenerate the .slt files in datafusion-testing/data/
 from source files obtained from a git repository.
To do that the following steps are
performed:

-- Verify required commands are installed
+- Verify required commands are installed and the PG_URI environment variable is set
- Clone the remote git repository into /tmp/sqlitetesting
- Delete all existing .slt files in datafusion-testing/data/sqlite/ folder
- Copy the .test files from the cloned git repo into datafusion-testing
@@ -38,8 +38,8 @@ performed:
- Replace the sqllogictest-rs dependency in the Cargo.toml with a version to a
git repository that has been custom written to properly complete the files
with comparison of datafusion results with postgresql
-- Replace the sqllogictest.rs file with a customized version from a github gist
-  that will work with the customized sqllogictest-rs dependency
+- Replace the sqllogictest.rs file with a customized version that will work with
+  the customized sqllogictest-rs dependency
- Run the sqlite test with completion (takes > 1 hr)
- Update a few results to ignore known issues
- Run sqlite test to verify results
@@ -61,7 +61,30 @@ else
echo "sd command is installed, proceeding..."
fi

-DF_HOME=$(realpath "../../")
+if [ ! -x "$(command -v rename)" ]; then
+ echo "This script requires 'rename' to be installed. Install using your local package manager"
+ exit 0
+else
+ echo "rename command is installed, proceeding..."
+fi
+
+if [ ! -x "$(command -v dos2unix)" ]; then
+ echo "This script requires 'dos2unix' to be installed. Install using your local package manager"
+ exit 0
+else
+ echo "dos2unix command is installed, proceeding..."
+fi
+
+if [ -z "${PG_URI}" ]; then
+ echo "A postgresql database is required for running the sqlite regeneration script.
+Please set the PG_URI environment variable to point to an empty postgresql database and retry."
+ exit 0
+else
+ echo "PG_URI was set, proceeding"
+fi
+
+SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DF_HOME="$SCRIPT_PATH/../../"

# location where we'll clone sql-logic-test repos
if [ !
-d "/tmp/sqlitetesting" ]; then @@ -97,9 +120,9 @@ rm ./index/view/10/slt_good_0.test echo "Renaming .test files to .slt and cleansing the files ..." # add hash-threshold lines into these 3 files as they were missing -sed -i '1s/^/hash-threshold 8\n\n/' ./select1.test -sed -i '1s/^/hash-threshold 8\n\n/' ./select4.test -sed -i '1s/^/hash-threshold 8\n\n/' ./select5.test +sed -i '1s/^/hash-threshold 8\n\n/' select1.test +sed -i '1s/^/hash-threshold 8\n\n/' select4.test +sed -i '1s/^/hash-threshold 8\n\n/' select5.test # rename find ./ -type f -name "*.test" -exec rename -f 's/\.test/\.slt/' {} \; # gotta love windows :/ @@ -144,7 +167,7 @@ sed -i -e 's~^sqllogictest.*~sqllogictest = { git = "https://github.com/Omega359 echo "Replacing the datafusion/sqllogictest/bin/sqllogictests.rs file with a custom version required for running completion" # replace the sqllogictest.rs with a customized version. This is tied to a specific version of the gist -curl --silent https://gist.githubusercontent.com/Omega359/5e6548a096fbe0c36ce14c547776db56/raw/3ccd0ee8049657c496d5068f56baa8408c1f00a3/sqllogictest.rs > datafusion/sqllogictest/bin/sqllogictests.rs +cp datafusion/sqllogictest/regenerate/sqllogictests.rs datafusion/sqllogictest/bin/sqllogictests.rs echo "Running the sqllogictests with sqlite completion. 
This will take approximately an hour to run" From e3796d5fe238aaee9477d7b39e75dd22d7f7b345 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Fri, 31 Jan 2025 20:47:47 +0000 Subject: [PATCH 4/5] Minor comment update --- datafusion/sqllogictest/regenerate_sqlite_files.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/regenerate_sqlite_files.sh b/datafusion/sqllogictest/regenerate_sqlite_files.sh index 25d65dbe03d5..76720eca90a0 100755 --- a/datafusion/sqllogictest/regenerate_sqlite_files.sh +++ b/datafusion/sqllogictest/regenerate_sqlite_files.sh @@ -166,7 +166,7 @@ sed -i -e 's~^sqllogictest.*~sqllogictest = { git = "https://github.com/Omega359 echo "Replacing the datafusion/sqllogictest/bin/sqllogictests.rs file with a custom version required for running completion" -# replace the sqllogictest.rs with a customized version. This is tied to a specific version of the gist +# replace the sqllogictest.rs with a customized version. cp datafusion/sqllogictest/regenerate/sqllogictests.rs datafusion/sqllogictest/bin/sqllogictests.rs echo "Running the sqllogictests with sqlite completion. This will take approximately an hour to run" From 17bf344c7120a2a382e4908b59bc6c6eba8e15e8 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Mon, 3 Feb 2025 21:34:35 +0000 Subject: [PATCH 5/5] Removed use of sed, fixed using host.docker.internal when starting up pg via docker, cleanup pg_compat .bak files. 
--- datafusion/sqllogictest/bin/postgres_container.rs | 4 ++-- datafusion/sqllogictest/regenerate_sqlite_files.sh | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/datafusion/sqllogictest/bin/postgres_container.rs b/datafusion/sqllogictest/bin/postgres_container.rs index ef61abc12af3..64905022914a 100644 --- a/datafusion/sqllogictest/bin/postgres_container.rs +++ b/datafusion/sqllogictest/bin/postgres_container.rs @@ -123,8 +123,8 @@ async fn start_postgres( .await .unwrap(); // uncomment this if you are running docker in docker - let host = "host.docker.internal".to_string(); - // let host = container.get_host().await.unwrap().to_string(); + // let host = "host.docker.internal".to_string(); + let host = container.get_host().await.unwrap().to_string(); let port = container.get_host_port_ipv4(5432).await.unwrap(); let mut rx = in_channel.rx.lock().await; diff --git a/datafusion/sqllogictest/regenerate_sqlite_files.sh b/datafusion/sqllogictest/regenerate_sqlite_files.sh index 76720eca90a0..0c1b26b1a9d3 100755 --- a/datafusion/sqllogictest/regenerate_sqlite_files.sh +++ b/datafusion/sqllogictest/regenerate_sqlite_files.sh @@ -43,7 +43,7 @@ performed: - Run the sqlite test with completion (takes > 1 hr) - Update a few results to ignore known issues - Run sqlite test to verify results -- Perform cleanup to revert changes to the Cargo.toml file/sqllogictest.rs file +- Perform cleanup to revert changes to the Cargo.toml & sqllogictest.rs files - Delete backup files and the /tmp/sqlitetesting directory " read -r -p "Do you understand and accept the risk? (yes/no): " acknowledgement @@ -120,9 +120,11 @@ rm ./index/view/10/slt_good_0.test echo "Renaming .test files to .slt and cleansing the files ..." 
# add hash-threshold lines into these 3 files as they were missing -sed -i '1s/^/hash-threshold 8\n\n/' select1.test -sed -i '1s/^/hash-threshold 8\n\n/' select4.test -sed -i '1s/^/hash-threshold 8\n\n/' select5.test +# skip using sed as gnu sed and mac sed are not friends + +echo -e "hash-threshold 8\n\n$(cat select1.test)" > select1.test +echo -e "hash-threshold 8\n\n$(cat select4.test)" > select4.test +echo -e "hash-threshold 8\n\n$(cat select5.test)" > select5.test # rename find ./ -type f -name "*.test" -exec rename -f 's/\.test/\.slt/' {} \; # gotta love windows :/ @@ -162,7 +164,7 @@ echo "Updating the datafusion/sqllogictest/Cargo.toml file with an updated sqllo # update the sqllogictest Cargo.toml with the new repo for sqllogictest-rs (tied to a specific hash) cd "$DF_HOME" || exit; -sed -i -e 's~^sqllogictest.*~sqllogictest = { git = "https://github.com/Omega359/sqllogictest-rs.git", rev = "1cd933d" }~' datafusion/sqllogictest/Cargo.toml +sd -f i '^sqllogictest.*' 'sqllogictest = { git = "https://github.com/Omega359/sqllogictest-rs.git", rev = "1cd933d" }' datafusion/sqllogictest/Cargo.toml echo "Replacing the datafusion/sqllogictest/bin/sqllogictests.rs file with a custom version required for running completion" @@ -197,6 +199,7 @@ echo "Cleaning up source code changes and temporary files and directories" cd "$DF_HOME" || exit; find ./datafusion-testing/data/sqlite/ -type f -name "*.bak" -exec rm {} \; +find ./datafusion/sqllogictest/test_files/pg_compat/ -type f -name "*.bak" -exec rm {} \; git checkout datafusion/sqllogictest/Cargo.toml git checkout datafusion/sqllogictest/bin/sqllogictests.rs rm -rf /tmp/sqlitetesting