From 41217a2783ed7eb06ed74128329ab95350ec9e65 Mon Sep 17 00:00:00 2001 From: Youngwb Date: Fri, 17 Jan 2025 13:19:02 +0800 Subject: [PATCH 1/5] [BugFix] parquet writer do not perform time zone adjustments when writing datetime types. Signed-off-by: Youngwb --- be/src/formats/parquet/level_builder.cpp | 3 +-- .../formats/parquet/parquet_file_writer.cpp | 2 +- .../R/test_iceberg_catalog_timestamp | 20 +++++++++++++++++-- .../T/test_iceberg_catalog_timestamp | 11 ++++++++-- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/be/src/formats/parquet/level_builder.cpp b/be/src/formats/parquet/level_builder.cpp index b7b82e992c5a7..85fc7a288d18e 100644 --- a/be/src/formats/parquet/level_builder.cpp +++ b/be/src/formats/parquet/level_builder.cpp @@ -357,8 +357,7 @@ Status LevelBuilder::_write_datetime_column_chunk(const LevelBuilderContext& ctx DeferOp defer([&] { delete[] values; }); for (size_t i = 0; i < col->size(); i++) { - // normalize to utc - auto timestamp = timestamp::sub(data_col[i]._timestamp, _offset); + auto timestamp = data_col[i]._timestamp; if constexpr (use_int96_timestamp_encoding) { auto date = reinterpret_cast(values[i].value + 2); auto nanosecond = reinterpret_cast(values[i].value); diff --git a/be/src/formats/parquet/parquet_file_writer.cpp b/be/src/formats/parquet/parquet_file_writer.cpp index ebebcf12d72c2..4c65405d61ce0 100644 --- a/be/src/formats/parquet/parquet_file_writer.cpp +++ b/be/src/formats/parquet/parquet_file_writer.cpp @@ -338,7 +338,7 @@ arrow::Result<::parquet::schema::NodePtr> ParquetFileWriter::_make_schema_node(c } else { return ::parquet::schema::PrimitiveNode::Make( name, rep_type, - ::parquet::LogicalType::Timestamp(true, ::parquet::LogicalType::TimeUnit::unit::MICROS), + ::parquet::LogicalType::Timestamp(false, ::parquet::LogicalType::TimeUnit::unit::MICROS), ::parquet::Type::INT64, -1, file_column_id.field_id); } } diff --git a/test/sql/test_iceberg/R/test_iceberg_catalog_timestamp b/test/sql/test_iceberg/R/test_iceberg_catalog_timestamp index 2941d6cc59283..bf3e4a0f8dd47 100644 --- a/test/sql/test_iceberg/R/test_iceberg_catalog_timestamp +++ b/test/sql/test_iceberg/R/test_iceberg_catalog_timestamp @@ -10,8 +10,8 @@ set catalog ice_cat_${uuid0}; create database ice_db_${uuid0}; use ice_db_${uuid0}; create table ice_tbl_${uuid0} ( - col_str int, - col_int datetime + col_int int, + col_datetime datetime ); insert into ice_tbl_${uuid0} values (1, '2024-01-29 01:00:00'),(2, '2024-01-30 20:10:00'),(3,null); -- result: @@ -22,6 +22,22 @@ select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0}; 2 2024-01-30 20:10:00 3 None -- !result +select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = '2024-01-29 01:00:00'; +-- result: +1 2024-01-29 01:00:00 +-- !result +select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00' as datetime); +-- result: +1 2024-01-29 01:00:00 +-- !result +select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00.000' as datetime); +-- result: +1 2024-01-29 01:00:00 +-- !result +select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00:000' as datetime); +-- result: +1 2024-01-29 01:00:00 +-- !result drop table ice_tbl_${uuid0} force; drop database ice_db_${uuid0}; drop catalog ice_cat_${uuid0}; diff --git a/test/sql/test_iceberg/T/test_iceberg_catalog_timestamp b/test/sql/test_iceberg/T/test_iceberg_catalog_timestamp index 8ed87be2a97ac..5d020e3ce58b8 100644 --- a/test/sql/test_iceberg/T/test_iceberg_catalog_timestamp +++ b/test/sql/test_iceberg/T/test_iceberg_catalog_timestamp @@ -10,13 +10,20 @@ set catalog ice_cat_${uuid0}; create database ice_db_${uuid0}; use ice_db_${uuid0}; create table ice_tbl_${uuid0} ( - col_str int, - col_int datetime + col_int int, + col_datetime datetime ); insert into ice_tbl_${uuid0} values (1, '2024-01-29 01:00:00'),(2, '2024-01-30 20:10:00'),(3,null); select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0}; +select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = '2024-01-29 01:00:00'; + +select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00' as datetime); + +select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00.000' as datetime); + +select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00:000' as datetime); drop table ice_tbl_${uuid0} force; drop database ice_db_${uuid0}; drop catalog ice_cat_${uuid0}; \ No newline at end of file From adced39d548ab871bbae113a82a49d93ea18bf27 Mon Sep 17 00:00:00 2001 From: Youngwb Date: Fri, 17 Jan 2025 14:43:39 +0800 Subject: [PATCH 2/5] fix ut Signed-off-by: Youngwb --- be/src/formats/parquet/column_converter.cpp | 2 +- be/src/formats/parquet/file_writer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/formats/parquet/column_converter.cpp b/be/src/formats/parquet/column_converter.cpp index aa5a70bf30448..ab2dcfd10a4ec 100644 --- a/be/src/formats/parquet/column_converter.cpp +++ b/be/src/formats/parquet/column_converter.cpp @@ -660,7 +660,7 @@ Status Int96ToDateTimeConverter::convert(const ColumnPtr& src, Column* dst) { dst_null_data[i] = src_null_data[i]; if (!src_null_data[i]) { Timestamp timestamp = (static_cast(src_data[i].hi) << TIMESTAMP_BITS) | (src_data[i].lo / 1000); - dst_data[i].set_timestamp(_utc_to_local(timestamp)); + dst_data[i].set_timestamp(timestamp); } } dst_nullable_column->set_has_null(src_nullable_column->has_null()); diff --git a/be/src/formats/parquet/file_writer.cpp b/be/src/formats/parquet/file_writer.cpp index 950e0606614c3..dbf9738ef5226 100644 --- a/be/src/formats/parquet/file_writer.cpp +++ b/be/src/formats/parquet/file_writer.cpp @@ -298,7 +298,7 @@ arrow::Result<::parquet::schema::NodePtr> ParquetBuildHelper::_make_schema_node( } case TYPE_DATETIME: { return ::parquet::schema::PrimitiveNode::Make( - name, rep_type, ::parquet::LogicalType::Timestamp(true, ::parquet::LogicalType::TimeUnit::unit::MICROS), + name, rep_type, ::parquet::LogicalType::Timestamp(false, ::parquet::LogicalType::TimeUnit::unit::MICROS), ::parquet::Type::INT64, -1, file_column_id.field_id); } case TYPE_DECIMAL32: { From 24e47c141cee860f7efb3daf3ad80764f8025475 Mon Sep 17 00:00:00 2001 From: Youngwb Date: Fri, 17 Jan 2025 14:51:12 +0800 Subject: [PATCH 3/5] format Signed-off-by: Youngwb --- be/src/formats/parquet/file_writer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/be/src/formats/parquet/file_writer.cpp b/be/src/formats/parquet/file_writer.cpp index dbf9738ef5226..fa2a55f96afed 100644 --- a/be/src/formats/parquet/file_writer.cpp +++ b/be/src/formats/parquet/file_writer.cpp @@ -298,7 +298,8 @@ arrow::Result<::parquet::schema::NodePtr> ParquetBuildHelper::_make_schema_node( } case TYPE_DATETIME: { return ::parquet::schema::PrimitiveNode::Make( - name, rep_type, ::parquet::LogicalType::Timestamp(false, ::parquet::LogicalType::TimeUnit::unit::MICROS), + name, rep_type, + ::parquet::LogicalType::Timestamp(false, ::parquet::LogicalType::TimeUnit::unit::MICROS), ::parquet::Type::INT64, -1, file_column_id.field_id); } case TYPE_DECIMAL32: { From 2c6c7de45a9abf78017ae23c2dc7901a81a49c42 Mon Sep 17 00:00:00 2001 From: Youngwb Date: Fri, 17 Jan 2025 16:53:15 +0800 Subject: [PATCH 4/5] fix ut Signed-off-by: Youngwb --- be/test/formats/parquet/column_converter_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/be/test/formats/parquet/column_converter_test.cpp b/be/test/formats/parquet/column_converter_test.cpp index 4ebeb4b043031..855b8806c33af 100644 --- a/be/test/formats/parquet/column_converter_test.cpp +++ b/be/test/formats/parquet/column_converter_test.cpp @@ -462,7 +462,7 @@ TEST_F(ColumnConverterTest, Int96Test) { const std::string col_name = "timestamp_int96"; { const TypeDescriptor col_type = TypeDescriptor::from_logical_type(LogicalType::TYPE_DATETIME); - check(file_path, col_type, col_name, "[2023-04-29 00:40:22.618760]", expected_rows); + check(file_path, col_type, col_name, "[2023-04-28 16:40:22.618760]", expected_rows); } } } @@ -476,21 +476,21 @@ TEST_F(ColumnConverterTest, Int96TimeZoneTest) { const std::string col_name = "time_with_new_york"; { const TypeDescriptor col_type = TypeDescriptor::from_logical_type(LogicalType::TYPE_DATETIME); - check(file_path, col_type, col_name, "[2019-04-17 04:00:00]", expected_rows); + check(file_path, col_type, col_name, "[2019-04-16 20:00:00]", expected_rows); } } { const std::string col_name = "time_with_shanghai"; { const TypeDescriptor col_type = TypeDescriptor::from_logical_type(LogicalType::TYPE_DATETIME); - check(file_path, col_type, col_name, "[2019-04-17 04:00:00]", expected_rows); + check(file_path, col_type, col_name, "[2019-04-16 20:00:00]", expected_rows); } } { const std::string col_name = "time_without_timezone"; { const TypeDescriptor col_type = TypeDescriptor::from_logical_type(LogicalType::TYPE_DATETIME); - check(file_path, col_type, col_name, "[2019-04-17 04:00:00]", expected_rows); + check(file_path, col_type, col_name, "[2019-04-16 20:00:00]", expected_rows); } } } From fc909b461149befaf8dbdccf5108bf49c77c23f9 Mon Sep 17 00:00:00 2001 From: Youngwb Date: Sun, 19 Jan 2025 00:44:52 +0800 Subject: [PATCH 5/5] fix ut Signed-off-by: Youngwb --- be/src/formats/parquet/column_converter.cpp | 2 +- be/src/formats/parquet/level_builder.cpp | 4 +++- be/test/formats/parquet/column_converter_test.cpp | 8 ++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/be/src/formats/parquet/column_converter.cpp b/be/src/formats/parquet/column_converter.cpp index ab2dcfd10a4ec..aa5a70bf30448 100644 --- a/be/src/formats/parquet/column_converter.cpp +++ b/be/src/formats/parquet/column_converter.cpp @@ -660,7 +660,7 @@ Status Int96ToDateTimeConverter::convert(const ColumnPtr& src, Column* dst) { dst_null_data[i] = src_null_data[i]; if (!src_null_data[i]) { Timestamp timestamp = (static_cast(src_data[i].hi) << TIMESTAMP_BITS) | (src_data[i].lo / 1000); - dst_data[i].set_timestamp(timestamp); + dst_data[i].set_timestamp(_utc_to_local(timestamp)); } } dst_nullable_column->set_has_null(src_nullable_column->has_null()); diff --git a/be/src/formats/parquet/level_builder.cpp b/be/src/formats/parquet/level_builder.cpp index 85fc7a288d18e..27de8577dc39c 100644 --- a/be/src/formats/parquet/level_builder.cpp +++ b/be/src/formats/parquet/level_builder.cpp @@ -357,7 +357,9 @@ Status LevelBuilder::_write_datetime_column_chunk(const LevelBuilderContext& ctx DeferOp defer([&] { delete[] values; }); for (size_t i = 0; i < col->size(); i++) { - auto timestamp = data_col[i]._timestamp; + auto timestamp = use_int96_timestamp_encoding + ? timestamp::sub(data_col[i]._timestamp, _offset) + : data_col[i]._timestamp; if constexpr (use_int96_timestamp_encoding) { auto date = reinterpret_cast(values[i].value + 2); auto nanosecond = reinterpret_cast(values[i].value); diff --git a/be/test/formats/parquet/column_converter_test.cpp b/be/test/formats/parquet/column_converter_test.cpp index 855b8806c33af..4ebeb4b043031 100644 --- a/be/test/formats/parquet/column_converter_test.cpp +++ b/be/test/formats/parquet/column_converter_test.cpp @@ -462,7 +462,7 @@ TEST_F(ColumnConverterTest, Int96Test) { const std::string col_name = "timestamp_int96"; { const TypeDescriptor col_type = TypeDescriptor::from_logical_type(LogicalType::TYPE_DATETIME); - check(file_path, col_type, col_name, "[2023-04-28 16:40:22.618760]", expected_rows); + check(file_path, col_type, col_name, "[2023-04-29 00:40:22.618760]", expected_rows); } } } @@ -476,21 +476,21 @@ TEST_F(ColumnConverterTest, Int96TimeZoneTest) { const std::string col_name = "time_with_new_york"; { const TypeDescriptor col_type = TypeDescriptor::from_logical_type(LogicalType::TYPE_DATETIME); - check(file_path, col_type, col_name, "[2019-04-16 20:00:00]", expected_rows); + check(file_path, col_type, col_name, "[2019-04-17 04:00:00]", expected_rows); } } { const std::string col_name = "time_with_shanghai"; { const TypeDescriptor col_type = TypeDescriptor::from_logical_type(LogicalType::TYPE_DATETIME); - check(file_path, col_type, col_name, "[2019-04-16 20:00:00]", expected_rows); + check(file_path, col_type, col_name, "[2019-04-17 04:00:00]", expected_rows); } } { const std::string col_name = "time_without_timezone"; { const TypeDescriptor col_type = TypeDescriptor::from_logical_type(LogicalType::TYPE_DATETIME); - check(file_path, col_type, col_name, "[2019-04-16 20:00:00]", expected_rows); + check(file_path, col_type, col_name, "[2019-04-17 04:00:00]", expected_rows); } } }