Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BugFix] parquet writer does not perform time zone adjustments when writing timestamp types. #55194

Merged
merged 5 commits into from
Jan 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion be/src/formats/parquet/file_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,8 @@ arrow::Result<::parquet::schema::NodePtr> ParquetBuildHelper::_make_schema_node(
}
case TYPE_DATETIME: {
return ::parquet::schema::PrimitiveNode::Make(
name, rep_type, ::parquet::LogicalType::Timestamp(true, ::parquet::LogicalType::TimeUnit::unit::MICROS),
name, rep_type,
::parquet::LogicalType::Timestamp(false, ::parquet::LogicalType::TimeUnit::unit::MICROS),
::parquet::Type::INT64, -1, file_column_id.field_id);
}
case TYPE_DECIMAL32: {
Expand Down
5 changes: 3 additions & 2 deletions be/src/formats/parquet/level_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,8 +357,9 @@ Status LevelBuilder::_write_datetime_column_chunk(const LevelBuilderContext& ctx
DeferOp defer([&] { delete[] values; });

for (size_t i = 0; i < col->size(); i++) {
// normalize to utc
auto timestamp = timestamp::sub<TimeUnit::SECOND>(data_col[i]._timestamp, _offset);
auto timestamp = use_int96_timestamp_encoding
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this. Could you elaborate on it a little bit?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The use_int96_timestamp_encoding flag indicates that int96 is used to represent the timestamp type. This is a deprecated encoding method, but we need to maintain compatibility with it. For timestamps encoded this way, we continue to use the previous logic and convert the data to the UTC time zone when writing.

Copy link
Contributor

@DorianZheng DorianZheng Jan 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does it need to be converted to the UTC time zone when it's encoded as int96?

? timestamp::sub<TimeUnit::SECOND>(data_col[i]._timestamp, _offset)
: data_col[i]._timestamp;
if constexpr (use_int96_timestamp_encoding) {
auto date = reinterpret_cast<int32_t*>(values[i].value + 2);
auto nanosecond = reinterpret_cast<int64_t*>(values[i].value);
Expand Down
2 changes: 1 addition & 1 deletion be/src/formats/parquet/parquet_file_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ arrow::Result<::parquet::schema::NodePtr> ParquetFileWriter::_make_schema_node(c
} else {
return ::parquet::schema::PrimitiveNode::Make(
name, rep_type,
::parquet::LogicalType::Timestamp(true, ::parquet::LogicalType::TimeUnit::unit::MICROS),
::parquet::LogicalType::Timestamp(false, ::parquet::LogicalType::TimeUnit::unit::MICROS),
::parquet::Type::INT64, -1, file_column_id.field_id);
}
}
Expand Down
20 changes: 18 additions & 2 deletions test/sql/test_iceberg/R/test_iceberg_catalog_timestamp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ set catalog ice_cat_${uuid0};
create database ice_db_${uuid0};
use ice_db_${uuid0};
create table ice_tbl_${uuid0} (
col_str int,
col_int datetime
col_int int,
col_datetime datetime
);
insert into ice_tbl_${uuid0} values (1, '2024-01-29 01:00:00'),(2, '2024-01-30 20:10:00'),(3,null);
-- result:
Expand All @@ -22,6 +22,22 @@ select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0};
2 2024-01-30 20:10:00
3 None
-- !result
select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = '2024-01-29 01:00:00';
-- result:
1 2024-01-29 01:00:00
-- !result
select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00' as datetime);
-- result:
1 2024-01-29 01:00:00
-- !result
select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00.000' as datetime);
-- result:
1 2024-01-29 01:00:00
-- !result
select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00:000' as datetime);
-- result:
1 2024-01-29 01:00:00
-- !result
drop table ice_tbl_${uuid0} force;
drop database ice_db_${uuid0};
drop catalog ice_cat_${uuid0};
Expand Down
11 changes: 9 additions & 2 deletions test/sql/test_iceberg/T/test_iceberg_catalog_timestamp
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,20 @@ set catalog ice_cat_${uuid0};
create database ice_db_${uuid0};
use ice_db_${uuid0};
create table ice_tbl_${uuid0} (
col_str int,
col_int datetime
col_int int,
col_datetime datetime
);
insert into ice_tbl_${uuid0} values (1, '2024-01-29 01:00:00'),(2, '2024-01-30 20:10:00'),(3,null);

select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0};

select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = '2024-01-29 01:00:00';

select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00' as datetime);

select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00.000' as datetime);

select * from ice_cat_${uuid0}.ice_db_${uuid0}.ice_tbl_${uuid0} where col_datetime = cast('2024-01-29 01:00:00:000' as datetime);
drop table ice_tbl_${uuid0} force;
drop database ice_db_${uuid0};
drop catalog ice_cat_${uuid0};
Loading