From ddbb568819779de2e8174c2ee3a2bab89c4023de Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Mon, 18 Nov 2024 21:02:13 +1100 Subject: [PATCH] Fix bug with decoding timestamp as decimal --- src/array_decoder/timestamp.rs | 2 +- src/encoding/timestamp.rs | 2 ++ tests/basic/main.rs | 40 ++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/array_decoder/timestamp.rs b/src/array_decoder/timestamp.rs index b6c45b5..1bad694 100644 --- a/src/array_decoder/timestamp.rs +++ b/src/array_decoder/timestamp.rs @@ -103,7 +103,7 @@ fn decimal128_decoder( let data = get_rle_reader(column, data)?; let secondary = stripe.stream_map().get(column, Kind::Secondary); - let secondary = get_rle_reader(column, secondary)?; + let secondary = get_unsigned_rle_reader(column, secondary); let present = PresentDecoder::from_stripe(stripe, column); diff --git a/src/encoding/timestamp.rs b/src/encoding/timestamp.rs index d011311..5f7fd5c 100644 --- a/src/encoding/timestamp.rs +++ b/src/encoding/timestamp.rs @@ -133,6 +133,8 @@ fn decode(base: i64, seconds_since_orc_base: i64, nanoseconds: i64) -> (i128, i6 // while we encode them as a single i64 of nanoseconds in Arrow. let nanoseconds_since_epoch = (seconds as i128 * NANOSECONDS_IN_SECOND as i128) + (nanoseconds as i128); + // Returning seconds & nanoseconds only for error message + // TODO: does the error message really need those details? Can simplify by removing. (nanoseconds_since_epoch, seconds, nanoseconds) } diff --git a/tests/basic/main.rs b/tests/basic/main.rs index 84d62d8..424767d 100644 --- a/tests/basic/main.rs +++ b/tests/basic/main.rs @@ -615,6 +615,46 @@ pub fn decimal128_timestamps_test() { assert_batches_eq(&batch, &expected); } +fn integration_path(path: &str) -> String { + let dir = env!("CARGO_MANIFEST_DIR"); + format!("{}/tests/integration/data/{}", dir, path) +} + +// TODO: Move this to integration test file. Placed here because it has access to assert_batches_eq. +// Should make that function available to both basics & integration. +#[test] +pub fn decimal128_timestamps_1900_test() { + let path = integration_path("TestOrcFile.testDate1900.orc"); + let f = File::open(path).expect("no file found"); + let mut reader = ArrowReaderBuilder::try_new(f) + .unwrap() + .with_batch_size(11) // it's a big file, we don't want to test more than that + .with_schema(Arc::new(Schema::new(vec![ + Field::new("time", DataType::Decimal128(38, 9), true), + Field::new("date", DataType::Date32, true), + ]))) + .build(); + let batch = reader.next().unwrap().unwrap(); + let expected = [ + "+-----------------------+------------+", + "| time | date |", + "+-----------------------+------------+", + "| -2198229903.900000000 | 1900-12-25 |", + "| -2198229903.899900000 | 1900-12-25 |", + "| -2198229903.899800000 | 1900-12-25 |", + "| -2198229903.899700000 | 1900-12-25 |", + "| -2198229903.899600000 | 1900-12-25 |", + "| -2198229903.899500000 | 1900-12-25 |", + "| -2198229903.899400000 | 1900-12-25 |", + "| -2198229903.899300000 | 1900-12-25 |", + "| -2198229903.899200000 | 1900-12-25 |", + "| -2198229903.899100000 | 1900-12-25 |", + "| -2198229903.899000000 | 1900-12-25 |", + "+-----------------------+------------+", + ]; + assert_batches_eq(&[batch], &expected); +} + // From https://github.com/apache/arrow-rs/blob/7705acad845e8b2a366a08640f7acb4033ed7049/arrow-flight/src/sql/metadata/mod.rs#L67-L75 pub fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) { let formatted = pretty::pretty_format_batches(batches).unwrap().to_string();