From 6959c5ba23c9756265eefe9e64a2d2f6d93d9873 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 17 Jan 2025 00:00:43 +0800 Subject: [PATCH] GH-45283: [C++][Parquet] Omit level histogram when max level is 0 --- cpp/src/parquet/column_writer.cc | 3 +++ cpp/src/parquet/size_statistics.cc | 32 +++++++++++++++++-------- cpp/src/parquet/size_statistics_test.cc | 18 ++++++++++++++ 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 683a5ab735aed..4998e6f301a00 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1609,6 +1609,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< auto add_levels = [](std::vector& level_histogram, ::arrow::util::span levels, int16_t max_level) { + if (max_level == 0) { + return; + } ARROW_DCHECK_EQ(static_cast(max_level) + 1, level_histogram.size()); ::parquet::UpdateLevelHistogram(levels, level_histogram); }; diff --git a/cpp/src/parquet/size_statistics.cc b/cpp/src/parquet/size_statistics.cc index 7292f9222a684..2be1828625921 100644 --- a/cpp/src/parquet/size_statistics.cc +++ b/cpp/src/parquet/size_statistics.cc @@ -64,14 +64,19 @@ void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) { } void SizeStatistics::Validate(const ColumnDescriptor* descr) const { - if (repetition_level_histogram.size() != - static_cast(descr->max_repetition_level() + 1)) { - throw ParquetException("Repetition level histogram size mismatch"); - } - if (definition_level_histogram.size() != - static_cast(descr->max_definition_level() + 1)) { - throw ParquetException("Definition level histogram size mismatch"); - } + auto validate_histogram = [](const std::vector& histogram, int16_t max_level, + const std::string& name) { + if (max_level == 0 && histogram.empty()) { + return; + } + if (histogram.size() != static_cast(max_level + 1)) { + throw ParquetException(name + " level histogram size mismatch"); + } + }; + validate_histogram(repetition_level_histogram, descr->max_repetition_level(), + "Repetition"); + validate_histogram(definition_level_histogram, descr->max_definition_level(), + "Definition"); if (unencoded_byte_array_data_bytes.has_value() && descr->physical_type() != Type::BYTE_ARRAY) { throw ParquetException("Unencoded byte array data bytes does not support " + @@ -93,8 +98,15 @@ void SizeStatistics::Reset() { std::unique_ptr SizeStatistics::Make(const ColumnDescriptor* descr) { auto size_stats = std::make_unique(); - size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0); - size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0); + // If the max level is 0, the level histogram can be omitted because it contains + // only single level (a.k.a. 0) and its count is equivalent to `num_values` of the + // column chunk or data page. + if (descr->max_repetition_level() != 0) { + size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0); + } + if (descr->max_definition_level() != 0) { + size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0); + } if (descr->physical_type() == Type::BYTE_ARRAY) { size_stats->unencoded_byte_array_data_bytes = 0; } diff --git a/cpp/src/parquet/size_statistics_test.cc b/cpp/src/parquet/size_statistics_test.cc index 0958ae4dec2ca..ee2b79daabf3b 100644 --- a/cpp/src/parquet/size_statistics_test.cc +++ b/cpp/src/parquet/size_statistics_test.cc @@ -376,4 +376,22 @@ TEST_F(SizeStatisticsRoundTripTest, LargePage) { /*byte_array_bytes=*/{90000}})); } +TEST_F(SizeStatisticsRoundTripTest, MaxLevelZero) { + auto schema = + ::arrow::schema({::arrow::field("a", ::arrow::utf8(), /*nullable=*/false)}); + WriteFile(SizeStatisticsLevel::PageAndColumnChunk, + ::arrow::TableFromJSON(schema, {R"([["foo"],["bar"]])"}), + /*max_row_group_length=*/2, + /*page_size=*/1024); + ReadSizeStatistics(); + EXPECT_THAT(row_group_stats_, + ::testing::ElementsAre(SizeStatistics{/*def_levels=*/{}, + /*rep_levels=*/{}, + /*byte_array_bytes=*/6})); + EXPECT_THAT(page_stats_, + ::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{}, + /*rep_levels=*/{}, + /*byte_array_bytes=*/{6}})); +} + } // namespace parquet