Skip to content

Commit

Permalink
apacheGH-45283: [C++][Parquet] Omit level histogram when max level is 0
Browse files Browse the repository at this point in the history
  • Loading branch information
wgtmac committed Jan 16, 2025
1 parent df82d4c commit 6959c5b
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 10 deletions.
3 changes: 3 additions & 0 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1609,6 +1609,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<

auto add_levels = [](std::vector<int64_t>& level_histogram,
::arrow::util::span<const int16_t> levels, int16_t max_level) {
if (max_level == 0) {
return;
}
ARROW_DCHECK_EQ(static_cast<size_t>(max_level) + 1, level_histogram.size());
::parquet::UpdateLevelHistogram(levels, level_histogram);
};
Expand Down
32 changes: 22 additions & 10 deletions cpp/src/parquet/size_statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,19 @@ void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) {
}

void SizeStatistics::Validate(const ColumnDescriptor* descr) const {
if (repetition_level_histogram.size() !=
static_cast<size_t>(descr->max_repetition_level() + 1)) {
throw ParquetException("Repetition level histogram size mismatch");
}
if (definition_level_histogram.size() !=
static_cast<size_t>(descr->max_definition_level() + 1)) {
throw ParquetException("Definition level histogram size mismatch");
}
auto validate_histogram = [](const std::vector<int64_t>& histogram, int16_t max_level,
const std::string& name) {
if (max_level == 0 && histogram.empty()) {
return;
}
if (histogram.size() != static_cast<size_t>(max_level + 1)) {
throw ParquetException(name + " level histogram size mismatch");
}
};
validate_histogram(repetition_level_histogram, descr->max_repetition_level(),
"Repetition");
validate_histogram(definition_level_histogram, descr->max_definition_level(),
"Definition");
if (unencoded_byte_array_data_bytes.has_value() &&
descr->physical_type() != Type::BYTE_ARRAY) {
throw ParquetException("Unencoded byte array data bytes does not support " +
Expand All @@ -93,8 +98,15 @@ void SizeStatistics::Reset() {

std::unique_ptr<SizeStatistics> SizeStatistics::Make(const ColumnDescriptor* descr) {
auto size_stats = std::make_unique<SizeStatistics>();
size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0);
size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0);
// If the max level is 0, the level histogram can be omitted because it contains
// only single level (a.k.a. 0) and its count is equivalent to `num_values` of the
// column chunk or data page.
if (descr->max_repetition_level() != 0) {
size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0);
}
if (descr->max_definition_level() != 0) {
size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0);
}
if (descr->physical_type() == Type::BYTE_ARRAY) {
size_stats->unencoded_byte_array_data_bytes = 0;
}
Expand Down
18 changes: 18 additions & 0 deletions cpp/src/parquet/size_statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -376,4 +376,22 @@ TEST_F(SizeStatisticsRoundTripTest, LargePage) {
/*byte_array_bytes=*/{90000}}));
}

TEST_F(SizeStatisticsRoundTripTest, MaxLevelZero) {
auto schema =
::arrow::schema({::arrow::field("a", ::arrow::utf8(), /*nullable=*/false)});
WriteFile(SizeStatisticsLevel::PageAndColumnChunk,
::arrow::TableFromJSON(schema, {R"([["foo"],["bar"]])"}),
/*max_row_group_length=*/2,
/*page_size=*/1024);
ReadSizeStatistics();
EXPECT_THAT(row_group_stats_,
::testing::ElementsAre(SizeStatistics{/*def_levels=*/{},
/*rep_levels=*/{},
/*byte_array_bytes=*/6}));
EXPECT_THAT(page_stats_,
::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{},
/*rep_levels=*/{},
/*byte_array_bytes=*/{6}}));
}

} // namespace parquet

0 comments on commit 6959c5b

Please sign in to comment.