Skip to content

Commit

Permalink
Add Method to Calculate Logical/Raw Stats for Flat/Constant Scalars V…
Browse files Browse the repository at this point in the history
…ectors (#141)

Summary:

Add a function that takes a Velox vector and returns the logical/raw size of its data.

Currently, the function supports flat- and constant-encoded vectors of scalar types.

Differential Revision: D69929415
  • Loading branch information
nathanphan26 authored and facebook-github-bot committed Feb 28, 2025
1 parent fe4efb0 commit c06a013
Show file tree
Hide file tree
Showing 5 changed files with 393 additions and 1 deletion.
3 changes: 3 additions & 0 deletions dwio/nimble/velox/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,6 @@ target_link_libraries(
nimble_velox_stats_fb
velox_dwio_common
Folly::folly)

# Library built from RawSizeUtils.cpp; links against velox_vector.
add_library(raw_size_utils RawSizeUtils.cpp)
target_link_libraries(raw_size_utils velox_vector)
56 changes: 56 additions & 0 deletions dwio/nimble/velox/RawSizeUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dwio/nimble/velox/RawSizeUtils.h"

namespace facebook::nimble {

/// Computes the raw size, in bytes, of the data held by `vector`.
///
/// Dispatches on the vector's type kind: fixed-width scalar kinds are routed to
/// getRawSizeFromFixedWidthVector<T>() and VARCHAR/VARBINARY are routed to
/// getRawSizeFromStringVector<velox::StringView>().
///
/// Fails via VELOX_CHECK_NOT_NULL when `vector` is null and via VELOX_FAIL
/// when the type kind is not one of the kinds listed below.
uint64_t getRawSizeFromVector(const velox::VectorPtr& vector) {
  VELOX_CHECK_NOT_NULL(vector, "vector is null");

  const auto kind = vector->typeKind();
  switch (kind) {
    // Fixed-width scalar kinds.
    case velox::TypeKind::BOOLEAN:
      return getRawSizeFromFixedWidthVector<bool>(vector);
    case velox::TypeKind::TINYINT:
      return getRawSizeFromFixedWidthVector<int8_t>(vector);
    case velox::TypeKind::SMALLINT:
      return getRawSizeFromFixedWidthVector<int16_t>(vector);
    case velox::TypeKind::INTEGER:
      return getRawSizeFromFixedWidthVector<int32_t>(vector);
    case velox::TypeKind::BIGINT:
      return getRawSizeFromFixedWidthVector<int64_t>(vector);
    case velox::TypeKind::REAL:
      return getRawSizeFromFixedWidthVector<float>(vector);
    case velox::TypeKind::DOUBLE:
      return getRawSizeFromFixedWidthVector<double>(vector);

    // Variable-width kinds share the StringView-based implementation.
    case velox::TypeKind::VARCHAR:
    case velox::TypeKind::VARBINARY:
      return getRawSizeFromStringVector<velox::StringView>(vector);

    default:
      VELOX_FAIL("Unsupported type: {}", kind);
  }
}

} // namespace facebook::nimble
105 changes: 105 additions & 0 deletions dwio/nimble/velox/RawSizeUtils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "velox/vector/BaseVector.h"
#include "velox/vector/ConstantVector.h"
#include "velox/vector/FlatVector.h"

namespace facebook::nimble {

/// Number of bytes attributed to each null row when computing raw sizes.
/// Declared `inline` so that every translation unit including this header
/// shares a single definition.
inline constexpr uint64_t NULL_SIZE = 1;

/// Computes the raw size, in bytes, of a fixed-width scalar vector.
///
/// Every non-null row contributes `vector->type()->cppSizeInBytes()` bytes and
/// every null row contributes NULL_SIZE bytes.
///
/// @tparam T Native value type of the vector; must be one of bool, int8_t,
///     int16_t, int32_t, int64_t, float or double (enforced at compile time).
/// @param vector Vector to measure. Only FLAT and CONSTANT encodings are
///     supported.
/// @return Raw size of the vector's data in bytes.
/// Fails via VELOX_FAIL when `vector` is null, when it cannot be viewed as a
/// flat/constant vector of T, or when its encoding is unsupported.
template <typename T>
uint64_t getRawSizeFromFixedWidthVector(const velox::VectorPtr& vector) {
  // T is known at compile time, so reject unsupported instantiations there
  // rather than with a runtime check.
  static_assert(
      std::disjunction_v<
          std::is_same<bool, T>,
          std::is_same<int8_t, T>,
          std::is_same<int16_t, T>,
          std::is_same<int32_t, T>,
          std::is_same<int64_t, T>,
          std::is_same<float, T>,
          std::is_same<double, T>>,
      "Wrong vector type. Expected bool | int8_t | int16_t | int32_t | int64_t | float | double.");

  if (vector == nullptr) {
    VELOX_FAIL("vector is null");
  }

  const auto encoding = vector->encoding();
  switch (encoding) {
    case velox::VectorEncoding::Simple::FLAT: {
      auto* flatVector = vector->asFlatVector<T>();
      // Guard against a vector whose value type does not match T.
      if (flatVector == nullptr) {
        VELOX_FAIL("Vector is not a flat vector of the expected type");
      }

      const uint64_t rowCount = flatVector->size();
      const uint64_t nullCount = velox::BaseVector::countNulls(
          flatVector->nulls(), flatVector->size());
      const uint64_t valueSize = flatVector->type()->cppSizeInBytes();

      // Non null count * size in bytes + null count * null size
      return ((rowCount - nullCount) * valueSize) + (nullCount * NULL_SIZE);
    }
    case velox::VectorEncoding::Simple::CONSTANT: {
      auto* constVector = vector->as<velox::ConstantVector<T>>();
      // Guard against a vector whose value type does not match T.
      if (constVector == nullptr) {
        VELOX_FAIL("Vector is not a constant vector of the expected type");
      }

      // All rows of a constant vector share one value, so the vector is sized
      // either entirely as nulls or entirely as values.
      return constVector->mayHaveNulls()
          ? NULL_SIZE * constVector->size()
          : constVector->size() * constVector->type()->cppSizeInBytes();
    }
    default: {
      VELOX_FAIL("Unsupported encoding: {}", encoding);
    }
  }
}

/// Computes the raw size, in bytes, of a string (VARCHAR/VARBINARY) vector.
///
/// Every non-null row contributes the length of its string and every null row
/// contributes NULL_SIZE bytes.
///
/// @tparam T Native value type of the vector; must be velox::StringView
///     (enforced at compile time).
/// @param vector Vector to measure. Only FLAT and CONSTANT encodings are
///     supported.
/// @return Raw size of the vector's data in bytes.
/// Fails via VELOX_FAIL when `vector` is null, when it cannot be viewed as a
/// flat/constant vector of T, or when its encoding is unsupported.
template <typename T>
uint64_t getRawSizeFromStringVector(const velox::VectorPtr& vector) {
  // T is known at compile time, so reject unsupported instantiations there
  // rather than with a runtime check.
  static_assert(
      std::is_same_v<velox::StringView, T>,
      "Wrong vector type. Expected StringView.");

  if (vector == nullptr) {
    VELOX_FAIL("vector is null");
  }

  const auto encoding = vector->encoding();
  switch (encoding) {
    case velox::VectorEncoding::Simple::FLAT: {
      auto* flatVector = vector->as<velox::FlatVector<T>>();
      // Guard against a vector whose value type does not match T.
      if (flatVector == nullptr) {
        VELOX_FAIL("Vector is not a flat vector of the expected type");
      }

      auto rowCount = flatVector->size();
      const velox::StringView* stringValues = flatVector->rawValues();

      // Only read the value slot of non-null rows: the slot of a null row is
      // not guaranteed to hold a meaningful StringView, and the values buffer
      // itself may be absent when every row is null.
      uint64_t rawSize = 0;
      uint64_t nullCount = 0;
      for (decltype(rowCount) row = 0; row < rowCount; ++row) {
        if (flatVector->isNullAt(row)) {
          ++nullCount;
        } else {
          rawSize += stringValues[row].size();
        }
      }

      return rawSize + (nullCount * NULL_SIZE);
    }
    case velox::VectorEncoding::Simple::CONSTANT: {
      auto* constVector = vector->as<velox::ConstantVector<T>>();
      // Guard against a vector whose value type does not match T.
      if (constVector == nullptr) {
        VELOX_FAIL("Vector is not a constant vector of the expected type");
      }

      // All rows of a constant vector share one value, so the vector is sized
      // either entirely as nulls or as size() copies of that value's length.
      return constVector->mayHaveNulls()
          ? NULL_SIZE * constVector->size()
          : constVector->value().size() * constVector->size();
    }
    default: {
      VELOX_FAIL("Unsupported encoding: {}", encoding);
    }
  }
}

/// Returns the raw size, in bytes, of the data held by `vector`.
uint64_t getRawSizeFromVector(const velox::VectorPtr& vector);

} // namespace facebook::nimble
14 changes: 13 additions & 1 deletion dwio/nimble/velox/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) Meta Platforms, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -46,3 +46,15 @@ target_link_libraries(
gtest
gtest_main
Folly::folly)

# Test executable built from RawSizeTests.cpp and registered as the
# raw_size_tests test; links the raw_size_utils library under test.
add_executable(raw_size_tests RawSizeTests.cpp)
add_test(raw_size_tests raw_size_tests)

target_link_libraries(
raw_size_tests
raw_size_utils
velox_vector
velox_vector_test_lib
gtest
gtest_main
Folly::folly)
Loading

0 comments on commit c06a013

Please sign in to comment.