Skip to content

Commit

Permalink
refine
Browse files Browse the repository at this point in the history
Signed-off-by: guo-shaoge <[email protected]>
  • Loading branch information
guo-shaoge committed Jan 29, 2025
1 parent 7b8c19f commit 241537d
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 64 deletions.
108 changes: 59 additions & 49 deletions dbms/src/Common/ColumnsHashing.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ struct HashMethodOneNumber
using KeyHolderType = FieldType;

static constexpr bool is_serialized_key = false;
static constexpr bool batch_get_key_holder = false;

const FieldType * vec;

Expand Down Expand Up @@ -99,11 +98,14 @@ class KeyStringBatchHandlerBase
size_t prepareNextBatchType(
const UInt8 * chars,
const IColumn::Offsets & offsets,
size_t batch_size,
size_t cur_batch_size,
const TiDB::TiDBCollatorPtr & collator)
{
if (cur_batch_size <= 0)
return 0;

const auto * derived_collator = static_cast<const DerivedCollator *>(collator);
for (size_t i = 0; i < batch_size; ++i)
for (size_t i = 0; i < cur_batch_size; ++i)
{
const auto row = batch_row_idx + i;
const auto last_offset = offsets[row - 1];
Expand All @@ -124,26 +126,33 @@ class KeyStringBatchHandlerBase
}

protected:
void init(size_t start_row, size_t batch_size)
/// Reports whether this batch handler has been set up: true once sort_key_containers is
/// non-empty. init() resizes sort_key_containers to max_batch_size (checked to be >= 256),
/// so a successful init() always makes this return true.
/// HashMethodString uses it in asserts to tell batch mode apart from per-row getKeyHolder().
bool inited() const
{
return !sort_key_containers.empty();
}

void init(size_t start_row, size_t max_batch_size)
{
RUNTIME_CHECK(max_batch_size >= 256);
batch_row_idx = start_row;
sort_key_containers.resize(batch_size);
batch_rows.resize(batch_size);
sort_key_containers.resize(max_batch_size);
batch_rows.reserve(max_batch_size);
}

public:
size_t prepareNextBatch(
const UInt8 * chars,
const IColumn::Offsets & offsets,
size_t batch_size,
size_t cur_batch_size,
const TiDB::TiDBCollatorPtr & collator)
{
batch_rows.resize(cur_batch_size);

if likely (collator)
{
#define M(VAR_PREFIX, COLLATOR_NAME, IMPL_TYPE, COLLATOR_ID) \
case (COLLATOR_ID): \
{ \
return prepareNextBatchType<IMPL_TYPE, true>(chars, offsets, batch_size, collator); \
return prepareNextBatchType<IMPL_TYPE, true>(chars, offsets, cur_batch_size, collator); \
break; \
}

Expand All @@ -159,7 +168,7 @@ class KeyStringBatchHandlerBase
}
else
{
return prepareNextBatchType<TiDB::ITiDBCollator, false>(chars, offsets, batch_size, collator);
return prepareNextBatchType<TiDB::ITiDBCollator, false>(chars, offsets, cur_batch_size, collator);
}
}

Expand All @@ -173,20 +182,19 @@ class KeyStringBatchHandlerBase
};

/// For the case when there is one string key.
template <typename Value, typename Mapped, bool use_cache = true, size_t get_key_holder_batch_size = 0>
template <typename Value, typename Mapped, bool use_cache = true>
struct HashMethodString
: public columns_hashing_impl::HashMethodBase<HashMethodString<Value, Mapped, use_cache, get_key_holder_batch_size>, Value, Mapped, use_cache>
: public columns_hashing_impl::HashMethodBase<HashMethodString<Value, Mapped, use_cache>, Value, Mapped, use_cache>
, KeyStringBatchHandlerBase
{
using Self = HashMethodString<Value, Mapped, use_cache>;
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
using KeyHolderType = ArenaKeyHolder;
using BatchKeyHolderType = KeyHolderType;

static_assert(get_key_holder_batch_size == 0 || get_key_holder_batch_size >= 256);
using BatchHandlerBase = KeyStringBatchHandlerBase;

static constexpr bool is_serialized_key = false;
static constexpr bool batch_get_key_holder = (get_key_holder_batch_size > 0);

const IColumn::Offset * offsets;
const UInt8 * chars;
Expand All @@ -205,24 +213,24 @@ struct HashMethodString
collator = collators[0];
}

void initBatchHandler(size_t start_row)
void initBatchHandler(size_t start_row, size_t max_batch_size)
{
assert(batch_get_key_holder);
BatchHandlerBase::init(start_row, get_key_holder_batch_size);
assert(!BatchHandlerBase::inited());
BatchHandlerBase::init(start_row, max_batch_size);
}

size_t prepareNextBatch(Arena *)
size_t prepareNextBatch(Arena *, size_t cur_batch_size)
{
assert(batch_get_key_holder);
return BatchHandlerBase::prepareNextBatch(chars, *offsets, get_key_holder_batch_size, collator);
assert(BatchHandlerBase::inited());
return BatchHandlerBase::prepareNextBatch(chars, *offsets, cur_batch_size, collator);
}

ALWAYS_INLINE inline KeyHolderType getKeyHolder(
ssize_t row,
[[maybe_unused]] Arena * pool,
std::vector<String> & sort_key_containers) const
{
assert(!batch_get_key_holder);
assert(!BatchHandlerBase::inited());

auto last_offset = row == 0 ? 0 : offsets[row - 1];
// Remove last zero byte.
Expand All @@ -244,9 +252,9 @@ struct HashMethodStringBin
using Self = HashMethodStringBin<Value, Mapped, padding>;
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
using KeyHolderType = ArenaKeyHolder;
using BatchKeyHolderType = KeyHolderType;

static constexpr bool is_serialized_key = false;
static constexpr bool batch_get_key_holder = false;

const IColumn::Offset * offsets;
const UInt8 * chars;
Expand Down Expand Up @@ -445,9 +453,9 @@ struct HashMethodFastPathTwoKeysSerialized
using Self = HashMethodFastPathTwoKeysSerialized<Key1Desc, Key2Desc, Value, Mapped>;
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
using KeyHolderType = SerializedKeyHolder;
using BatchKeyHolderType = KeyHolderType;

static constexpr bool is_serialized_key = true;
static constexpr bool batch_get_key_holder = false;

Key1Desc key_1_desc;
Key2Desc key_2_desc;
Expand Down Expand Up @@ -483,9 +491,9 @@ struct HashMethodFixedString
using Self = HashMethodFixedString<Value, Mapped, use_cache>;
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
using KeyHolderType = ArenaKeyHolder;
using BatchKeyHolderType = KeyHolderType;

static constexpr bool is_serialized_key = false;
static constexpr bool batch_get_key_holder = false;

size_t n;
const ColumnFixedString::Chars_t * chars;
Expand Down Expand Up @@ -532,9 +540,9 @@ struct HashMethodKeysFixed
using BaseHashed = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
using Base = columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>;
using KeyHolderType = Key;
using BatchKeyHolderType = KeyHolderType;

static constexpr bool is_serialized_key = false;
static constexpr bool batch_get_key_holder = false;
static constexpr bool has_nullable_keys = has_nullable_keys_;

Sizes key_sizes;
Expand Down Expand Up @@ -700,33 +708,38 @@ class KeySerializedBatchHandlerBase
}

protected:
/// Reports whether this batch handler has been set up: true once byte_size is non-empty.
/// init() sizes byte_size from the first key column and checks that it is not empty,
/// so a successful init() always makes this return true.
bool inited() const
{
return !byte_size.empty();
}

/// Sets the handler up for batch processing that begins at start_row.
/// byte_size is resized (via resize_fill_zero) to the row count of the first key column,
/// then every key column is handed byte_size through countSerializeByteSizeForCmp().
/// Each column gets the collator at the same index, or nullptr when collators is empty.
void init(size_t start_row, const ColumnRawPtrs & key_columns, const TiDB::TiDBCollators & collators)
{
batch_row_idx = start_row;
byte_size.resize_fill_zero(key_columns[0]->size());
// inited() is defined as "byte_size is non-empty", so a zero-row key column is checked for here.
RUNTIME_CHECK(!byte_size.empty());
// NOTE(review): presumably each column adds its per-row serialized size into byte_size,
// since prepareNextBatch() sums byte_size[i] to size its allocation — confirm against IColumn.
for (size_t i = 0; i < key_columns.size(); ++i)
key_columns[i]->countSerializeByteSizeForCmp(byte_size, collators.empty() ? nullptr : collators[i]);
}

public:
size_t prepareNextBatch(
const ColumnRawPtrs & key_columns,
Arena * pool,
size_t batch_size,
size_t cur_batch_size,
const TiDB::TiDBCollators & collators)
{
santityCheck();
resize(batch_size);
resize(cur_batch_size);

if unlikely (batch_size <= 0)
if unlikely (cur_batch_size <= 0)
return 0;

size_t mem_size = 0;
for (size_t i = batch_row_idx; i < batch_row_idx + batch_size; ++i)
for (size_t i = batch_row_idx; i < batch_row_idx + cur_batch_size; ++i)
mem_size += byte_size[i];

auto * ptr = static_cast<char *>(pool->alignedAlloc(mem_size, 16));
for (size_t i = 0; i < batch_size; ++i)
for (size_t i = 0; i < cur_batch_size; ++i)
{
pos[i] = ptr;
ori_pos[i] = ptr;
Expand All @@ -737,15 +750,15 @@ class KeySerializedBatchHandlerBase
key_columns[i]->serializeToPosForCmp(
pos,
batch_row_idx,
batch_size,
cur_batch_size,
false,
collators.empty() ? nullptr : collators[i],
&sort_key_container);

for (size_t i = 0; i < batch_size; ++i)
for (size_t i = 0; i < cur_batch_size; ++i)
real_byte_size[i] = pos[i] - ori_pos[i];

batch_row_idx += batch_size;
batch_row_idx += cur_batch_size;

return mem_size;
}
Expand All @@ -764,20 +777,17 @@ class KeySerializedBatchHandlerBase
* That is, for example, for strings, it contains first the serialized length of the string, and then the bytes.
* Therefore, when aggregating by several strings, there is no ambiguity.
*/
template <typename Value, typename Mapped, size_t get_key_holder_batch_size = 0>
template <typename Value, typename Mapped>
struct HashMethodSerialized
: public columns_hashing_impl::HashMethodBase<HashMethodSerialized<Value, Mapped, get_key_holder_batch_size>, Value, Mapped, false>
: public columns_hashing_impl::HashMethodBase<HashMethodSerialized<Value, Mapped>, Value, Mapped, false>
, KeySerializedBatchHandlerBase
{
using Self = HashMethodSerialized<Value, Mapped, get_key_holder_batch_size>;
using Self = HashMethodSerialized<Value, Mapped>;
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;

static_assert(get_key_holder_batch_size == 0 || get_key_holder_batch_size >= 256);
using BatchHandlerBase = KeySerializedBatchHandlerBase;

static constexpr bool is_serialized_key = true;
static constexpr bool batch_get_key_holder = (get_key_holder_batch_size > 0);
using KeyHolderType = typename std::conditional<batch_get_key_holder, ArenaKeyHolder, SerializedKeyHolder>::type;
using KeyHolderType = SerializedKeyHolder;
using BatchKeyHolderType = ArenaKeyHolder;

ColumnRawPtrs key_columns;
size_t keys_size;
Expand All @@ -792,22 +802,22 @@ struct HashMethodSerialized
, collators(collators_)
{}

void initBatchHandler(size_t start_row)
void initBatchHandler(size_t start_row, size_t /* max_batch_size */)
{
assert(batch_get_key_holder);
assert(!BatchHandlerBase::inited());
BatchHandlerBase::init(start_row, key_columns, collators);
}

size_t prepareNextBatch(Arena * pool)
size_t prepareNextBatch(Arena * pool, size_t cur_batch_size)
{
assert(batch_get_key_holder);
return BatchHandlerBase::prepareNextBatch(key_columns, pool, get_key_holder_batch_size, collators);
assert(BatchHandlerBase::inited());
return BatchHandlerBase::prepareNextBatch(key_columns, pool, cur_batch_size, collators);
}

ALWAYS_INLINE inline KeyHolderType getKeyHolder(size_t row, Arena * pool, std::vector<String> & sort_key_containers)
const
{
assert(!batch_get_key_holder);
assert(!BatchHandlerBase::inited());
return SerializedKeyHolder{
serializeKeysToPoolContiguous(row, keys_size, key_columns, collators, sort_key_containers, *pool),
pool};
Expand All @@ -826,9 +836,9 @@ struct HashMethodHashed
using Self = HashMethodHashed<Value, Mapped, use_cache>;
using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
using KeyHolderType = Key;
using BatchKeyHolderType = KeyHolderType;

static constexpr bool is_serialized_key = false;
static constexpr bool batch_get_key_holder = false;

ColumnRawPtrs key_columns;
TiDB::TiDBCollators collators;
Expand Down
Loading

0 comments on commit 241537d

Please sign in to comment.