Skip to content

Commit

Permalink
[feature](function) support format function
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangstar333 committed Jan 16, 2025
1 parent cc183eb commit 6975cbb
Show file tree
Hide file tree
Showing 11 changed files with 939 additions and 0 deletions.
241 changes: 241 additions & 0 deletions be/src/vec/functions/function_format.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <glog/logging.h>

#include <cstdio>
#include <regex>
#include <vector>

#include "common/status.h"
#include "vec/columns/column.h"
#include "vec/columns/column_vector.h"
#include "vec/common/assert_cast.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_number.h"
#include "vec/functions/cast_type_to_either.h"
#include "vec/functions/simple_function_factory.h"

namespace doris::vectorized {

/// Scalar function `format_number(double)`.
///
/// Scales the magnitude of the input down by powers of 1000 and renders it with a short
/// unit suffix taken from UNITS, returning the text as a string column.
class FunctionFormatNumber : public IFunction {
public:
    static constexpr auto name = "format_number";

    // Suffix for each power of 1000; the index is the number of times the value was divided.
    static constexpr const char* UNITS[6] = {"", "K", "M", "B", "T", "Q"};

    static FunctionPtr create() { return std::make_shared<FunctionFormatNumber>(); }

    String get_name() const override { return name; }

    size_t get_number_of_arguments() const override { return 1; }

    bool is_variadic() const override { return false; }

    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
        return std::make_shared<DataTypeString>();
    }

    /// Formats every row of the Float64 argument column and stores the resulting
    /// string column at position `result` of `block`.
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                        uint32_t result, size_t input_rows_count) const override {
        auto column = block.get_by_position(arguments[0]).column;
        const auto& column_data =
                assert_cast<const ColumnVector<Float64>*>(column.get())->get_data();
        auto col_res = ColumnString::create();
        // One buffer is reused for all rows; each row's text is copied from it directly
        // into the result column, so no temporary std::string is built per row.
        fmt::memory_buffer buffer;

        for (size_t i = 0; i < input_rows_count; ++i) {
            format_number_to(buffer, column_data[i]);
            col_res->insert_data(buffer.data(), buffer.size());
        }
        block.replace_by_position(result, std::move(col_res));
        return Status::OK();
    }

    /// Formats `number` into `buffer` (whose previous content is discarded) and returns
    /// a copy of the formatted text.
    std::string format_number(fmt::memory_buffer& buffer, double number) const {
        format_number_to(buffer, number);
        return fmt::to_string(buffer);
    }

private:
    /// Clears `buffer` and writes the formatted representation of `number` into it.
    static void format_number_to(fmt::memory_buffer& buffer, double number) {
        buffer.clear();
        double abs_number = std::abs(number);
        int unit_index = 0;
        // Stop at the last suffix: anything larger stays expressed in that unit.
        // NOTE(review): an infinite input never drops below 1000, so it ends up with
        // the last suffix as well -- confirm whether that is the intended output.
        while (abs_number >= 1000 && unit_index < 5) {
            abs_number /= 1000;
            ++unit_index;
        }
        if (number < 0) {
            fmt::format_to(buffer, "-");
        }
        // Fewer decimals are kept as the scaled magnitude grows.
        if (abs_number == 1) {
            // Exactly one unit is printed with the default floating-point format.
            fmt::format_to(buffer, "{}", abs_number);
        } else if (abs_number < 10) {
            fmt::format_to(buffer, "{:.2f}", abs_number);
        } else if (abs_number < 100) {
            fmt::format_to(buffer, "{:.1f}", abs_number);
        } else {
            fmt::format_to(buffer, "{:.0f}", abs_number);
        }
        // The suffix is data, not a format string, so it is passed as an argument.
        fmt::format_to(buffer, "{}", UNITS[unit_index]);
    }
};

/// Scalar function `format(format_string, arg1[, arg2, ...])`.
///
/// For every row, the value arguments are rendered through the row's fmt-style format
/// string and the result is returned as a string column. The concrete column type of
/// the value arguments is chosen from the declared type of the first value argument
/// (arguments[1]); every value argument is then read as that same column type.
class FunctionFormat : public IFunction {
public:
    static constexpr auto name = "format";

    static FunctionPtr create() { return std::make_shared<FunctionFormat>(); }

    String get_name() const override { return name; }

    size_t get_number_of_arguments() const override { return 0; }

    bool is_variadic() const override { return true; }

    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
        return std::make_shared<DataTypeString>();
    }

    /// Dispatches on the data type of arguments[1] and runs the matching typed
    /// implementation. Returns a RuntimeError status when the type is not one of the
    /// types listed in cast_type(), or when the column does not match that type.
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                        uint32_t result, size_t input_rows_count) const override {
        DCHECK_GE(arguments.size(), 2);
        bool valid =
                cast_type(block.get_by_position(arguments[1]).type.get(), [&](const auto& type) {
                    using DataType = std::decay_t<decltype(type)>;
                    using T = typename DataType::FieldType;
                    using ColVecData =
                            std::conditional_t<IsNumber<T>, ColumnVector<T>, ColumnString>;
                    // Accept either a column of the expected concrete type or a constant
                    // column (which is unpacked or expanded further down).
                    const bool matches =
                            check_and_get_column<ColVecData>(
                                    block.get_by_position(arguments[1]).column.get()) ||
                            is_column_const(*block.get_by_position(arguments[1]).column);
                    if (matches) {
                        execute_inner<ColVecData, T>(block, arguments, result, input_rows_count);
                        return true;
                    }
                    return false;
                });
        if (!valid) {
            return Status::RuntimeError(
                    "{}'s argument does not match the expected data type, type: {}, column: {}",
                    get_name(), block.get_by_position(arguments[1]).type->get_name(),
                    block.get_by_position(arguments[1]).column->dump_structure());
        }
        return Status::OK();
    }

    /// Invokes `f` with the concrete data type of `type` when it is one of the
    /// supported value types; the result comes from cast_type_to_either.
    template <typename F>
    static bool cast_type(const IDataType* type, F&& f) {
        return cast_type_to_either<DataTypeInt8, DataTypeInt16, DataTypeInt32, DataTypeInt64,
                                   DataTypeInt128, DataTypeFloat32, DataTypeFloat64,
                                   DataTypeString>(type, std::forward<F>(f));
    }

    /// Builds the result column for value columns of type `ColVecData` and stores it
    /// at position `result` of `block`.
    template <typename ColVecData, typename T>
    void execute_inner(Block& block, const ColumnNumbers& arguments, uint32_t result,
                       size_t input_rows_count) const {
        size_t argument_size = arguments.size();
        std::vector<ColumnPtr> argument_columns(argument_size);
        auto result_column = ColumnString::create();

        // format(const, column) is expected to be the most common call shape, so only
        // the two-argument path keeps constant columns unexpanded.
        if (argument_size == 2) {
            std::vector<uint8_t> is_consts(argument_size);
            std::tie(argument_columns[0], is_consts[0]) =
                    unpack_if_const(block.get_by_position(arguments[0]).column);
            std::tie(argument_columns[1], is_consts[1]) =
                    unpack_if_const(block.get_by_position(arguments[1]).column);
            execute_for_two_argument<ColVecData, T>(argument_columns, is_consts,
                                                    assert_cast<ColumnString*>(result_column.get()),
                                                    input_rows_count);
        } else {
            // General path: expand every constant so all columns can be indexed by row.
            for (size_t i = 0; i < argument_size; ++i) {
                argument_columns[i] = block.get_by_position(arguments[i])
                                              .column->convert_to_full_column_if_const();
            }
            execute_for_others_arg<ColVecData, T>(argument_columns,
                                                  assert_cast<ColumnString*>(result_column.get()),
                                                  argument_size, input_rows_count);
        }

        block.replace_by_position(result, std::move(result_column));
    }

    /// Two-argument path: argument_columns[0] is the format column and
    /// argument_columns[1] the value column. `is_consts[k]` tells whether column k was
    /// a constant that has been unpacked, in which case it is always read at the index
    /// chosen by index_check_const instead of the row number.
    /// Throws doris::Exception(INVALID_ARGUMENT) when fmt rejects a format string.
    template <typename ColVecData, typename T>
    void execute_for_two_argument(std::vector<ColumnPtr>& argument_columns,
                                  std::vector<uint8_t>& is_consts, ColumnString* result_data_column,
                                  size_t input_rows_count) const {
        const auto& format_column = assert_cast<const ColumnString&>(*argument_columns[0].get());
        const auto& value_column = assert_cast<const ColVecData&>(*argument_columns[1].get());
        for (size_t i = 0; i < input_rows_count; ++i) {
            auto format =
                    format_column.get_data_at(index_check_const(i, is_consts[0])).to_string_view();
            std::string res;
            try {
                if constexpr (std::is_same_v<ColVecData, ColumnString>) {
                    auto value = value_column.get_data_at(index_check_const(i, is_consts[1]));
                    res = fmt::format(format, value);
                } else {
                    // An unpacked constant value column must not be indexed by the row
                    // number; go through index_check_const exactly like the string branch.
                    auto value = value_column.get_data()[index_check_const(i, is_consts[1])];
                    res = fmt::format(format, value);
                }
            } catch (const std::exception& e) {
                throw doris::Exception(
                        ErrorCode::INVALID_ARGUMENT,
                        "Invalid Input argument \"{}\" of function format, error: {}", format,
                        e.what());
            }
            result_data_column->insert_data(res.data(), res.length());
        }
    }

    /// General path for three or more arguments. All columns are already full
    /// (non-constant) columns; argument_columns[0] is the format column and columns
    /// 1..argument_size-1 are the values, each read as `ColVecData`.
    /// Throws doris::Exception(INVALID_ARGUMENT) when fmt rejects a format string.
    template <typename ColVecData, typename T>
    void execute_for_others_arg(std::vector<ColumnPtr>& argument_columns,
                                ColumnString* result_data_column, size_t argument_size,
                                size_t input_rows_count) const {
        const auto& format_column = assert_cast<const ColumnString&>(*argument_columns[0].get());
        for (size_t i = 0; i < input_rows_count; ++i) {
            auto format = format_column.get_data_at(i).to_string_view();
            std::string res;
            // The number of values is only known at run time, so the row's arguments
            // are collected into a dynamic store and formatted with vformat.
            fmt::dynamic_format_arg_store<fmt::format_context> args;
            if constexpr (std::is_same_v<ColVecData, ColumnString>) {
                for (size_t col = 1; col < argument_size; ++col) {
                    const auto& arg_column_data =
                            assert_cast<const ColVecData&>(*argument_columns[col].get());
                    args.push_back(arg_column_data.get_data_at(i).to_string());
                }
            } else {
                for (size_t col = 1; col < argument_size; ++col) {
                    const auto& arg_column_data =
                            assert_cast<const ColVecData&>(*argument_columns[col].get()).get_data();
                    args.push_back(arg_column_data[i]);
                }
            }
            try {
                res = fmt::vformat(format, args);
            } catch (const std::exception& e) {
                throw doris::Exception(
                        ErrorCode::INVALID_ARGUMENT,
                        "Invalid Input argument \"{}\" of function format, error: {}", format,
                        e.what());
            }
            result_data_column->insert_data(res.data(), res.length());
        }
    }
};

// Registers the scalar functions defined in this file (`format_number` and `format`)
// with the given function factory.
void register_function_format(SimpleFunctionFactory& factory) {
factory.register_function<FunctionFormatNumber>();
factory.register_function<FunctionFormat>();
}

} // namespace doris::vectorized
60 changes: 60 additions & 0 deletions be/src/vec/functions/function_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,64 @@ struct StringASCII {
}
};

// Name tag carrying the SQL-visible name of the parse_data_size function.
struct NameParseDataSize {
static constexpr auto name = "parse_data_size";
};

// Byte multiplier for every unit suffix understood by ParseDataSize::parse_data_size.
// Each step is a factor of 2^10; the lookup is case-sensitive ("kB", not "KB").
static const std::map<std::string_view, Int128> UNITS = {
        {"B", static_cast<Int128>(1)},
        {"kB", static_cast<Int128>(1) << 10},
        {"MB", static_cast<Int128>(1) << 20},
        {"GB", static_cast<Int128>(1) << 30},
        {"TB", static_cast<Int128>(1) << 40},
        {"PB", static_cast<Int128>(1) << 50},
        {"EB", static_cast<Int128>(1) << 60},
        {"ZB", static_cast<Int128>(1) << 70},
        {"YB", static_cast<Int128>(1) << 80},
};

struct ParseDataSize {
using ReturnType = DataTypeInt128;
static constexpr auto TYPE_INDEX = TypeIndex::String;
using Type = String;
using ReturnColumnType = ColumnVector<Int128>;

static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
PaddedPODArray<Int128>& res) {
auto size = offsets.size();
res.resize(size);
for (int i = 0; i < size; ++i) {
const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
int str_size = offsets[i] - offsets[i - 1];
res[i] = parse_data_size(std::string_view(raw_str, str_size));
}
return Status::OK();
}

static Int128 parse_data_size(const std::string_view& dataSize) {
int digit_length = 0;
for (char c : dataSize) {
if (isdigit(c) || c == '.') {
digit_length++;
} else {
break;
}
}

if (digit_length == 0) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"Invalid Input argument \"{}\" of function parse_data_size",
dataSize);
}
// 123.45MB--->123.45 : MB
double value = std::stod(std::string(dataSize.substr(0, digit_length)));
auto unit = dataSize.substr(digit_length);
auto it = UNITS.find(unit);
if (it != UNITS.end()) {
return static_cast<__int128>(static_cast<long double>(it->second) * value);
} else {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"Invalid Input argument \"{}\" of function parse_data_size",
dataSize);
}
}
};

// Name tag carrying the SQL-visible name of the quote function.
struct NameQuote {
static constexpr auto name = "quote";
};
Expand Down Expand Up @@ -1127,6 +1185,7 @@ template <typename LeftDataType, typename RightDataType>
using StringFindInSetImpl = StringFunctionImpl<LeftDataType, RightDataType, FindInSetOp>;

// Function aliases, ready to be registered: each one pairs an implementation struct
// with the name tag that carries its function name.
using FunctionStringParseDataSize = FunctionUnaryToType<ParseDataSize, NameParseDataSize>;
using FunctionStringASCII = FunctionUnaryToType<StringASCII, NameStringASCII>;
using FunctionStringLength = FunctionUnaryToType<StringLengthImpl, NameStringLength>;
using FunctionCrc32 = FunctionUnaryToType<Crc32Impl, NameCrc32>;
Expand Down Expand Up @@ -1162,6 +1221,7 @@ using FunctionStringLPad = FunctionStringPad<StringLPad>;
using FunctionStringRPad = FunctionStringPad<StringRPad>;

void register_function_string(SimpleFunctionFactory& factory) {
factory.register_function<FunctionStringParseDataSize>();
factory.register_function<FunctionStringASCII>();
factory.register_function<FunctionStringLength>();
factory.register_function<FunctionCrc32>();
Expand Down
2 changes: 2 additions & 0 deletions be/src/vec/functions/simple_function_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ void register_function_multi_match(SimpleFunctionFactory& factory);
void register_function_split_by_regexp(SimpleFunctionFactory& factory);
void register_function_assert_true(SimpleFunctionFactory& factory);
void register_function_bit_test(SimpleFunctionFactory& factory);
void register_function_format(SimpleFunctionFactory& factory);

class SimpleFunctionFactory {
using Creator = std::function<FunctionBuilderPtr()>;
Expand Down Expand Up @@ -301,6 +302,7 @@ class SimpleFunctionFactory {
register_function_split_by_regexp(instance);
register_function_assert_true(instance);
register_function_bit_test(instance);
register_function_format(instance);
});
return instance;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.FirstSignificantSubdomain;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Floor;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Fmod;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Format;
import org.apache.doris.nereids.trees.expressions.functions.scalar.FormatNumber;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Fpow;
import org.apache.doris.nereids.trees.expressions.functions.scalar.FromBase64;
import org.apache.doris.nereids.trees.expressions.functions.scalar.FromDays;
Expand Down Expand Up @@ -334,6 +336,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.Nullable;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Nvl;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Overlay;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ParseDataSize;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ParseUrl;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Password;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Pi;
Expand Down Expand Up @@ -656,6 +659,8 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(FirstSignificantSubdomain.class, "first_significant_subdomain"),
scalar(Floor.class, "floor"),
scalar(Fmod.class, "fmod"),
scalar(Format.class, "format"),
scalar(FormatNumber.class, "format_number"),
scalar(Fpow.class, "fpow"),
scalar(FromBase64.class, "from_base64"),
scalar(FromDays.class, "from_days"),
Expand Down Expand Up @@ -824,6 +829,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(Overlay.class, "overlay"),
scalar(ParseUrl.class, "parse_url"),
scalar(Password.class, "password"),
scalar(ParseDataSize.class, "parse_data_size"),
scalar(Pi.class, "pi"),
scalar(Pmod.class, "pmod"),
scalar(Positive.class, "positive"),
Expand Down
Loading

0 comments on commit 6975cbb

Please sign in to comment.