Lh dict tree sort #10

Open · wants to merge 22 commits into base: main
4 changes: 3 additions & 1 deletion .gitignore
@@ -31,10 +31,12 @@
*.out
*.app

.cache
.vscode
build
data
third_party

# Benchmark Results
benchmark/read_order_percentile/*.json
benchmark/read_order_percentile/*.png
1 change: 1 addition & 0 deletions benchmark/delta_string_sort/.gitignore
@@ -0,0 +1 @@
log/
64 changes: 64 additions & 0 deletions benchmark/delta_string_sort/CMakeLists.txt
@@ -0,0 +1,64 @@
cmake_minimum_required(VERSION 3.20)
project(sort_example LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

option(DEBUG_WITH_ASAN "Enable address sanitizer in debug compile mode" ON)
option(ENABLE_GPERF "Enable gperf debug compile mode" OFF)

# Add the Arrow and Parquet paths to the CMake prefix
list(APPEND CMAKE_PREFIX_PATH ${PROJECT_SOURCE_DIR}/../../third_party/install/arrow)
message(STATUS ${CMAKE_PREFIX_PATH})

# Find the necessary packages
find_package(Arrow CONFIG REQUIRED)
find_package(Parquet CONFIG REQUIRED)
find_package(Threads REQUIRED)
find_package(glog CONFIG REQUIRED)
find_package(Fmt CONFIG REQUIRED)

set(CMAKE_CXX_FLAGS_DEBUG "-g -Wall -Wno-mismatched-tags -Wno-deprecated-declarations -Wno-deprecated-this-capture")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Wall -march=native -DNDEBUG")

if(DEBUG_WITH_ASAN)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address")
endif()

if(ENABLE_GPERF)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -pg")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -pg")
endif()

set(DEPS_INSTALL ${PROJECT_SOURCE_DIR}/../../third_party/install)
set(ARROW_INSTALL ${DEPS_INSTALL}/arrow)
include_directories(SYSTEM ${DEPS_INSTALL}/include ${ARROW_INSTALL}/include)
link_directories(${DEPS_INSTALL}/lib ${ARROW_INSTALL}/lib)

# Setup testing
enable_testing()
find_package(GTest CONFIG REQUIRED)
# add_library(GTest::GTest INTERFACE IMPORTED) target_link_libraries(GTest::GTest INTERFACE gtest_main gtest)

message(STATUS "CXX Flag RELWITHDEBINFO: ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
message(STATUS "CXX Flag DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}")

add_compile_definitions(PROJECT_SOURCE_DIR=\"${PROJECT_SOURCE_DIR}\")

add_subdirectory(src)
add_subdirectory(test)

add_executable(data_generator data_generator.cpp)
target_include_directories(data_generator PRIVATE ${PROJECT_SOURCE_DIR}/../../third_party/install/arrow/include)
target_link_directories(data_generator PRIVATE ${PROJECT_SOURCE_DIR}/../../third_party/install/arrow/lib)

# Link libraries to the data_generator target
target_link_libraries(
data_generator
PRIVATE Arrow::arrow_static
Parquet::parquet_static
Threads::Threads
glog::glog
fmt::fmt
)
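As a side note, the `PROJECT_SOURCE_DIR` compile definition above lets the C++ sources locate files relative to the source tree. A minimal sketch of such usage (the `data` subdirectory here is hypothetical, not taken from the PR):

```cpp
#include <iostream>
#include <string>

// PROJECT_SOURCE_DIR is injected via the add_compile_definitions call above.
int main() {
    const std::string data_dir = std::string(PROJECT_SOURCE_DIR) + "/data";
    std::cout << "Reading benchmark inputs from " << data_dir << '\n';
    return 0;
}
```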
35 changes: 35 additions & 0 deletions benchmark/delta_string_sort/ReadMe.md
@@ -0,0 +1,35 @@
# How to compile
```bash
# Make sure you run this from the sortproto directory
mkdir build
cd build
cmake -S ..
make
```


# UT Benchmark
The benchmark currently runs alongside the GTest-based correctness tests. It is not hooked up to Parquet yet and operates on plain std::string values.
Data generation works as follows: pick a random prefix length pre_len in [0...max_len], then a random suffix length in [0...max_len - pre_len], and finally generate the random suffix string; a sketch of this scheme follows.
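A minimal C++ sketch of that generation scheme. The names are illustrative, and the assumption that each prefix is shared with the previous string (as in delta encoding) is ours, not confirmed by the PR:

```cpp
#include <algorithm>
#include <random>
#include <string>
#include <vector>

// Sketch: each string keeps a random-length prefix of the previous string,
// then appends a randomly generated suffix, up to max_len total characters.
std::vector<std::string> generate_strings(std::size_t str_num, std::size_t max_len) {
    std::mt19937_64 rng(42);
    std::uniform_int_distribution<int> ch('a', 'z');
    std::vector<std::string> out;
    out.reserve(str_num);
    std::string prev;
    for (std::size_t i = 0; i < str_num; ++i) {
        std::size_t pre_len = rng() % (std::min(prev.size(), max_len) + 1);
        std::size_t suf_len = rng() % (max_len - pre_len + 1);
        std::string s = prev.substr(0, pre_len);
        for (std::size_t j = 0; j < suf_len; ++j)
            s.push_back(static_cast<char>(ch(rng)));
        out.push_back(s);
        prev = std::move(s);
    }
    return out;
}
```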

Benchmark 1: brute-force comparison, byte by byte via operator[] (a comparator sketch follows the results).
```
==== Test 1
generate data, str_num: 1000000, str_max_len: 50
decode + std::sort time: 1649ms
insert time: 1404ms
output time: 897ms
==== Test 2
generate data, str_num: 10000000, str_max_len: 200
decode + std::sort time: 47653ms
insert time: 15925ms
output time: 16122ms
==== Test 3
generate data, str_num: 50000000, str_max_len: 200
decode + std::sort time: 338991ms
insert time: 96187ms
output time: 103141ms
```
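For reference, a minimal sketch of a byte-by-byte comparator like the one benchmark 1 describes (the PR's actual comparator may differ):

```cpp
#include <string>

// Compare one character at a time via operator[]. The unsigned char casts
// keep the ordering consistent with memcmp for bytes above 0x7F.
bool less_bytewise(const std::string& a, const std::string& b) {
    const std::size_t n = a.size() < b.size() ? a.size() : b.size();
    for (std::size_t i = 0; i < n; ++i) {
        if (a[i] != b[i])
            return static_cast<unsigned char>(a[i]) < static_cast<unsigned char>(b[i]);
    }
    return a.size() < b.size();
}
```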


Benchmark 2: uint64 comparison, 8 bytes at a time (sketched below).
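A sketch of what such a uint64 comparison might look like, assuming a little-endian host and a GCC/Clang toolchain (hence __builtin_bswap64); this is our illustration, not the PR's code:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <string>

// Compare 8 bytes at a time. memcpy avoids unaligned loads; byte-swapping
// makes integer comparison match lexicographic (memcmp) order on
// little-endian hosts.
bool less_uint64(const std::string& a, const std::string& b) {
    const std::size_t n = std::min(a.size(), b.size());
    std::size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        std::uint64_t wa, wb;
        std::memcpy(&wa, a.data() + i, 8);
        std::memcpy(&wb, b.data() + i, 8);
        if (wa != wb)
            return __builtin_bswap64(wa) < __builtin_bswap64(wb);
    }
    for (; i < n; ++i)
        if (a[i] != b[i])
            return static_cast<unsigned char>(a[i]) < static_cast<unsigned char>(b[i]);
    return a.size() < b.size();
}
```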
39 changes: 39 additions & 0 deletions benchmark/delta_string_sort/check_encoding.py
@@ -0,0 +1,39 @@
import pyarrow.parquet as pq


def check_column_encodings(parquet_file_path):
print(f"Checking column encodings in '{parquet_file_path}'...")
# Open the Parquet file
parquet_file = pq.ParquetFile(parquet_file_path)

# Get the file metadata
metadata = parquet_file.metadata

# Loop over each row group in the file
for row_group_idx in range(metadata.num_row_groups):
print(f"Row Group {row_group_idx}:")

# Get the row group metadata
row_group = metadata.row_group(row_group_idx)

# Loop over each column in the row group
for column_idx in range(row_group.num_columns):
# Get the column metadata
column = row_group.column(column_idx)
column_name = column.path_in_schema
encoding = column.encodings

# Print the column name and encoding methods used
print(
f" Column '{column_name}' uses the following encodings: {encoding}. type: {column.physical_type}.")
print(f"{column}")

print("Encoding check complete.")


# Example usage
# Replace parquet_file_path with the path to your Parquet file; earlier
# candidate paths are kept here commented out.
# parquet_file_path = './data/input-ty2-2e6-100.parquet'
# parquet_file_path = './data/input-ty2-20-150.parquet'
parquet_file_path = './data/input-20-150.parquet'
check_column_encodings(parquet_file_path)