Use pool allocator in QueryHeap #6943

Open — SiarheiFedartsou wants to merge 49 commits into master from sf-pool-alloc.

Commits (49, all by SiarheiFedartsou):
4e5bf05  Jun 11, 2024  Try to use boost::fast_pool_allocator in QueryHeap
8c7b80e  Jun 11, 2024  wip
3bd897f  Jun 12, 2024  wip
0723dc0  Jul 3, 2024   wip
bbdac63  Jul 3, 2024   wip
611a3c2  Jul 3, 2024   wip
f62e917  Jul 6, 2024   wip
5f166a5  Jul 6, 2024   wip
8df282b  Jul 9, 2024   wip
c578698  Jul 9, 2024   wip
d1f04ab  Jul 9, 2024   wip
233a756  Jul 9, 2024   wip
53032e5  Jul 9, 2024   wip
49f875c  Jul 10, 2024  wip
7eb2d93  Jul 10, 2024  wip
13448e4  Jul 10, 2024  wip
f9358ed  Jul 10, 2024  wip
1037256  Jul 10, 2024  wip
6f04aa9  Jul 10, 2024  wip
6d2fc45  Jul 10, 2024  wip
abbe5e2  Jul 11, 2024  wip
7337771  Jul 11, 2024  wip
058c26e  Jul 11, 2024  wip
c5aae51  Jul 11, 2024  wip
4d940ab  Jul 11, 2024  wip
3691f90  Jul 11, 2024  wip
18b3c5f  Jul 11, 2024  wip
9ef1911  Jul 11, 2024  wip
a90e9dd  Jul 11, 2024  wip
434cab4  Jul 11, 2024  wip
a18ad91  Jul 11, 2024  wip
69bc6c0  Jul 11, 2024  wip
fb8182a  Jul 11, 2024  wip
270f187  Jul 11, 2024  wip
e9cdb31  Jul 11, 2024  wip
ac05d36  Jul 11, 2024  Update bench.cpp
e045dea  Jul 11, 2024  Use pool in std::unordered_map
21f53ed  Jul 11, 2024  Use pool in std::unordered_map
1436e96  Jul 12, 2024  wip
f18791a  Jul 12, 2024  wip
81d128b  Jul 12, 2024  wip
fdd1ca0  Jul 12, 2024  wip
70d67d0  Jul 12, 2024  wip
8bd26dc  Jul 12, 2024  wip
3096440  Jul 12, 2024  wip
6090387  Jul 12, 2024  wip
9ce059f  Jul 12, 2024  wip
bff349f  Jul 13, 2024  wip
d47012a  Jul 28, 2024  Merge branch 'master' into sf-pool-alloc
79 changes: 34 additions & 45 deletions .github/workflows/osrm-backend.yml
@@ -653,7 +653,7 @@ jobs:
   benchmarks:
     if: github.event_name == 'pull_request'
     needs: [format-taginfo-docs]
-    runs-on: ubuntu-24.04
+    runs-on: self-hosted

[Review comment — SiarheiFedartsou (Member, Author), on `runs-on: self-hosted`]
Copy-paste from #6975, just to make benchmark results more stable; I'll remove it before merge.

     env:
       CCOMPILER: clang-16
       CXXCOMPILER: clang++-16
@@ -664,37 +664,17 @@ jobs:
       GITHUB_REPOSITORY: ${{ github.repository }}
       RUN_BIG_BENCHMARK: ${{ contains(github.event.pull_request.labels.*.name, 'Performance') }}
     steps:
-      - name: Enable data.osm.pbf cache
-        if: ${{ ! env.RUN_BIG_BENCHMARK }}
-        uses: actions/cache@v4
-        with:
-          path: ~/data.osm.pbf
-          key: v1-data-osm-pbf
-          restore-keys: |
-            v1-data-osm-pbf
-      - name: Enable compiler cache
-        uses: actions/cache@v4
-        with:
-          path: ~/.ccache
-          key: v1-ccache-benchmarks-${{ github.sha }}
-          restore-keys: |
-            v1-ccache-benchmarks-
-      - name: Enable Conan cache
-        uses: actions/cache@v4
-        with:
-          path: ~/.conan
-          key: v1-conan-benchmarks-${{ github.sha }}
-          restore-keys: |
-            v1-conan-benchmarks-
       - name: Checkout PR Branch
         uses: actions/checkout@v4
         with:
          ref: ${{ github.head_ref }}
          path: pr
-      - name: Install dependencies
-        run: |
-          python3 -m pip install "conan<2.0.0" "requests==2.31.0" "numpy==1.26.4" --break-system-packages
-          sudo apt-get update -y && sudo apt-get install ccache
+      - name: Activate virtualenv
+        run: |
+          python3 -m venv .venv
+          source .venv/bin/activate
+          echo PATH=$PATH >> $GITHUB_ENV
+          pip install "conan<2.0.0" "requests==2.31.0" "numpy==1.26.4"
       - name: Prepare data
         run: |
           if [ "$RUN_BIG_BENCHMARK" = "true" ]; then
@@ -740,32 +720,41 @@ jobs:
           make -C test/data
       # we run benchmarks in tmpfs to avoid impact of disk IO
       - name: Create folder for tmpfs
-        run: mkdir -p /opt/benchmarks
+        run: |
+          # if by any chance it was mounted before (e.g. because a previous job failed), unmount it
+          sudo umount ~/benchmarks | true
+          rm -rf ~/benchmarks
+          mkdir -p ~/benchmarks
+      # see https://llvm.org/docs/Benchmarking.html
       - name: Run PR Benchmarks
         run: |
-          sudo mount -t tmpfs -o size=4g none /opt/benchmarks
-          cp -rf pr/build /opt/benchmarks/build
-          mkdir -p /opt/benchmarks/test
-          cp -rf pr/test/data /opt/benchmarks/test/data
-          cp -rf pr/profiles /opt/benchmarks/profiles
-
-          ./pr/scripts/ci/run_benchmarks.sh -f /opt/benchmarks -r $(pwd)/pr_results -s $(pwd)/pr -b /opt/benchmarks/build -o ~/data.osm.pbf -g ~/gps_traces.csv
-          sudo umount /opt/benchmarks
+          sudo cset shield -c 2-3 -k on
+          sudo mount -t tmpfs -o size=4g none ~/benchmarks
+          cp -rf pr/build ~/benchmarks/build
+          mkdir -p ~/benchmarks/test
+          cp -rf pr/test/data ~/benchmarks/test/data
+          cp -rf pr/profiles ~/benchmarks/profiles
+
+          sudo cset shield --exec -- ./pr/scripts/ci/run_benchmarks.sh -f ~/benchmarks -r $(pwd)/pr_results -s $(pwd)/pr -b ~/benchmarks/build -o ~/data.osm.pbf -g ~/gps_traces.csv
+          sudo umount ~/benchmarks
+          sudo cset shield --reset
       - name: Run Base Benchmarks
         run: |
-          sudo mount -t tmpfs -o size=4g none /opt/benchmarks
-          cp -rf base/build /opt/benchmarks/build
-          mkdir -p /opt/benchmarks/test
-          cp -rf base/test/data /opt/benchmarks/test/data
-          cp -rf base/profiles /opt/benchmarks/profiles
+          sudo cset shield -c 2-3 -k on
+          sudo mount -t tmpfs -o size=4g none ~/benchmarks
+          cp -rf base/build ~/benchmarks/build
+          mkdir -p ~/benchmarks/test
+          cp -rf base/test/data ~/benchmarks/test/data
+          cp -rf base/profiles ~/benchmarks/profiles

           # TODO: remove this when the base branch has this file at the needed location
-          if [ ! -f /opt/benchmarks/test/data/portugal_to_korea.json ]; then
-            cp base/src/benchmarks/portugal_to_korea.json /opt/benchmarks/test/data/portugal_to_korea.json
+          if [ ! -f ~/benchmarks/test/data/portugal_to_korea.json ]; then
+            cp base/src/benchmarks/portugal_to_korea.json ~/benchmarks/test/data/portugal_to_korea.json
           fi
           # we intentionally use scripts from the PR branch to be able to update them and see results in the same PR
-          ./pr/scripts/ci/run_benchmarks.sh -f /opt/benchmarks -r $(pwd)/base_results -s $(pwd)/pr -b /opt/benchmarks/build -o ~/data.osm.pbf -g ~/gps_traces.csv
-          sudo umount /opt/benchmarks
+          sudo cset shield --exec -- ./pr/scripts/ci/run_benchmarks.sh -f ~/benchmarks -r $(pwd)/base_results -s $(pwd)/pr -b ~/benchmarks/build -o ~/data.osm.pbf -g ~/gps_traces.csv
+          sudo umount ~/benchmarks
+          sudo cset shield --reset
       - name: Post Benchmark Results
         run: |
           python3 pr/scripts/ci/post_benchmark_results.py base_results pr_results
4 changes: 2 additions & 2 deletions include/customizer/cell_customizer.hpp
@@ -116,8 +116,8 @@ class CellCustomizer
                    const std::vector<bool> &allowed_nodes,
                    CellMetric &metric) const
    {
-        Heap heap_exemplar(graph.GetNumberOfNodes());
-        HeapPtr heaps(heap_exemplar);
+        const auto number_of_nodes = graph.GetNumberOfNodes();
+        HeapPtr heaps([number_of_nodes] { return Heap{number_of_nodes}; });

[Review comment — SiarheiFedartsou (Member, Author)]
This used to rely on TBB copying heap_exemplar into each thread. Since QueryHeap now uses a thread-local allocator, that copy is unsafe: the copied QueryHeap contains a PoolAllocator that still points at the original thread's memory pool, so the copy would use that pool from a different thread. With this change, each thread constructs its own heap.

        for (std::size_t level = 1; level < partition.GetNumberOfLevels(); ++level)
        {
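For illustration, a minimal, self-contained sketch of the two construction patterns. It assumes HeapPtr is an alias for tbb::enumerable_thread_specific<Heap> — an assumption consistent with the exemplar/factory constructors used here, but not shown in this diff — and Heap is a stand-in struct, not the real QueryHeap:

#include <tbb/enumerable_thread_specific.h>
#include <cstddef>

struct Heap
{
    explicit Heap(std::size_t nodes) : size(nodes) {}
    std::size_t size;
};

int main()
{
    const std::size_t number_of_nodes = 1000;

    // Old pattern: TBB copies one exemplar into every worker thread. With the new
    // thread-local PoolAllocator inside QueryHeap, every copy would keep pointing
    // at the constructing thread's memory pool — a cross-thread hazard.
    Heap heap_exemplar(number_of_nodes);
    tbb::enumerable_thread_specific<Heap> copied_heaps(heap_exemplar);

    // New pattern from this PR: the factory lambda runs on each worker thread, so
    // every heap is built (and allocates) on the thread that actually uses it.
    tbb::enumerable_thread_specific<Heap> per_thread_heaps([number_of_nodes]
                                                           { return Heap{number_of_nodes}; });
}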
158 changes: 158 additions & 0 deletions include/util/pool_allocator.hpp
@@ -0,0 +1,158 @@
#pragma once

#include <algorithm>
#include <array>
#include <bit>
#include <boost/assert.hpp>
#include <cstddef>
#include <cstdlib>
#include <memory>
#include <mutex>
#include <new>
#include <vector>

namespace osrm::util
{

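// Rounds n up to the next multiple of `alignment` (which must be a power of two).
// Example: align_up(13, 8) == 16.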
inline size_t align_up(size_t n, size_t alignment)
{
return (n + alignment - 1) & ~(alignment - 1);
}

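// Returns the smallest e such that 2^e >= n, i.e. the free-list index for an
// allocation of n bytes. Example: get_next_power_of_two_exponent(48) == 6 (2^6 == 64).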
inline size_t get_next_power_of_two_exponent(size_t n)
{
BOOST_ASSERT(n > 0);
return (sizeof(size_t) * 8) - std::countl_zero(n - 1);
}

class MemoryPool
{
private:
constexpr static size_t MIN_CHUNK_SIZE_BYTES = 4096;

public:
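// One pool per thread: each thread allocates from, and returns blocks to, its own instance.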
static std::shared_ptr<MemoryPool> instance()
{
static thread_local std::shared_ptr<MemoryPool> instance;
if (!instance)
{
instance = std::shared_ptr<MemoryPool>(new MemoryPool());
}
return instance;
}

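// Allocation strategy: round the request up to the next power of two and serve it
// from the matching free list; if that list is empty, carve a new block out of the
// current chunk (bump-pointer allocation), malloc'ing a fresh chunk when needed.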
template <typename T> T *allocate(std::size_t items_count)
{
static_assert(alignof(T) <= alignof(std::max_align_t),
"Type is over-aligned for this allocator.");

size_t free_list_index = get_next_power_of_two_exponent(items_count * sizeof(T));
auto &free_list = free_lists_[free_list_index];
if (free_list.empty())
{
size_t block_size_in_bytes = size_t{1} << free_list_index;
block_size_in_bytes = align_up(block_size_in_bytes, alignof(std::max_align_t));
// check if there is space in current memory chunk
if (current_chunk_left_bytes_ < block_size_in_bytes)
{
allocate_chunk(block_size_in_bytes);
}

free_list.push_back(current_chunk_ptr_);
current_chunk_left_bytes_ -= block_size_in_bytes;
current_chunk_ptr_ += block_size_in_bytes;
}
auto ptr = reinterpret_cast<T *>(free_list.back());
free_list.pop_back();
return ptr;
}

template <typename T> void deallocate(T *p, std::size_t n) noexcept
{
size_t free_list_index = get_next_power_of_two_exponent(n * sizeof(T));
// NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
free_lists_[free_list_index].push_back(reinterpret_cast<void *>(p));
}

~MemoryPool()
{
for (auto chunk : chunks_)
{
// NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
std::free(chunk);
}
}

private:
MemoryPool() = default;
MemoryPool(const MemoryPool &) = delete;
MemoryPool &operator=(const MemoryPool &) = delete;

void allocate_chunk(size_t bytes)
{
auto chunk_size = std::max(bytes, MIN_CHUNK_SIZE_BYTES);
// NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
void *chunk = std::malloc(chunk_size);
if (!chunk)
{
throw std::bad_alloc();
}
chunks_.push_back(chunk);
current_chunk_ptr_ = static_cast<uint8_t *>(chunk);
current_chunk_left_bytes_ = chunk_size;
}

// we have 64 free lists, one for each possible power of two
std::array<std::vector<void *>, sizeof(std::size_t) * 8> free_lists_;

// list of allocated memory chunks, we don't free them until the pool is destroyed
std::vector<void *> chunks_;

uint8_t *current_chunk_ptr_ = nullptr;
size_t current_chunk_left_bytes_ = 0;
};

template <typename T> class PoolAllocator
{
public:
using value_type = T;

PoolAllocator() noexcept : pool(MemoryPool::instance()) {}

template <typename U>
PoolAllocator(const PoolAllocator<U> &) noexcept : pool(MemoryPool::instance())
{
}

template <typename U> struct rebind
{
using other = PoolAllocator<U>;
};

T *allocate(std::size_t n) { return pool->allocate<T>(n); }

void deallocate(T *p, std::size_t n) noexcept { pool->deallocate<T>(p, n); }

PoolAllocator(const PoolAllocator &) = default;
PoolAllocator &operator=(const PoolAllocator &) = default;
PoolAllocator(PoolAllocator &&) noexcept = default;
PoolAllocator &operator=(PoolAllocator &&) noexcept = default;

private:
// using shared_ptr guarantees that memory pool won't be destroyed before all allocators using
// it (important if there are static instances of PoolAllocator)
std::shared_ptr<MemoryPool> pool;
};
template <typename T, typename U>
bool operator==(const PoolAllocator<T> &, const PoolAllocator<U> &)
{
return true;
}

template <typename T, typename U>
bool operator!=(const PoolAllocator<T> &, const PoolAllocator<U> &)
{
return false;
}

} // namespace osrm::util
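To show how the pieces fit together, a minimal, hypothetical usage sketch (not part of the PR — the container types are standard, only PoolAllocator comes from this header):

#include "util/pool_allocator.hpp"

#include <unordered_map>
#include <vector>

using osrm::util::PoolAllocator;

int main()
{
    // Every allocation below is served by the calling thread's MemoryPool and,
    // on deallocation, recycled through its power-of-two free lists.
    std::vector<int, PoolAllocator<int>> values;
    values.reserve(1000); // one pool allocation: 4000 bytes, rounded up to the 4096-byte size class

    std::unordered_map<int,
                       int,
                       std::hash<int>,
                       std::equal_to<int>,
                       PoolAllocator<std::pair<const int, int>>>
        map;
    map[42] = 1; // bucket arrays and nodes come from the pool too

    // Freed blocks return to the thread-local free lists; the underlying chunks
    // are released only when the thread's MemoryPool itself is destroyed.
    return 0;
}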
18 changes: 13 additions & 5 deletions include/util/query_heap.hpp
@@ -1,17 +1,16 @@
 #ifndef OSRM_UTIL_QUERY_HEAP_HPP
 #define OSRM_UTIL_QUERY_HEAP_HPP

+#include "util/pool_allocator.hpp"
+#include <algorithm>
 #include <boost/assert.hpp>
 #include <boost/heap/d_ary_heap.hpp>

-#include <algorithm>
 #include <cstdint>
 #include <limits>
 #include <map>
 #include <optional>
 #include <unordered_map>
 #include <vector>

 namespace osrm::util
 {
@@ -56,7 +55,11 @@ template <typename NodeID, typename Key> class UnorderedMapStorage
     void Clear() { nodes.clear(); }

   private:
-    std::unordered_map<NodeID, Key> nodes;
+    template <typename K, typename V>
+    using UnorderedMap = std::
+        unordered_map<K, V, std::hash<K>, std::equal_to<K>, PoolAllocator<std::pair<const K, V>>>;
+
+    UnorderedMap<NodeID, Key> nodes;
 };

template <typename NodeID,
@@ -142,10 +145,12 @@ class QueryHeap
             return weight > other.weight;
         }
     };
+
     using HeapContainer = boost::heap::d_ary_heap<HeapData,
                                                   boost::heap::arity<4>,
                                                   boost::heap::mutable_<true>,
-                                                  boost::heap::compare<std::greater<HeapData>>>;
+                                                  boost::heap::compare<std::greater<HeapData>>,
+                                                  boost::heap::allocator<PoolAllocator<HeapData>>>;
     using HeapHandle = typename HeapContainer::handle_type;

   public:
@@ -160,6 +165,9 @@ class QueryHeap
         Data data;
     };

+    QueryHeap(const QueryHeap &other) = delete;
+    QueryHeap(QueryHeap &&other) = delete;
+
     template <typename... StorageArgs> explicit QueryHeap(StorageArgs... args) : node_index(args...)
     {
         Clear();
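The deleted copy and move constructors added above turn the cross-thread copying described in the cell_customizer comment into a compile-time error rather than a latent bug. A hypothetical sketch, where ConcreteHeap stands for any concrete QueryHeap instantiation:

// ConcreteHeap is a placeholder for a concrete util::QueryHeap instantiation.
ConcreteHeap heap(number_of_nodes);      // fine: built on the thread that uses it
// ConcreteHeap copy = heap;             // error: copy constructor is deleted
// ConcreteHeap moved = std::move(heap); // error: move constructor is deleted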