
WIP: DecayRange #486

Draft · wants to merge 2 commits into main
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -211,6 +211,8 @@ endif()

target_compile_definitions(snmalloc INTERFACE $<$<BOOL:CONST_QUALIFIED_MALLOC_USABLE_SIZE>:MALLOC_USABLE_SIZE_QUALIFIER=const>)

+ target_compile_definitions(snmalloc INTERFACE SNMALLOC_TRACING)

# In debug and CI builds, link the backtrace library so that we can get stack
# traces on errors.
find_package(Backtrace)
12 changes: 8 additions & 4 deletions src/backend/backend.h
@@ -3,6 +3,7 @@
#include "../pal/pal.h"
#include "commitrange.h"
#include "commonconfig.h"
#include "decayrange.h"
#include "empty_range.h"
#include "globalrange.h"
#include "largebuddyrange.h"
@@ -148,9 +149,10 @@ namespace snmalloc
using GlobalR = GlobalRange<StatsR>;

# ifdef SNMALLOC_META_PROTECTED
+ using CommittedRange =
+   DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
// Source for object allocations
- using ObjectRange =
-   LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>;
+ using ObjectRange = LargeBuddyRange<CommittedRange, 21, 21, Pagemap>;
// Set up protected range for metadata
using SubR = CommitRange<SubRange<GlobalR, DefaultPal, 6>, DefaultPal>;
using MetaRange =
@@ -159,8 +161,10 @@
# else
// Source for object allocations and metadata
// No separation between the two
- using ObjectRange = SmallBuddyRange<
-   LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>>;
+ using CommittedRange =
+   DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
+ using ObjectRange =
+   SmallBuddyRange<LargeBuddyRange<CommittedRange, 21, 21, Pagemap>>;
using GlobalMetaRange = GlobalRange<ObjectRange>;
# endif
#endif
351 changes: 351 additions & 0 deletions src/backend/decayrange.h
@@ -0,0 +1,351 @@
#pragma once

#include "../ds/ptrwrap.h"
#include "../pal/pal_ds.h"
#include "largebuddyrange.h"

namespace snmalloc
{
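/**
* A singly-linked list of chunks threaded through the chunks
* themselves: the next pointer is read and written inside each chunk
* via the representation type Rep (e.g. BuddyChunkRep below).
*/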
template<SNMALLOC_CONCEPT(RBRep) Rep>
class RepList
{
uintptr_t head = 0;

RepList(uintptr_t head) : head(head) {}

public:
constexpr RepList() = default;

[[nodiscard]] bool is_empty() const
{
return head == 0;
}

RepList get_next()
{
SNMALLOC_ASSERT(!is_empty());
auto next_field = &(Rep::ref(false, head));
auto next = Rep::get(next_field);
return {next};
}

capptr::Chunk<void> get_capability()
{
return capptr::Chunk<void>(reinterpret_cast<void*>(head));
}

RepList cons(capptr::Chunk<void> new_head_cap)
{
auto new_head = new_head_cap.unsafe_uintptr();
auto field = &(Rep::ref(false, new_head));
Rep::set(field, head);
return {new_head};
}

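// Applies f to every element of the list. The next link is read
// before f is invoked, so f may safely reuse or overwrite the chunk.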
template<typename F>
void forall(F f)
{
auto curr = *this;
while (!curr.is_empty())
{
auto next = curr.get_next();

f(curr.get_capability());

curr = next;
}
}
};

/**
* Concurrent Stack
*
* This stack supports the following clients
* (push|pop)* || pop_all* || ... || pop_all*
*
* That is, a single thread may push and pop, while other threads
* may call pop_all. If pop_all returns a value, it returns the entire
* stack; however, it may return an empty list if it races with either
* a push or a pop.
*
* The primary use case is single-threaded access, where other threads
* can attempt to steal all the values.
*/
template<SNMALLOC_CONCEPT(RBRep) Rep>
class RepStack
{
static constexpr auto empty = RepList<Rep>{};

private:
alignas(CACHELINE_SIZE) std::atomic<RepList<Rep>> stack{};

RepList<Rep> take()
{
if (stack.load(std::memory_order_relaxed).is_empty())
return empty;
return stack.exchange(empty, std::memory_order_acquire);
}

void replace(RepList<Rep> new_head)
{
SNMALLOC_ASSERT(stack.load().is_empty());
stack.store(new_head, std::memory_order_release);
}

public:
constexpr RepStack() = default;

void push(capptr::Chunk<void> new_head_cap)
{
auto old_head = take();
auto new_head = old_head.cons(new_head_cap);
replace(new_head);
}

capptr::Chunk<void> pop()
{
auto old_head = take();
if (old_head.is_empty())
return nullptr;

auto next = old_head.get_next();
replace(next);

return old_head.get_capability();
}

RepList<Rep> pop_all()
{
return take();
}
};
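
// Illustrative usage sketch (`stack`, `chunk`, `parent` and `size`
// are placeholders, not part of this change): the owning thread
// pushes and pops, while the decay tick on another thread drains
// everything with pop_all:
//
//   RepStack<BuddyChunkRep<Pagemap>> stack;
//   stack.push(chunk);     // owning thread caches a chunk
//   auto c = stack.pop();  // owning thread takes it back
//   stack.pop_all().forall(
//     [&](auto cap) { parent->dealloc_range(cap, size); });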

/**
* This range slowly returns memory to the parent range.
* It caches memory locally, and memory that has not been reused for
* some time is returned to the parent range.
*/

template<typename ParentRange, typename PAL, typename Pagemap>
class DecayRange
{
/**
* The number of slab sizes that can be provided.
*/
static constexpr size_t NUM_SLAB_SIZES = Pal::address_bits - MIN_CHUNK_BITS;

/**
* Number of free stacks per chunk size that each allocator will use.
* For performance this should ideally be a power of 2. We will return
* to the central pool anything that has not been used in the last
* NUM_EPOCHS - 1 epochs, where consecutive epochs are separated by
* DecayMemoryTimerObject::PERIOD.
* I.e. if the period is 500ms and the number of epochs is 4, then
* anything not used for the last 1500-2000ms is returned to the
* central pool.
*/
static constexpr size_t NUM_EPOCHS = 4;
static_assert(bits::is_pow2(NUM_EPOCHS), "Code assumes power of two.");

/**
* Stack of ranges that have been returned for reuse.
*/
ModArray<
NUM_SLAB_SIZES,
ModArray<NUM_EPOCHS, RepStack<BuddyChunkRep<Pagemap>>>>
chunk_stack;

typename ParentRange::State parent{};

/**
* The current epoch in which deallocated chunks are placed, and the
* first place we look when allocating chunks.
*/
static inline // alignas(CACHELINE_SIZE)
std::atomic<size_t>
epoch{0};

/**
* Flag to ensure one-shot registration with the PAL.
*/
static inline std::atomic_bool registered_timer{false};

std::atomic_bool registered_local{false};

/**
* All activated DecayRanges.
*/
static inline std::atomic<DecayRange*> all_local{nullptr};

DecayRange* all_local_next{nullptr};

static void handle_decay_tick()
{
static_assert(
ParentRange::ConcurrencySafe,
"Parent must be concurrency safe, as dealloc_range is called here on "
"potentially another thread's state.");
auto new_epoch = (epoch + 1) % NUM_EPOCHS;
// Flush old index for all threads.
auto curr = all_local.load(std::memory_order_acquire);
while (curr != nullptr)
{
for (size_t sc = 0; sc < NUM_SLAB_SIZES; sc++)
{
auto old_stack = curr->chunk_stack[sc][new_epoch].pop_all();

old_stack.forall([curr, sc](auto cap) {
curr->parent->dealloc_range(cap, MIN_CHUNK_SIZE << sc);
});
}
curr = curr->all_local_next;
}

// Advance current index
epoch = new_epoch;
}

class DecayMemoryTimerObject : public PalTimerObject
{
/**
* Method for callback object to perform lazy decommit.
*/
static void process(PalTimerObject*)
{
#ifdef SNMALLOC_TRACING
message<1024>("DecayRange::handle_decay_tick timer");
#endif
handle_decay_tick();
}

// Specify that we run the decay tick every 500ms.
static constexpr size_t PERIOD = 500;

public:
constexpr DecayMemoryTimerObject() : PalTimerObject(&process, PERIOD) {}
};

static inline DecayMemoryTimerObject timer_object;

public:
class State
{
DecayRange commit_range{};

public:
constexpr State() = default;

DecayRange* operator->()
{
return &commit_range;
}
};

static constexpr bool Aligned = ParentRange::Aligned;

static constexpr bool ConcurrencySafe = false;

constexpr DecayRange() = default;

capptr::Chunk<void> alloc_range(size_t size)
{
// Check local cache

if constexpr (pal_supports<Time, PAL>)
{
auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
// Try local cache of chunks first
for (size_t e = 0; e < NUM_EPOCHS; e++)
{
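// (epoch - e) walks backwards from the current epoch, so the most
// recently cached chunks are found first.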
auto p = chunk_stack[slab_sizeclass][(epoch - e) % NUM_EPOCHS].pop();

if (p != nullptr)
{
#ifdef SNMALLOC_TRACING
message<1024>(
"DecayRange::alloc_range: returning from local cache: {} on {}",
address_cast(p),
this);
#endif
return p;
}
}
}

// Loop to possibly flush all the other local threads' caches.
// Note that flushing passes to the parent range, which may consolidate
// blocks and thus be able to service this request.
// Alternatively, we could implement stealing, but that wouldn't
// be able to consolidate.
capptr::Chunk<void> result;
for (auto i = NUM_EPOCHS; i > 0; i--)
{
// Nothing in local cache, so allocate from parent.
result = parent->alloc_range(size);
if (result != nullptr)
{
#ifdef SNMALLOC_TRACING
message<1024>(
"DecayRange::alloc_range: returning from parent: {} on {}",
address_cast(result),
this);
#endif
return result;
}

// We have run out of memory.
// Try to free some memory to the parent.
#ifdef SNMALLOC_TRACING
message<1024>("DecayRange::handle_decay_tick OOM");
#endif
handle_decay_tick();
}

// Last try.
result = parent->alloc_range(size);

#ifdef SNMALLOC_TRACING
message<1024>(
"DecayRange::alloc_range: returning from parent last try: {} on {}",
address_cast(result),
this);
#endif

return result;
}

void dealloc_range(capptr::Chunk<void> base, size_t size)
{
if constexpr (!pal_supports<Time, PAL>)
{
parent->dealloc_range(base, size);
return;
}

if (!registered_timer.exchange(true))
{
// Register with the PAL.
PAL::register_timer(&timer_object);
}

// Check that this state is registered with the all_local list.
if (!registered_local.exchange(true))
{
// Add to the list of local states.
auto* head = all_local.load();
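// Lock-free push onto the global list: compare_exchange_strong
// refreshes head on failure, so we retry until this state is
// linked at the front.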
do
{
all_local_next = head;
} while (!all_local.compare_exchange_strong(head, this));
}

auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
// Add to local cache.
#ifdef SNMALLOC_TRACING
message<1024>(
"DecayRange::dealloc_range: returning to local cache: {} on {}",
address_cast(base),
this);
#endif
chunk_stack[slab_sizeclass][epoch].push(base);
}
};
} // namespace snmalloc