Merge pull request #4299 from Sonicadvance1/fix_48bit_wine

FEX: Allocate a VMA allocator when running on a 48-bit VA

lioncash authored Jan 29, 2025
2 parents fb2a59a + 3a33f55 commit c2f8b5b
Showing 7 changed files with 173 additions and 33 deletions.
17 changes: 13 additions & 4 deletions FEXCore/Source/Utils/Allocator.cpp
@@ -112,14 +112,18 @@ void ReenableSBRKAllocations(void* Ptr) {

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
void SetupHooks() {
Alloc64 = Alloc::OSAllocator::Create64BitAllocator();
static void AssignHookOverrides() {
SetJemallocMmapHook(FEX_mmap);
SetJemallocMunmapHook(FEX_munmap);
FEXCore::Allocator::mmap = FEX_mmap;
FEXCore::Allocator::munmap = FEX_munmap;
}

void SetupHooks() {
Alloc64 = Alloc::OSAllocator::Create64BitAllocator();
AssignHookOverrides();
}

void ClearHooks() {
SetJemallocMmapHook(::mmap);
SetJemallocMunmapHook(::munmap);
@@ -300,15 +304,20 @@ fextl::vector<MemoryRegion> StealMemoryRegion(uintptr_t Begin, uintptr_t End) {
return Regions;
}

fextl::vector<MemoryRegion> Steal48BitVA() {
fextl::vector<MemoryRegion> Setup48BitAllocatorIfExists() {
size_t Bits = FEXCore::Allocator::DetermineVASize();
if (Bits < 48) {
return {};
}

uintptr_t Begin48BitVA = 0x0'8000'0000'0000ULL;
uintptr_t End48BitVA = 0x1'0000'0000'0000ULL;
return StealMemoryRegion(Begin48BitVA, End48BitVA);
auto Regions = StealMemoryRegion(Begin48BitVA, End48BitVA);

Alloc64 = Alloc::OSAllocator::Create64BitAllocatorWithRegions(Regions);
AssignHookOverrides();

return Regions;
}

void ReclaimMemoryRegion(const fextl::vector<MemoryRegion>& Regions) {
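For orientation, a minimal caller-side sketch of how the renamed entry point is intended to be used by a 64-bit frontend; only the FEXCore::Allocator calls come from this commit, while Is64BitGuest and the surrounding control flow are illustrative assumptions:

// Sketch (assumed, not literal FEX source): reserve the upper 48-bit range up front,
// then hand it back to the kernel when the frontend shuts down.
fextl::vector<FEXCore::Allocator::MemoryRegion> Base48Bit;
if (Is64BitGuest) {
  // On a host with a 48-bit VA this also constructs the 64-bit VMA allocator from the
  // stolen regions and installs the mmap/munmap hooks; on a smaller VA it simply
  // returns an empty vector and changes nothing.
  Base48Bit = FEXCore::Allocator::Setup48BitAllocatorIfExists();
}
// ... run the guest ...
FEXCore::Allocator::ReclaimMemoryRegion(Base48Bit);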
130 changes: 106 additions & 24 deletions FEXCore/Source/Utils/Allocator/64BitAllocator.cpp
@@ -7,6 +7,8 @@
#include <FEXCore/Utils/MathUtils.h>
#include <FEXCore/Utils/SignalScopeGuards.h>
#include <FEXCore/Utils/TypeDefines.h>
#include <FEXCore/Utils/LogManager.h>
#include <FEXCore/Utils/MathUtils.h>
#include <FEXCore/fextl/sstream.h>
#include <FEXHeaderUtils/Syscalls.h>
#include <FEXCore/fextl/memory.h>
@@ -35,6 +37,8 @@ thread_local FEXCore::Core::InternalThreadState* TLSThread {};
class OSAllocator_64Bit final : public Alloc::HostAllocator {
public:
OSAllocator_64Bit();
OSAllocator_64Bit(fextl::vector<FEXCore::Allocator::MemoryRegion>& Regions);

virtual ~OSAllocator_64Bit();
void* AllocateSlab(size_t Size) override {
return nullptr;
@@ -99,19 +103,20 @@ class OSAllocator_64Bit final : public Alloc::HostAllocator {
// This returns the size of the LiveVMARegion in addition to the flex set that tracks the used data
// The LiveVMARegion lives at the start of the VMA region which means on initialization we need to set that
// tracked range as used immediately
static size_t GetSizeWithFlexSet(size_t Size) {
static size_t GetFEXManagedVMARegionSize(size_t Size) {
// One element per page

// 0x10'0000'0000 bytes
// 0x100'0000 Pages
// 1 bit per page for tracking means 0x20'0000 (Pages / 8) bytes of flex space
// Which is 2MB of tracking
uint64_t NumElements = (Size >> FEXCore::Utils::FEX_PAGE_SHIFT) * sizeof(FlexBitElementType);
return sizeof(LiveVMARegion) + FEXCore::FlexBitSet<FlexBitElementType>::Size(NumElements);
const uint64_t NumElements = Size >> FEXCore::Utils::FEX_PAGE_SHIFT;
return sizeof(LiveVMARegion) + FEXCore::FlexBitSet<FlexBitElementType>::SizeInBytes(NumElements);
}

static void InitializeVMARegionUsed(LiveVMARegion* Region, size_t AdditionalSize) {
size_t SizeOfLiveRegion = FEXCore::AlignUp(LiveVMARegion::GetSizeWithFlexSet(Region->SlabInfo->RegionSize), FEXCore::Utils::FEX_PAGE_SIZE);
size_t SizeOfLiveRegion =
FEXCore::AlignUp(LiveVMARegion::GetFEXManagedVMARegionSize(Region->SlabInfo->RegionSize), FEXCore::Utils::FEX_PAGE_SIZE);
size_t SizePlusManagedData = SizeOfLiveRegion + AdditionalSize;

Region->FreeSpace = Region->SlabInfo->RegionSize - SizePlusManagedData;
@@ -155,7 +160,8 @@ class OSAllocator_64Bit final : public Alloc::HostAllocator {
ReservedRegions->erase(ReservedIterator);

// mprotect the new region we've allocated
size_t SizeOfLiveRegion = FEXCore::AlignUp(LiveVMARegion::GetSizeWithFlexSet(ReservedRegion->RegionSize), FEXCore::Utils::FEX_PAGE_SIZE);
size_t SizeOfLiveRegion =
FEXCore::AlignUp(LiveVMARegion::GetFEXManagedVMARegionSize(ReservedRegion->RegionSize), FEXCore::Utils::FEX_PAGE_SIZE);
size_t SizePlusManagedData = UsedSize + SizeOfLiveRegion;

[[maybe_unused]] auto Res = mprotect(reinterpret_cast<void*>(ReservedRegion->Base), SizePlusManagedData, PROT_READ | PROT_WRITE);
@@ -180,7 +186,7 @@ class OSAllocator_64Bit final : public Alloc::HostAllocator {
// 32-bit old kernel workarounds
fextl::vector<FEXCore::Allocator::MemoryRegion> Steal32BitIfOldKernel();

void AllocateMemoryRegions(const fextl::vector<FEXCore::Allocator::MemoryRegion>& Ranges);
void AllocateMemoryRegions(fextl::vector<FEXCore::Allocator::MemoryRegion>& Ranges);
LiveVMARegion* FindLiveRegionForAddress(uintptr_t Addr, uintptr_t AddrEnd);
};

@@ -383,7 +389,7 @@ void* OSAllocator_64Bit::Mmap(void* addr, size_t length, int prot, int flags, in
if (!LiveRegion) {
// Couldn't find a fit in the live regions
// Allocate a new reserved region
size_t lengthOfLiveRegion = FEXCore::AlignUp(LiveVMARegion::GetSizeWithFlexSet(length), FEXCore::Utils::FEX_PAGE_SIZE);
size_t lengthOfLiveRegion = FEXCore::AlignUp(LiveVMARegion::GetFEXManagedVMARegionSize(length), FEXCore::Utils::FEX_PAGE_SIZE);
size_t lengthPlusManagedData = length + lengthOfLiveRegion;
for (auto it = ReservedRegions->begin(); it != ReservedRegions->end(); ++it) {
if ((*it)->RegionSize >= lengthPlusManagedData) {
@@ -515,27 +521,43 @@ fextl::vector<FEXCore::Allocator::MemoryRegion> OSAllocator_64Bit::Steal32BitIfOldKernel() {
return FEXCore::Allocator::StealMemoryRegion(LOWER_BOUND_32, UPPER_BOUND_32);
}

void OSAllocator_64Bit::AllocateMemoryRegions(const fextl::vector<FEXCore::Allocator::MemoryRegion>& Ranges) {
for (auto [Ptr, AllocationSize] : Ranges) {
if (!ObjectAlloc) {
auto MaxSize = std::min(size_t(64) * 1024 * 1024, AllocationSize);
void OSAllocator_64Bit::AllocateMemoryRegions(fextl::vector<FEXCore::Allocator::MemoryRegion>& Ranges) {
// Need to allocate the ObjectAlloc up front. Find a region that is larger than our minimum size first.
const size_t ObjectAllocSize = 64 * 1024 * 1024;

for (auto& it : Ranges) {
if (ObjectAllocSize > it.Size) {
continue;
}

// Allocate up to 64 MiB the first allocation for an intrusive allocator
mprotect(Ptr, MaxSize, PROT_READ | PROT_WRITE);
// Allocate up to 64 MiB the first allocation for an intrusive allocator
mprotect(it.Ptr, ObjectAllocSize, PROT_READ | PROT_WRITE);

// This enables the kernel to use transparent large pages in the allocator which can reduce memory pressure
::madvise(Ptr, MaxSize, MADV_HUGEPAGE);
// This enables the kernel to use transparent large pages in the allocator which can reduce memory pressure
::madvise(it.Ptr, ObjectAllocSize, MADV_HUGEPAGE);

ObjectAlloc = new (Ptr) Alloc::ForwardOnlyIntrusiveArenaAllocator(Ptr, MaxSize);
ReservedRegions = ObjectAlloc->new_construct(ReservedRegions, ObjectAlloc);
LiveRegions = ObjectAlloc->new_construct(LiveRegions, ObjectAlloc);
ObjectAlloc = new (it.Ptr) Alloc::ForwardOnlyIntrusiveArenaAllocator(it.Ptr, ObjectAllocSize);
ReservedRegions = ObjectAlloc->new_construct(ReservedRegions, ObjectAlloc);
LiveRegions = ObjectAlloc->new_construct(LiveRegions, ObjectAlloc);

if (AllocationSize > MaxSize) {
AllocationSize -= MaxSize;
(uint8_t*&)Ptr += MaxSize;
} else {
continue;
}
if (it.Size >= ObjectAllocSize) {
// Modify region size
it.Size -= ObjectAllocSize;
(uint8_t*&)it.Ptr += ObjectAllocSize;
}

break;
}

if (!ObjectAlloc) {
ERROR_AND_DIE_FMT("Couldn't allocate object allocator!");
}

for (auto [Ptr, AllocationSize] : Ranges) {
// Skip using any regions that are <= two pages. FEX's VMA allocator requires two pages
// for tracking data. So three pages are minimum for a single page VMA allocation.
if (AllocationSize <= (FEXCore::Utils::FEX_PAGE_SIZE * 2)) {
continue;
}

ReservedVMARegion* Region = ObjectAlloc->new_construct<ReservedVMARegion>();
@@ -557,6 +579,10 @@ OSAllocator_64Bit::OSAllocator_64Bit() {
FEXCore::Allocator::ReclaimMemoryRegion(LowMem);
}

OSAllocator_64Bit::OSAllocator_64Bit(fextl::vector<FEXCore::Allocator::MemoryRegion>& Regions) {
AllocateMemoryRegions(Regions);
}

OSAllocator_64Bit::~OSAllocator_64Bit() {
// This needs a mutex to be thread safe
auto lk = FEXCore::GuardSignalDeferringSectionWithFallback(AllocationMutex, TLSThread);
@@ -576,6 +602,62 @@ OSAllocator_64Bit::~OSAllocator_64Bit() {
fextl::unique_ptr<Alloc::HostAllocator> Create64BitAllocator() {
return fextl::make_unique<OSAllocator_64Bit>();
}

template<class T>
struct alloc_delete : public std::default_delete<T> {
void operator()(T* ptr) const {
if (ptr) {
const auto size = sizeof(T);
const auto MinPage = FEXCore::AlignUp(size, FEXCore::Utils::FEX_PAGE_SIZE);

std::destroy_at(ptr);
::munmap(ptr, MinPage);
}
}

template<typename U>
requires (std::is_base_of_v<U, T>)
operator fextl::default_delete<U>() {
return fextl::default_delete<U>();
}
};

template<class T, class... Args>
requires (!std::is_array_v<T>)
fextl::unique_ptr<T> make_alloc_unique(FEXCore::Allocator::MemoryRegion& Base, Args&&... args) {
const auto size = sizeof(T);
const auto MinPage = FEXCore::AlignUp(size, FEXCore::Utils::FEX_PAGE_SIZE);
if (Base.Size < size || MinPage != FEXCore::Utils::FEX_PAGE_SIZE) {
ERROR_AND_DIE_FMT("Couldn't fit allocator in to page!");
}

auto ptr = ::mmap(Base.Ptr, MinPage, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
if (ptr == MAP_FAILED) {
ERROR_AND_DIE_FMT("Couldn't allocate memory region");
}

// Remove the page from the base region.
// Could be zero after this.
Base.Size -= MinPage;
Base.Ptr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(Base.Ptr) + MinPage);

auto Result = ::new (ptr) T(std::forward<Args>(args)...);
return fextl::unique_ptr<T, alloc_delete<T>>(Result);
}

fextl::unique_ptr<Alloc::HostAllocator> Create64BitAllocatorWithRegions(fextl::vector<FEXCore::Allocator::MemoryRegion>& Regions) {
// This is a bit tricky as we can't allocate memory safely except from the Regions provided. Otherwise we might overwrite memory pages we
// don't own. Scan the memory regions and find the smallest one.
FEXCore::Allocator::MemoryRegion& Smallest = Regions[0];
for (auto& it : Regions) {
if (it.Size <= Smallest.Size) {
Smallest = it;
}
}

return make_alloc_unique<OSAllocator_64Bit>(Smallest, Regions);
}

} // namespace Alloc::OSAllocator

namespace FEXCore::Allocator {
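To make the sizing comments above concrete, here is a small sketch of the per-region layout the allocator maintains; it assumes 4 KiB pages and a 64-bit FlexBitElementType, and TrackingBytes is an illustrative helper rather than a function from this commit:

// Each reserved region begins with its own bookkeeping, which is why
// InitializeVMARegionUsed() marks the leading pages as consumed up front:
//
//   [ LiveVMARegion header | FlexBitSet, 1 bit per page | pages available for guest mappings ... ]
//
// Illustrative helper mirroring GetFEXManagedVMARegionSize() plus the page rounding:
size_t TrackingBytes(size_t RegionSize) {
  const uint64_t Pages = RegionSize >> FEXCore::Utils::FEX_PAGE_SHIFT;
  const size_t Raw = sizeof(LiveVMARegion) + FEXCore::FlexBitSet<uint64_t>::SizeInBytes(Pages);
  return FEXCore::AlignUp(Raw, FEXCore::Utils::FEX_PAGE_SIZE);
}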
10 changes: 8 additions & 2 deletions FEXCore/Source/Utils/Allocator/FlexBitSet.h
@@ -145,8 +145,14 @@ struct FlexBitSet final {
return Get(Element);
}

static size_t Size(uint64_t Elements) {
return FEXCore::AlignUp(Elements / MinimumSizeBits, MinimumSizeBits);
// Returns the number of bits required to hold the number of elements.
// Just rounds up to the MinimumSizeInBits.
constexpr static size_t SizeInBits(uint64_t Elements) {
return FEXCore::AlignUp(Elements, MinimumSizeBits);
}
// Returns the number of bytes required to hold the number of elements.
constexpr static size_t SizeInBytes(uint64_t Elements) {
return SizeInBits(Elements) / 8;
}
};

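A quick worked example of the split SizeInBits/SizeInBytes interface, using the numbers from the comment in 64BitAllocator.cpp; the static_asserts assume these helpers stay usable in constant expressions:

// 0x10'0000'0000 bytes (64 GiB) of VA is 0x100'0000 pages at 4 KiB each,
// and the allocator tracks one bit per page.
constexpr uint64_t Pages = 0x10'0000'0000ULL >> 12;                             // 0x100'0000 elements
static_assert(FEXCore::FlexBitSet<uint64_t>::SizeInBits(Pages) == Pages);       // already a multiple of 64
static_assert(FEXCore::FlexBitSet<uint64_t>::SizeInBytes(Pages) == Pages / 8);  // 0x20'0000 bytes == 2 MiB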
4 changes: 3 additions & 1 deletion FEXCore/Source/Utils/Allocator/HostAllocator.h
@@ -2,9 +2,10 @@
#pragma once
#include <FEXCore/fextl/allocator.h>
#include <FEXCore/fextl/memory.h>
#include <FEXCore/fextl/vector.h>
#include <FEXCore/Utils/Allocator.h>

#include <cstddef>
#include <cstdint>
#include <sys/types.h>

namespace FEXCore::Core {
@@ -49,4 +50,5 @@ class GlobalAllocator {

namespace Alloc::OSAllocator {
fextl::unique_ptr<Alloc::HostAllocator> Create64BitAllocator();
fextl::unique_ptr<Alloc::HostAllocator> Create64BitAllocatorWithRegions(fextl::vector<FEXCore::Allocator::MemoryRegion>& Regions);
} // namespace Alloc::OSAllocator
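
For reference, a hedged sketch of how the two factory entry points differ in use; the surrounding setup is assumed rather than taken from this commit:

// Default path (unchanged): the allocator discovers and reserves address space on its own.
Alloc64 = Alloc::OSAllocator::Create64BitAllocator();

// New path for 48-bit VA hosts: the caller has already stolen the upper range and donates it,
// so the allocator object and all of its metadata are placed inside those regions rather than
// in memory the process does not otherwise own.
Alloc64 = Alloc::OSAllocator::Create64BitAllocatorWithRegions(Regions);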
2 changes: 1 addition & 1 deletion FEXCore/include/FEXCore/Utils/Allocator.h
@@ -86,7 +86,7 @@ FEX_DEFAULT_VISIBILITY void ReclaimMemoryRegion(const fextl::vector<MemoryRegion
// AArch64 canonical addresses are only up to bits 48/52 with the remainder being other things
// Use this to reserve the top 128TB of VA so the guest never see it
// Returns nullptr on host VA < 48bits
FEX_DEFAULT_VISIBILITY fextl::vector<MemoryRegion> Steal48BitVA();
FEX_DEFAULT_VISIBILITY fextl::vector<MemoryRegion> Setup48BitAllocatorIfExists();

#ifndef _WIN32
FEX_DEFAULT_VISIBILITY void RegisterTLSData(FEXCore::Core::InternalThreadState* Thread);
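The "top 128TB" mentioned above matches the constants used by Setup48BitAllocatorIfExists earlier in this diff; a quick check of the arithmetic:

// [0x0'8000'0000'0000, 0x1'0000'0000'0000) == [2^47, 2^48)
// 2^48 - 2^47 == 2^47 bytes == 128 TiB, i.e. the upper half of a 48-bit VA.
static_assert((0x1'0000'0000'0000ULL - 0x0'8000'0000'0000ULL) == (1ULL << 47));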
41 changes: 41 additions & 0 deletions FEXCore/unittests/APITests/FlexBitSet.cpp
@@ -0,0 +1,41 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/generators/catch_generators_range.hpp>

#include "Utils/Allocator/FlexBitSet.h"

TEST_CASE("FlexBitSet - Sizing") {
// Ensure that FlexBitSet sizing is correct.

// Size of zero shouldn't take any space.
CHECK(FEXCore::FlexBitSet<uint8_t>::SizeInBytes(0) == 0);
CHECK(FEXCore::FlexBitSet<uint16_t>::SizeInBytes(0) == 0);
CHECK(FEXCore::FlexBitSet<uint32_t>::SizeInBytes(0) == 0);
CHECK(FEXCore::FlexBitSet<uint64_t>::SizeInBytes(0) == 0);

CHECK(FEXCore::FlexBitSet<uint8_t>::SizeInBits(0) == 0);
CHECK(FEXCore::FlexBitSet<uint16_t>::SizeInBits(0) == 0);
CHECK(FEXCore::FlexBitSet<uint32_t>::SizeInBits(0) == 0);
CHECK(FEXCore::FlexBitSet<uint64_t>::SizeInBits(0) == 0);

// Size of 1 should take one sizeof(ElementSize) size
CHECK(FEXCore::FlexBitSet<uint8_t>::SizeInBytes(1) == sizeof(uint8_t));
CHECK(FEXCore::FlexBitSet<uint16_t>::SizeInBytes(1) == sizeof(uint16_t));
CHECK(FEXCore::FlexBitSet<uint32_t>::SizeInBytes(1) == sizeof(uint32_t));
CHECK(FEXCore::FlexBitSet<uint64_t>::SizeInBytes(1) == sizeof(uint64_t));

CHECK(FEXCore::FlexBitSet<uint8_t>::SizeInBits(1) == sizeof(uint8_t) * 8);
CHECK(FEXCore::FlexBitSet<uint16_t>::SizeInBits(1) == sizeof(uint16_t) * 8);
CHECK(FEXCore::FlexBitSet<uint32_t>::SizeInBits(1) == sizeof(uint32_t) * 8);
CHECK(FEXCore::FlexBitSet<uint64_t>::SizeInBits(1) == sizeof(uint64_t) * 8);

// Size of `sizeof(ElementSize) * 8` should take one sizeof(ElementSize) size
CHECK(FEXCore::FlexBitSet<uint8_t>::SizeInBytes(sizeof(uint8_t) * 8) == sizeof(uint8_t));
CHECK(FEXCore::FlexBitSet<uint16_t>::SizeInBytes(sizeof(uint16_t) * 8) == sizeof(uint16_t));
CHECK(FEXCore::FlexBitSet<uint32_t>::SizeInBytes(sizeof(uint32_t) * 8) == sizeof(uint32_t));
CHECK(FEXCore::FlexBitSet<uint64_t>::SizeInBytes(sizeof(uint64_t) * 8) == sizeof(uint64_t));

CHECK(FEXCore::FlexBitSet<uint8_t>::SizeInBits(sizeof(uint8_t) * 8) == sizeof(uint8_t) * 8);
CHECK(FEXCore::FlexBitSet<uint16_t>::SizeInBits(sizeof(uint16_t) * 8) == sizeof(uint16_t) * 8);
CHECK(FEXCore::FlexBitSet<uint32_t>::SizeInBits(sizeof(uint32_t) * 8) == sizeof(uint32_t) * 8);
CHECK(FEXCore::FlexBitSet<uint64_t>::SizeInBits(sizeof(uint64_t) * 8) == sizeof(uint64_t) * 8);
}
2 changes: 1 addition & 1 deletion Source/Tools/FEXLoader/FEXLoader.cpp
@@ -496,7 +496,7 @@ int main(int argc, char** argv, char** const envp) {

if (Loader.Is64BitMode()) {
// Destroy the 48th bit if it exists
Base48Bit = FEXCore::Allocator::Steal48BitVA();
Base48Bit = FEXCore::Allocator::Setup48BitAllocatorIfExists();
} else {
// Reserve [0x1_0000_0000, 0x2_0000_0000).
// Safety net if 32-bit address calculation overflows into the 64-bit range.