diff --git a/Utilities/JIT.h b/Utilities/JIT.h index cbba82b960b2..5069b639507e 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -514,8 +514,8 @@ class jit_compiler final atomic_t m_disk_space = umax; public: - jit_compiler(const std::unordered_map& _link, const std::string& _cpu, u32 flags = 0); - ~jit_compiler(); + jit_compiler(const std::unordered_map& _link, const std::string& _cpu, u32 flags = 0, std::function symbols_cement = {}) noexcept; + ~jit_compiler() noexcept; // Get LLVM context auto& get_context() diff --git a/Utilities/JITLLVM.cpp b/Utilities/JITLLVM.cpp index 1a9c9e7052cd..8edba90deca8 100644 --- a/Utilities/JITLLVM.cpp +++ b/Utilities/JITLLVM.cpp @@ -77,8 +77,7 @@ static u64 make_null_function(const std::string& name) if (res.ec == std::errc() && res.ptr == name.c_str() + name.size() && addr < 0x8000'0000) { - // Point the garbage to reserved, non-executable memory - return reinterpret_cast(vm::g_sudo_addr + addr); + fmt::throw_exception("Unhandled symbols cementing! 
(name='%s'", name); } } @@ -174,18 +173,34 @@ struct JITAnnouncer : llvm::JITEventListener struct MemoryManager1 : llvm::RTDyldMemoryManager { // 256 MiB for code or data - static constexpr u64 c_max_size = 0x20000000 / 2; + static constexpr u64 c_max_size = 0x1000'0000; // Allocation unit (2M) static constexpr u64 c_page_size = 2 * 1024 * 1024; - // Reserve 512 MiB - u8* const ptr = static_cast(utils::memory_reserve(c_max_size * 2)); + // Reserve 256 MiB blocks + void* m_code_mems = nullptr; + void* m_data_ro_mems = nullptr; + void* m_data_rw_mems = nullptr; u64 code_ptr = 0; - u64 data_ptr = c_max_size; + u64 data_ro_ptr = 0; + u64 data_rw_ptr = 0; - MemoryManager1() = default; + // First fallback for non-existing symbols + // May be a memory container internally + std::function m_symbols_cement; + + MemoryManager1(std::function symbols_cement = {}) noexcept + : m_symbols_cement(std::move(symbols_cement)) + { + auto ptr = reinterpret_cast(utils::memory_reserve(c_max_size * 3)); + m_code_mems = ptr; + // ptr += c_max_size; + // m_data_ro_mems = ptr; + ptr += c_max_size; + m_data_rw_mems = ptr; + } MemoryManager1(const MemoryManager1&) = delete; @@ -194,13 +209,20 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager ~MemoryManager1() override { // Hack: don't release to prevent reuse of address space, see jit_announce - utils::memory_decommit(ptr, c_max_size * 2); + utils::memory_decommit(m_code_mems, utils::align(code_ptr % c_max_size, c_page_size)); + utils::memory_decommit(m_data_ro_mems, utils::align(data_ro_ptr % c_max_size, c_page_size)); + utils::memory_decommit(m_data_rw_mems, utils::align(data_rw_ptr % c_max_size, c_page_size)); } llvm::JITSymbol findSymbol(const std::string& name) override { u64 addr = RTDyldMemoryManager::getSymbolAddress(name); + if (!addr && m_symbols_cement) + { + addr = m_symbols_cement(name); + } + if (!addr) { addr = make_null_function(name); @@ -214,45 +236,71 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager return {addr, 
llvm::JITSymbolFlags::Exported}; } - u8* allocate(u64& oldp, uptr size, uint align, utils::protection prot) + u8* allocate(u64& alloc_pos, void* block, uptr size, u64 align, utils::protection prot) { - if (align > c_page_size) + align = align ? align : 16; + + const u64 sizea = utils::align(size, align); + + if (!size || align > c_page_size || sizea > c_max_size || sizea < size) { - jit_log.fatal("Unsupported alignment (size=0x%x, align=0x%x)", size, align); + jit_log.fatal("Unsupported size/alignment (size=0x%x, align=0x%x)", size, align); return nullptr; } - const u64 olda = utils::align(oldp, align); - const u64 newp = utils::align(olda + size, align); + const u64 oldp = alloc_pos; + + u64 olda = utils::align(oldp, align); - if ((newp - 1) / c_max_size != oldp / c_max_size) + ensure(olda >= oldp); + ensure(olda < ~sizea); + + u64 newp = olda + sizea; + + if ((newp - 1) / c_max_size != (oldp - 1) / c_max_size) { - jit_log.fatal("Out of memory (size=0x%x, align=0x%x)", size, align); - return nullptr; + if ((newp - 1) / c_max_size > 1) + { + // Does not work for relocations, needs more robust solution + fmt::throw_exception("Out of memory (size=0x%x, align=0x%x)", size, align); + } + + olda = utils::align(oldp, c_max_size); + + ensure(olda >= oldp); + ensure(olda < ~sizea); + + newp = olda + sizea; } - if ((oldp - 1) / c_page_size != (newp - 1) / c_page_size) + // Update allocation counter + alloc_pos = newp; + + if ((newp - 1) / c_page_size != (oldp - 1) / c_page_size) { // Allocate pages on demand - const u64 pagea = utils::align(oldp, c_page_size); + const u64 pagea = utils::align(olda, c_page_size); const u64 psize = utils::align(newp - pagea, c_page_size); - utils::memory_commit(this->ptr + pagea, psize, prot); + utils::memory_commit(reinterpret_cast(block) + (pagea % c_max_size), psize, prot); } - // Update allocation counter - oldp = newp; - - return this->ptr + olda; + return reinterpret_cast(block) + (olda % c_max_size); } u8* allocateCodeSection(uptr 
size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/) override { - return allocate(code_ptr, size, align, utils::protection::wx); + return allocate(code_ptr, m_code_mems, size, align, utils::protection::wx); } - u8* allocateDataSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/, bool /*is_ro*/) override + u8* allocateDataSection(uptr size, uint align, uint /*sec_id*/, llvm::StringRef /*sec_name*/, bool is_ro) override { - return allocate(data_ptr, size, align, utils::protection::rw); + if (is_ro) + { + // Disabled + //return allocate(data_ro_ptr, m_data_ro_mems, size, align, utils::protection::rw); + } + + return allocate(data_rw_ptr, m_data_rw_mems, size, align, utils::protection::rw); } bool finalizeMemory(std::string* = nullptr) override @@ -272,7 +320,14 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager // Simple memory manager struct MemoryManager2 : llvm::RTDyldMemoryManager { - MemoryManager2() = default; + // First fallback for non-existing symbols + // May be a memory container internally + std::function m_symbols_cement; + + MemoryManager2(std::function symbols_cement = {}) noexcept + : m_symbols_cement(std::move(symbols_cement)) + { + } ~MemoryManager2() override { @@ -282,6 +337,11 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager { u64 addr = RTDyldMemoryManager::getSymbolAddress(name); + if (!addr && m_symbols_cement) + { + addr = m_symbols_cement(name); + } + if (!addr) { addr = make_null_function(name); @@ -561,7 +621,7 @@ bool jit_compiler::add_sub_disk_space(ssz space) }).second; } -jit_compiler::jit_compiler(const std::unordered_map& _link, const std::string& _cpu, u32 flags) +jit_compiler::jit_compiler(const std::unordered_map& _link, const std::string& _cpu, u32 flags, std::function symbols_cement) noexcept : m_context(new llvm::LLVMContext) , m_cpu(cpu(_cpu)) { @@ -589,17 +649,17 @@ jit_compiler::jit_compiler(const std::unordered_map& _link, co // Auxiliary JIT (does not use custom memory manager, 
only writes the objects) if (flags & 0x1) { - mem = std::make_unique(); + mem = std::make_unique(std::move(symbols_cement)); } else { - mem = std::make_unique(); + mem = std::make_unique(std::move(symbols_cement)); null_mod->setTargetTriple(jit_compiler::triple2()); } } else { - mem = std::make_unique(); + mem = std::make_unique(std::move(symbols_cement)); } { @@ -648,7 +708,7 @@ jit_compiler::jit_compiler(const std::unordered_map& _link, co } } -jit_compiler::~jit_compiler() +jit_compiler::~jit_compiler() noexcept { } diff --git a/Utilities/Thread.h b/Utilities/Thread.h index 37c4a56f4c17..edc488e73b84 100644 --- a/Utilities/Thread.h +++ b/Utilities/Thread.h @@ -769,7 +769,7 @@ class named_thread_group final } // Move the context (if movable) - new (static_cast(m_threads + m_count - 1)) Thread(std::string(name) + std::to_string(m_count - 1), std::forward(f)); + new (static_cast(m_threads + m_count - 1)) Thread(std::string(name) + std::to_string(m_count), std::forward(f)); } // Constructor with a function performed before adding more threads diff --git a/rpcs3/Emu/Cell/PPUAnalyser.h b/rpcs3/Emu/Cell/PPUAnalyser.h index 0b225bc8213c..88022346b184 100644 --- a/rpcs3/Emu/Cell/PPUAnalyser.h +++ b/rpcs3/Emu/Cell/PPUAnalyser.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "util/types.hpp" #include "util/endian.hpp" #include "util/asm.hpp" @@ -38,7 +39,41 @@ struct ppu_function std::map blocks{}; // Basic blocks: addr -> size std::set calls{}; // Set of called functions std::set callers{}; - std::string name{}; // Function name + mutable std::string name{}; // Function name + + struct iterator + { + const ppu_function* _this; + usz index = 0; + + std::pair operator*() const + { + return _this->blocks.empty() ? 
std::pair(_this->addr, _this->size) : *std::next(_this->blocks.begin(), index); + } + + iterator& operator++() + { + index++; + return *this; + } + + bool operator==(const iterator& rhs) const noexcept + { + return rhs.index == index; + } + + bool operator!=(const iterator& rhs) const noexcept = default; + }; + + iterator begin() const + { + return iterator{this}; + } + + iterator end() const + { + return iterator{this, std::max(1, blocks.size())}; + } }; // PPU Relocation Information @@ -87,18 +122,37 @@ struct ppu_module : public Type ppu_module& operator=(ppu_module&&) noexcept = default; - uchar sha1[20]{}; - std::string name{}; - std::string path{}; + uchar sha1[20]{}; // Hash + std::string name{}; // Filename + std::string path{}; // Filepath s64 offset = 0; // Offset of file - std::string cache{}; - std::vector relocs{}; - std::vector segs{}; - std::vector secs{}; - std::vector funcs{}; - std::vector applied_patches; - std::deque> allocations; - std::map addr_to_seg_index; + mutable bs_t attr{}; // Shared module attributes + std::string cache{}; // Cache file path + std::vector relocs{}; // Relocations + std::vector segs{}; // Segments + std::vector secs{}; // Segment sections + std::vector funcs{}; // Function list + std::vector applied_patches; // Patch addresses + std::deque> allocations; // Segment memory allocations + std::map addr_to_seg_index; // address->segment ordered translator map + ppu_module* parent = nullptr; + std::pair local_bounds{u32{umax}, 0}; // Module addresses range + std::shared_ptr> jit_bounds; // JIT instance modules addresses range + + auto& get_funcs() + { + return parent ? parent->funcs : funcs; + } + + const auto& get_funcs() const + { + return parent ? parent->funcs : funcs; + } + + const auto& get_relocs() const + { + return parent ? 
parent->relocs : relocs; + } // Copy info without functions void copy_part(const ppu_module& info) @@ -106,11 +160,11 @@ struct ppu_module : public Type std::memcpy(sha1, info.sha1, sizeof(sha1)); name = info.name; path = info.path; - relocs = info.relocs; segs = info.segs; secs = info.secs; allocations = info.allocations; addr_to_seg_index = info.addr_to_seg_index; + parent = const_cast(&info); } bool analyse(u32 lib_toc, u32 entry, u32 end, const std::vector& applied, const std::vector& exported_funcs = std::vector{}, std::function check_aborted = {}); diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 8da9cca8f0df..4f150d68ad07 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -176,7 +176,7 @@ bool serialize(utils::serial& ar, typename ppu_thread::cr_b extern void ppu_initialize(); extern void ppu_finalize(const ppu_module& info, bool force_mem_release = false); extern bool ppu_initialize(const ppu_module& info, bool check_only = false, u64 file_size = 0); -static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module& whole_module); +static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name); extern bool ppu_load_exec(const ppu_exec_object&, bool virtual_load, const std::string&, utils::serial* = nullptr); extern std::pair, CellError> ppu_load_overlay(const ppu_exec_object&, bool virtual_load, const std::string& path, s64 file_offset, utils::serial* = nullptr); extern void ppu_unload_prx(const lv2_prx&); @@ -3677,8 +3677,8 @@ namespace // Compiled PPU module info struct jit_module { - void(*symbol_resolver)(u8*, u64) = nullptr; - std::shared_ptr pjit; + std::vector symbol_resolvers; + std::vector> pjit; bool init = false; }; @@ -3729,6 +3729,7 @@ namespace } to_destroy.pjit = std::move(found->second.pjit); + 
to_destroy.symbol_resolvers = std::move(found->second.symbol_resolvers); bucket.map.erase(found); } @@ -4445,7 +4446,7 @@ extern void ppu_initialize() idm::select([&](u32, lv2_prx& _module) { - if (_module.funcs.empty()) + if (_module.get_funcs().empty()) { return; } @@ -4556,7 +4557,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s auto& ppu_toc = toc_manager.toc_map; - for (const auto& func : info.funcs) + for (const auto& func : info.get_funcs()) { if (func.size && func.blocks.empty()) { @@ -4659,11 +4660,14 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s jit_module& jit_mod = g_fxo->get().get(cache_path + "_" + std::to_string(std::bit_cast(info.segs[0].ptr))); // Compiler instance (deferred initialization) - std::shared_ptr& jit = jit_mod.pjit; + std::vector>& jits = jit_mod.pjit; // Split module into fragments <= 1 MiB usz fpos = 0; + // Modules counted so far + usz module_counter = 0; + // Difference between function name and current location const u32 reloc = info.relocs.empty() ? 0 : ::at32(info.segs, 0).addr; @@ -4684,14 +4688,14 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s const cpu_thread* cpu = cpu_thread::get_current(); - for (auto& func : info.funcs) + for (auto& func : info.get_funcs()) { if (func.size == 0) { continue; } - for (const auto& [addr, size] : func.blocks) + for (const auto [addr, size] : func) { if (size == 0) { @@ -4724,26 +4728,102 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s u32 total_compile = 0; - while (!jit_mod.init && fpos < info.funcs.size()) + // Limit how many modules are per JIT instance + // Advantage to lower the limit: + // 1. Lowering contiguous memory requirements for allocations + // Its disadvantage: + // 1. 
B instruction can wander up to 16MB relative to its range, + each additional split of JIT instance results in a downgraded version of around (100% / N-1th) - (100% / Nth) percent of instructions + where N is the total amount of JIT instances + Subject to change + constexpr u32 c_moudles_per_jit = 100; + + std::shared_ptr> local_jit_bounds = std::make_shared>(u32{umax}, 0); + + const auto shared_runtime = make_shared(); + const auto shared_map = make_shared>(); + const auto shared_mtx = make_shared(); + + auto symbols_cement = [runtime = shared_runtime, reloc, bound = info.segs[0].addr + info.segs[0].size - reloc, func_map = shared_map, shared_mtx](const std::string& name) -> u64 { - // Initialize compiler instance - if (!jit && is_being_used_in_emulation) + u32 func_addr = umax; + + if (name.starts_with("__0x")) + { + u32 addr = umax; + auto res = std::from_chars(name.c_str() + 4, name.c_str() + name.size(), addr, 16); + + if (res.ec == std::errc() && res.ptr == name.c_str() + name.size() && addr < bound) + { + func_addr = addr + reloc; + } + } + + if (func_addr == umax) + { + return {}; + } + + reader_lock rlock(*shared_mtx); + + if (auto it = func_map->find(func_addr); it != func_map->end()) + { + return it->second; + } + + rlock.upgrade(); + + u64& code_ptr = (*func_map)[func_addr]; + + if (code_ptr) { - jit = std::make_shared(s_link_table, g_cfg.core.llvm_cpu); + return +code_ptr; } + using namespace asmjit; + + auto func = build_function_asm(name, [&](native_asm& c, auto& args) + { +#if defined(ARCH_X64) + c.mov(x86::rax, x86::qword_ptr(reinterpret_cast(&vm::g_exec_addr))); + c.mov(x86::edx, func_addr); // Load PC + c.mov(x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia)), x86::edx); + + c.mov(x86::rax, x86::qword_ptr(x86::rax, x86::edx, 1, 0)); // Load call target + c.mov(x86::rdx, x86::rax); + c.shl(x86::rax, 16); + c.shr(x86::rax, 16); + c.shr(x86::rdx, 48); + c.shl(x86::edx, 13); + c.mov(x86::r12d, x86::edx); // Load relocation base + 
c.jmp(x86::rax); +#else + +#endif + }, runtime.get()); + + code_ptr = reinterpret_cast(func); + return code_ptr; + }; + + if (has_mfvscr && g_cfg.core.ppu_set_sat_bit) + { + info.attr += ppu_attr::has_mfvscr; + } + + while (!jit_mod.init && fpos < info.get_funcs().size()) + { // Copy module information (TODO: optimize) ppu_module part; part.copy_part(info); - part.funcs.reserve(16000); // Overall block size in bytes usz bsize = 0; usz bcount = 0; - while (fpos < info.funcs.size()) + while (fpos < info.get_funcs().size()) { - auto& func = info.funcs[fpos]; + auto& func = info.get_funcs()[fpos]; if (!func.size) { @@ -4767,9 +4847,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s { auto far_jump = ensure(g_fxo->get().gen_jump(source)); - if (source == func.addr && jit) + if (source == func.addr) { - jit->update_global_mapping(fmt::format("__0x%x", func.addr - reloc), reinterpret_cast(far_jump)); + (*shared_map)[func.addr - reloc] = reinterpret_cast(far_jump); } ppu_register_function_at(source, 4, far_jump); @@ -4783,22 +4863,14 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s } } - // Copy block or function entry - ppu_function& entry = part.funcs.emplace_back(func); - - // Fixup some information - entry.name = fmt::format("__0x%x", entry.addr - reloc); + local_jit_bounds->first = std::min(local_jit_bounds->first, func.addr); + local_jit_bounds->second = std::max(local_jit_bounds->second, func.addr + func.size); - if (has_mfvscr && g_cfg.core.ppu_set_sat_bit) - { - // TODO - entry.attr += ppu_attr::has_mfvscr; - } + part.local_bounds.first = std::min(part.local_bounds.first, func.addr); + part.local_bounds.second = std::max(part.local_bounds.second, func.addr + func.size); - if (entry.blocks.empty()) - { - entry.blocks.emplace(func.addr, func.size); - } + // Fixup some information + func.name = fmt::format("__0x%x", func.addr - reloc); bsize += func.size; @@ -4815,9 +4887,9 @@ bool ppu_initialize(const ppu_module& 
info, bool check_only, u64 file_s int has_dcbz = !!g_cfg.core.accurate_cache_line_stores; - for (const auto& func : part.funcs) + for (const auto& func : part.get_funcs()) { - if (func.size == 0) + if (func.size == 0 || part.local_bounds.first >= func.addr + func.size || part.local_bounds.second <= func.addr) { continue; } @@ -4827,7 +4899,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s sha1_update(&ctx, reinterpret_cast(&addr), sizeof(addr)); sha1_update(&ctx, reinterpret_cast(&size), sizeof(size)); - for (const auto& block : func.blocks) + for (const auto block : func) { if (block.second == 0 || reloc) { @@ -4898,7 +4970,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s sha1_update(&ctx, ensure(info.get_ptr(func.addr)), func.size); } - if (!workload.empty() && fpos >= info.funcs.size()) + if (fpos >= info.get_funcs().size() || module_counter % c_moudles_per_jit == c_moudles_per_jit - 1) { // Hash the entire function grouped addresses for the integrity of the symbol resolver function // Potentially occurring during patches @@ -4906,9 +4978,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s std::vector> addrs; - for (const ppu_function& func : info.funcs) + for (const ppu_function& func : info.get_funcs()) { - if (func.size == 0) + if (func.size == 0 || local_jit_bounds->first >= func.addr + func.size || local_jit_bounds->second <= func.addr) { continue; } @@ -4919,7 +4991,13 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s // Hash its size too addrs.emplace_back(::size32(addrs)); - sha1_update(&ctx, reinterpret_cast(addrs.data()), addrs.size() * sizeof(be_t)); + if (module_counter != 0) + { + sha1_update(&ctx, reinterpret_cast(addrs.data()), addrs.size() * sizeof(be_t)); + } + + part.jit_bounds = std::move(local_jit_bounds); + local_jit_bounds = std::make_shared>(u32{umax}, 0); } if (false) @@ -4974,18 +5052,21 @@ bool ppu_initialize(const ppu_module& info, 
bool check_only, u64 file_s settings += ppu_settings::accurate_vnan, settings -= ppu_settings::fixup_vnan, fmt::throw_exception("VNAN Not implemented"); if (g_cfg.core.ppu_use_nj_bit) settings += ppu_settings::accurate_nj_mode, settings -= ppu_settings::fixup_nj_denormals, fmt::throw_exception("NJ Not implemented"); - if (fpos >= info.funcs.size()) + if (fpos >= info.get_funcs().size() || module_counter % c_moudles_per_jit == c_moudles_per_jit - 1) settings += ppu_settings::contains_symbol_resolver; // Avoid invalidating all modules for this purpose // Write version, hash, CPU, settings fmt::append(obj_name, "v6-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); } + if (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped()) { break; } + module_counter++; + if (!check_only) { total_compile++; @@ -4996,7 +5077,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s // Check object file if (jit_compiler::check(cache_path + obj_name)) { - if (!jit && !check_only) + if (is_being_used_in_emulation && !check_only) { ppu_log.success("LLVM: Module exists: %s", obj_name); @@ -5117,7 +5198,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s // Use another JIT instance jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1); - ppu_initialize2(jit2, part, cache_path, obj_name, i == workload.size() - 1 ? 
main_module : part); + ppu_initialize2(jit2, part, cache_path, obj_name); ppu_log.success("LLVM: Compiled module %s", obj_name); } @@ -5145,6 +5226,14 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s g_watchdog_hold_ctr--; } + // Initialize compiler instance + while (jits.size() < utils::aligned_div(module_counter, c_moudles_per_jit) && is_being_used_in_emulation) + { + jits.emplace_back(std::make_shared(s_link_table, g_cfg.core.llvm_cpu, 0, symbols_cement)); + } + + jit_mod.symbol_resolvers.resize(jits.size()); + bool failed_to_load = false; { if (!is_being_used_in_emulation || (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped())) @@ -5158,6 +5247,8 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s *progress_dialog = get_localized_string(localized_string_id::PROGRESS_DIALOG_LINKING_PPU_MODULES); } + usz mod_index = 0; + for (const auto& [obj_name, is_compiled] : link_workload) { if (cpu ? cpu->state.all_of(cpu_flag::exit) : Emu.IsStopped()) @@ -5165,7 +5256,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s break; } - if (!failed_to_load && !jit->add(cache_path + obj_name)) + if (!failed_to_load && !jits[mod_index / c_moudles_per_jit]->add(cache_path + obj_name)) { ppu_log.error("LLVM: Failed to load module %s", obj_name); failed_to_load = true; @@ -5205,10 +5296,10 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s progress_dialog = get_localized_string(localized_string_id::PROGRESS_DIALOG_APPLYING_PPU_CODE); - if (!jit) + if (!jits.empty()) { // No functions - nothing to do - ensure(info.funcs.empty()); + ensure(info.get_funcs().empty()); return compiled_new; } @@ -5216,25 +5307,27 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s if (is_first) { - jit->fin(); - } - - if (is_first) - { - jit_mod.symbol_resolver = reinterpret_cast(jit->get("__resolve_symbols")); - ensure(jit_mod.symbol_resolver); - } - else - { - 
ensure(jit_mod.symbol_resolver); + for (auto& jit : jits) + { + jit->fin(); + } } #ifdef __APPLE__ // Symbol resolver is in JIT mem, so we must enable execution pthread_jit_write_protect_np(true); #endif + { + usz index = umax; - jit_mod.symbol_resolver(vm::g_exec_addr, info.segs[0].addr); + for (auto& sim : jit_mod.symbol_resolvers) + { + index++; + + sim = ensure(!is_first ? sim : reinterpret_cast(jits[index]->get("__resolve_symbols"))); + sim(vm::g_exec_addr, info.segs[0].addr); + } + } #ifdef __APPLE__ // Symbol resolver is in JIT mem, so we must enable execution @@ -5242,7 +5335,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s #endif // Find a BLR-only function in order to copy it to all BLRs (some games need it) - for (const auto& func : info.funcs) + for (const auto& func : info.get_funcs()) { if (func.size == 4 && *info.get_ptr(func.addr) == ppu_instructions::BLR()) { @@ -5281,7 +5374,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s #endif } -static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name, const ppu_module& whole_module) +static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name) { #ifdef LLVM_AVAILABLE using namespace llvm; @@ -5308,7 +5401,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module }, false); // Initialize function list - for (const auto& func : module_part.funcs) + for (const auto& func : module_part.get_funcs()) { if (func.size) { @@ -5375,18 +5468,37 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module #endif // Translate functions - for (usz fi = 0, fmax = module_part.funcs.size(); fi < fmax; fi++) + for (auto fi = std::lower_bound(module_part.get_funcs().begin(), module_part.get_funcs().end(), module_part.local_bounds.first, FN(x.addr < y)) + , fmax = 
module_part.get_funcs().end(); + fi != fmax; + fi++) { + const auto& mod_func = *fi; + + if (module_part.local_bounds.second <= mod_func.addr) + { + break; + } + + if (mod_func.size == 0) + { + continue; + } + + if (module_part.local_bounds.first >= mod_func.addr + mod_func.size) + { + continue; + } + if (Emu.IsStopped()) { ppu_log.success("LLVM: Translation cancelled"); return; } - if (module_part.funcs[fi].size) { // Translate - if (const auto func = translator.Translate(module_part.funcs[fi])) + if (const auto func = translator.Translate(mod_func)) { #ifdef ARCH_X64 // TODO // Run optimization passes @@ -5405,10 +5517,10 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module } } - // Run this only in one module for all functions - if (&whole_module != &module_part) + // Run this only in one module for all functions compiled + if (module_part.jit_bounds) { - if (const auto func = translator.GetSymbolResolver(whole_module)) + if (const auto func = translator.GetSymbolResolver(module_part)) { #ifdef ARCH_X64 // TODO // Run optimization passes diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index edbb4f515a10..bcfdda305154 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -114,7 +114,7 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo const auto caddr = m_info.segs[0].addr; const auto cend = caddr + m_info.segs[0].size; - for (const auto& rel : m_info.relocs) + for (const auto& rel : m_info.get_relocs()) { if (rel.addr >= caddr && rel.addr < cend) { @@ -162,7 +162,7 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* _module, const ppu_mo } } - if (!m_info.relocs.empty()) + if (!m_info.get_relocs().empty()) { m_reloc = &m_info.segs[0]; } @@ -196,7 +196,7 @@ Function* PPUTranslator::Translate(const ppu_function& info) // Instruction address is (m_addr + base) const u64 base = m_reloc ? 
m_reloc->addr : 0; m_addr = info.addr - base; - m_attr = info.attr; + m_attr = m_info.attr + info.attr; // Don't emit check in small blocks without terminator bool need_check = info.size >= 16; @@ -325,6 +325,9 @@ Function* PPUTranslator::Translate(const ppu_function& info) Function* PPUTranslator::GetSymbolResolver(const ppu_module& info) { + ensure(m_module->getFunction("__resolve_symbols") == nullptr); + ensure(info.jit_bounds); + m_function = cast(m_module->getOrInsertFunction("__resolve_symbols", FunctionType::get(get_type(), { get_type(), get_type() }, false)).getCallee()); IRBuilder<> irb(BasicBlock::Create(m_context, "__entry", m_function)); @@ -351,18 +354,24 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info) // This is made in loop instead of inlined because it took tremendous amount of time to compile. std::vector vec_addrs; - vec_addrs.reserve(info.funcs.size()); // Create an array of function pointers std::vector functions; - for (const auto& f : info.funcs) + const auto [min_addr, max_addr] = *ensure(info.jit_bounds); + + for (const auto& f : info.get_funcs()) { if (!f.size) { continue; } + if (f.addr < min_addr || f.addr >= max_addr) + { + continue; + } + vec_addrs.push_back(static_cast(f.addr - base)); functions.push_back(cast(m_module->getOrInsertFunction(fmt::format("__0x%x", f.addr - base), ftype).getCallee())); } @@ -379,7 +388,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info) const auto addr_array = new GlobalVariable(*m_module, addr_array_type, false, GlobalValue::PrivateLinkage, ConstantDataArray::get(m_context, vec_addrs)); // Create an array of function pointers - const auto func_table_type = ArrayType::get(ftype->getPointerTo(), info.funcs.size()); + const auto func_table_type = ArrayType::get(ftype->getPointerTo(), functions.size()); const auto init_func_table = ConstantArray::get(func_table_type, functions); const auto func_table = new GlobalVariable(*m_module, func_table_type, false, 
GlobalVariable::PrivateLinkage, init_func_table); diff --git a/rpcs3/util/vm_native.cpp b/rpcs3/util/vm_native.cpp index b57974f0d7ab..7543ac810a41 100644 --- a/rpcs3/util/vm_native.cpp +++ b/rpcs3/util/vm_native.cpp @@ -329,6 +329,11 @@ namespace utils void memory_decommit(void* pointer, usz size) { + if (!size) + { + return; + } + #ifdef _WIN32 ensure(::VirtualFree(pointer, size, MEM_DECOMMIT)); #else @@ -357,6 +362,11 @@ namespace utils void memory_reset(void* pointer, usz size, protection prot) { + if (!size) + { + return; + } + #ifdef _WIN32 memory_decommit(pointer, size); memory_commit(pointer, size, prot); @@ -390,6 +400,11 @@ namespace utils void memory_release(void* pointer, usz size) { + if (!size) + { + return; + } + #ifdef _WIN32 unmap_mappping_memory(reinterpret_cast(pointer), size); ensure(::VirtualFree(pointer, 0, MEM_RELEASE)); @@ -400,6 +415,11 @@ namespace utils void memory_protect(void* pointer, usz size, protection prot) { + if (!size) + { + return; + } + #ifdef _WIN32 DWORD old; @@ -429,6 +449,11 @@ namespace utils bool memory_lock(void* pointer, usz size) { + if (!size) + { + return true; + } + #ifdef _WIN32 return ::VirtualLock(pointer, size); #else