diff --git a/rpcs3/Emu/Cell/PPUModule.cpp b/rpcs3/Emu/Cell/PPUModule.cpp index ff8f3c1c83f2..62f80e42b1b4 100644 --- a/rpcs3/Emu/Cell/PPUModule.cpp +++ b/rpcs3/Emu/Cell/PPUModule.cpp @@ -1107,6 +1107,13 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu_segment& if (prog.p_type == 0x1u /* LOAD */ && prog.p_filesz > 0u) { + if (prog.p_vaddr) + { + extern void utilize_spu_data_segment(u32 vaddr, const void* ls_data_vaddr, u32 size); + + utilize_spu_data_segment(prog.p_vaddr, (elf_header + prog.p_offset), prog.p_filesz); + } + sha1_update(&sha2, (elf_header + prog.p_offset), prog.p_filesz); } @@ -1119,7 +1126,7 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu_segment& if (!name.empty()) { - fmt::append(dump, "\n\tSPUNAME: '%s'", name); + fmt::append(dump, "\n\tSPUNAME: '%s' (image addr: 0x%x)", name, seg.addr + i); } } } diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 775695f0a2cc..a64f59ca8899 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -4030,7 +4030,7 @@ extern void ppu_initialize() const std::string mount_point = vfs::get("/dev_flash/"); - bool dev_flash_located = !Emu.GetCat().ends_with('P') && Emu.IsPathInsideDir(Emu.GetBoot(), mount_point) && g_cfg.core.ppu_llvm_precompilation; + bool dev_flash_located = !Emu.GetCat().ends_with('P') && Emu.IsPathInsideDir(Emu.GetBoot(), mount_point) && g_cfg.core.llvm_precompilation; if (compile_fw || dev_flash_located) { @@ -4050,7 +4050,7 @@ extern void ppu_initialize() } // Avoid compilation if main's cache exists or it is a standalone SELF with no PARAM.SFO - if (compile_main && g_cfg.core.ppu_llvm_precompilation && !Emu.GetTitleID().empty() && !Emu.IsChildProcess()) + if (compile_main && g_cfg.core.llvm_precompilation && !Emu.GetTitleID().empty() && !Emu.IsChildProcess()) { // Try to add all related directories const std::set dirs = Emu.GetGameDirs(); diff --git a/rpcs3/Emu/Cell/RawSPUThread.cpp b/rpcs3/Emu/Cell/RawSPUThread.cpp index f442e73f898e..fa27229ca9bd 100644 --- a/rpcs3/Emu/Cell/RawSPUThread.cpp +++ b/rpcs3/Emu/Cell/RawSPUThread.cpp @@ -382,6 +382,18 @@ void spu_load_exec(const spu_exec_object& elf) spu->status_npc = {SPU_STATUS_RUNNING, elf.header.e_entry}; atomic_storage::release(spu->pc, elf.header.e_entry); + + const auto funcs = spu->discover_functions(spu->ls, umax); + + for (u32 addr : funcs) + { + spu_log.success("Found SPU function at: 0x%08x", addr); + } + + if (!funcs.empty()) + { + spu_log.success("Found %u SPU functions", funcs.size()); + } } void spu_load_rel_exec(const spu_rel_object& elf) diff --git a/rpcs3/Emu/Cell/SPUOpcodes.h b/rpcs3/Emu/Cell/SPUOpcodes.h index 60e3d0d1b56b..cea4513e3f17 100644 --- a/rpcs3/Emu/Cell/SPUOpcodes.h +++ b/rpcs3/Emu/Cell/SPUOpcodes.h @@ -26,17 +26,17 @@ union spu_opcode_t bf_t i18; // 7..24 }; -inline u32 spu_branch_target(u32 pc, u32 imm = 0) +constexpr u32 spu_branch_target(u32 pc, u32 imm = 0) { return (pc + (imm << 2)) & 0x3fffc; } -inline u32 spu_ls_target(u32 pc, u32 imm = 0) +constexpr u32 spu_ls_target(u32 pc, u32 imm = 0) { return (pc + (imm << 2)) & 0x3fff0; } -inline u32 spu_decode(u32 inst) +constexpr u32 spu_decode(u32 inst) { return inst >> 21; } diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index c23cec9a9558..94db1e7e0620 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -516,6 +516,84 @@ spu_cache::~spu_cache() { } +struct spu_section_data +{ + struct data_t + { + u32 vaddr; + std::basic_string inst_data; + std::vector funcs; + }; + + shared_mutex mtx; + atomic_t had_been_used = false; + std::vector data; +}; + +extern void utilize_spu_data_segment(u32 vaddr, const void* ls_data_vaddr, u32 size) +{ + if (vaddr % 4) + { + return; + } + + size &= -4; + + if (!size || vaddr + size > SPU_LS_SIZE) + { + return; + } + + if (!g_cfg.core.llvm_precompilation) + { + return; + } + + g_fxo->need(); + + if (g_fxo->get().had_been_used) + { + return; + } + + std::basic_string data(size / 4); + std::memcpy(data.data(), ls_data_vaddr, size); + + spu_section_data::data_t obj{vaddr, std::move(data)}; + + std::vector ls_data(SPU_LS_SIZE); + std::memcpy(ls_data.data() + vaddr, ls_data_vaddr, size); + + obj.funcs = spu_thread::discover_functions(ls_data.data(), umax); + + if (obj.funcs.empty()) + { + // Nothing to add + return; + } + + for (u32 addr : obj.funcs) + { + spu_log.notice("Found SPU function at: 0x%05x", addr); + } + + spu_log.notice("Found %u SPU functions", obj.funcs.size()); + + std::lock_guard lock(g_fxo->get().mtx); + + for (const auto& data : g_fxo->get().data) + { + // TODO: More robust duplicates filtering + if (data.vaddr == vaddr && data.inst_data.starts_with(obj.inst_data)) + { + spu_log.notice("Avoided duplicate SPU segment"); + return; + } + } + + g_fxo->get().data.emplace_back(std::move(obj)); +} + std::deque spu_cache::get() { std::deque result; @@ -618,6 +696,11 @@ void spu_cache::initialize() atomic_t fnext{}; atomic_t fail_flag{0}; + auto data_list = std::move(g_fxo->get().data); + g_fxo->get().had_been_used = true; + + atomic_t data_indexer{}; + if (g_cfg.core.spu_decoder == spu_decoder_type::dynamic || g_cfg.core.spu_decoder == spu_decoder_type::llvm) { if (auto compiler = spu_recompiler_base::make_llvm_recompiler(11)) @@ -657,7 +740,18 @@ void spu_cache::initialize() thread_ctrl::wait_on(g_progr_ptotal, v); } - g_progr_ptotal += ::size32(func_list); + u32 add_count = ::size32(func_list); + + if (func_list.empty()) + { + for (auto& sec : data_list) + { + add_count += sec.funcs.size(); + } + } + + g_progr_ptotal += add_count; + progr.emplace("Building SPU cache..."); worker_count = rpcs3::utils::get_max_threads(); @@ -744,6 +838,7 @@ void spu_cache::initialize() { // Likely, out of JIT memory. Signal to prevent further building. fail_flag |= 1; + continue; } // Clear fake LS @@ -752,6 +847,107 @@ void spu_cache::initialize() result++; } + if (!func_list.empty() || !g_cfg.core.llvm_precompilation) + { + // Cache has already been initiated or the user does not want to precompile SPU programs + return result; + } + + u32 last_sec_idx = umax; + + for (usz func_i = data_indexer++;; func_i = data_indexer++, g_progr_pdone++) + { + u32 passed_count = 0; + u32 func_addr = 0; + u32 sec_addr = umax; + u32 sec_idx = 0; + std::string_view inst_data; + + // Try to get the data this index points to + for (auto& sec : data_list) + { + if (func_i < passed_count + sec.funcs.size()) + { + sec_addr = sec.vaddr; + func_addr = ::at32(sec.funcs, func_i - passed_count); + inst_data = sec.inst_data; + break; + } + + passed_count += sec.funcs.size(); + sec_idx++; + } + + if (sec_addr == umax) + { + // End of compilation for thread + break; + } + + if (Emu.IsStopped() || fail_flag) + { + continue; + } + + if (last_sec_idx != sec_idx) + { + if (last_sec_idx != umax) + { + // Clear fake LS of previous section + auto& sec = data_list[last_sec_idx]; + std::memset(ls.data() + sec.vaddr / 4, 0, sec.inst_data.size() * 4); + } + + // Initialize LS with the entire section data + for (u32 i = 0, pos = sec_addr; i < inst_data.size(); i++, pos += 4) + { + ls[pos / 4] = std::bit_cast>(inst_data[i]); + } + + last_sec_idx = sec_idx; + } + + // Call analyser + spu_program func2 = compiler->analyse(ls.data(), func_addr); + + while (!func2.data.empty()) + { + const u32 last_inst = std::bit_cast>(func2.data.back()); + const u32 prog_size = func2.data.size(); + + if (!compiler->compile(std::move(func2))) + { + // Likely, out of JIT memory. Signal to prevent further building. + fail_flag |= 1; + break; + } + + result++; + + if (g_cfg.core.spu_block_size >= spu_block_size_type::mega) + { + // Should already take care of the entire function + break; + } + + if (auto type = g_spu_itype.decode(last_inst); + type == spu_itype::BRSL || type == spu_itype::BRASL || type == spu_itype::BISL) + { + const u32 start_new = func_addr + prog_size * 4; + + if (start_new < SPU_LS_SIZE && ls[start_new / 4] && g_spu_itype.decode(ls[start_new / 4]) != spu_itype::UNK) + { + spu_log.notice("Precompiling fallthrough to 0x%05x", start_new); + func2 = compiler->analyse(ls.data(), start_new); + func_addr = start_new; + continue; + } + } + + break; + } + } + return result; }); @@ -1904,6 +2100,63 @@ void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* /*rip*/ } } +std::vector spu_thread::discover_functions(const void* ls_start, u32 /*entry*/) +{ + std::vector calls; + calls.reserve(100); + + // Discover functions + // Use the most simple method: search for instructions that calls them + // And then filter invalid cases (does not detect tail calls) + for (u32 i = 0x10; i < SPU_LS_SIZE; i += 0x10) + { + // Search for BRSL and BRASL + // TODO: BISL + const v128 inst = read_from_ptr>(static_cast(ls_start), i); + const v128 shifted = gv_shr32(inst, 23); + const v128 eq_brsl = gv_eq32(shifted, v128::from32p(0x66)); + const v128 eq_brasl = gv_eq32(shifted, v128::from32p(0x62)); + const v128 result = eq_brsl | eq_brasl; + + if (!gv_testz(result)) + { + for (u32 j = 0; j < 4; j++) + { + if (result.u32r[j]) + { + calls.push_back(i + j * 4); + } + } + } + } + + calls.erase(std::remove_if(calls.begin(), calls.end(), [&](u32 caller) + { + // Check the validity of both the callee code and the following caller code + return !is_exec_code(caller, ls_start) || !is_exec_code(caller + 4, ls_start); + }), calls.end()); + + std::vector addrs; + + for (u32 addr : calls) + { + const spu_opcode_t op{read_from_ptr>(static_cast(ls_start), addr)}; + + const u32 func = op_branch_targets(addr, op)[0]; + + if (func == umax || std::count(addrs.begin(), addrs.end(), func)) + { + continue; + } + + addrs.push_back(func); + } + + std::sort(addrs.begin(), addrs.end()); + + return addrs; +} + spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) { // Result: addr + raw instruction data @@ -2647,6 +2900,8 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) } } + spu_program result2 = result; + while (lsa > 0 || limit < 0x40000) { const u32 initial_size = ::size32(result.data); @@ -3093,7 +3348,13 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point) { workload.clear(); workload.push_back(entry_point); - ensure(m_bbs.count(entry_point)); + if (!m_bbs.count(entry_point)) + { + std::string func_bad; + dump(result2, func_bad); + spu_log.error("%s", func_bad); + return {}; + } std::basic_string new_entries; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 8ab60cec6c9d..93766638937d 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -487,7 +487,27 @@ std::array op_branch_targets(u32 pc, spu_opcode_t op) case spu_itype::BRASL: { const int index = (type == spu_itype::BR || type == spu_itype::BRA || type == spu_itype::BRSL || type == spu_itype::BRASL ? 0 : 1); + + // if (type == spu_itype::BRASL || type == spu_itype::BRA) + // { + // res[index] = spu_branch_target(0, op.i16); + // } + // else + // { + // // Treat i16 as signed, this allows the caller to detect "overflows" and "underflows" in address in order to detect invalid branches + // // Example: + // // [0x3fffc] BR +4 -> BR 0 -> invalid + // // [0x3fffc] BR 0x3fff4 -> BR 0 -> invalid + // const u32 add = static_cast(op.si16); + // } + res[index] = (spu_branch_target(type == spu_itype::BRASL || type == spu_itype::BRA ? 0 : pc, op.i16)); + + if (res[0] == res[1]) + { + res[1] = umax; + } + break; } case spu_itype::IRET: @@ -4013,7 +4033,7 @@ bool spu_thread::check_mfc_interrupts(u32 next_pc) return false; } -bool spu_thread::is_exec_code(u32 addr, const u8* ls_ptr) +bool spu_thread::is_exec_code(u32 addr, const void* ls_ptr) { if (addr & ~0x3FFFC) { @@ -4022,8 +4042,8 @@ bool spu_thread::is_exec_code(u32 addr, const u8* ls_ptr) for (u32 i = 0; i < 30; i++) { - const u32 addr0 = addr + (i * 4); - const u32 op = read_from_ptr>(ls_ptr + addr0); + const u32 addr0 = spu_branch_target(addr); + const u32 op = read_from_ptr>(static_cast(ls_ptr) + addr0); const auto type = s_spu_itype.decode(op); if (type == spu_itype::UNK || !op) @@ -4033,9 +4053,38 @@ bool spu_thread::is_exec_code(u32 addr, const u8* ls_ptr) if (type & spu_itype::branch) { - // TODO - break; + const auto results = op_branch_targets(addr, spu_opcode_t{op}); + + if (results[0] == umax) + { + break; + } + + for (usz res_i = 1; res_i < results.size(); res_i++) + { + const u32 route_pc = results[res_i]; + + if (route_pc >= SPU_LS_SIZE) + { + continue; + } + + // Test the validity of a single instruction of the optional target + // This function can't be too slow and is unlikely to improve results by a great deal + const u32 op0 = read_from_ptr>(static_cast(ls_ptr) + route_pc); + const auto type0 = s_spu_itype.decode(op); + + if (type == spu_itype::UNK || !op) + { + return false; + } + } + + addr = spu_branch_target(results[0]); + continue; } + + addr += 4; } return true; diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index 92ed38c66e5a..635e5655c2e6 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -825,7 +825,8 @@ class spu_thread : public cpu_thread void set_events(u32 bits); void set_interrupt_status(bool enable); bool check_mfc_interrupts(u32 next_pc); - static bool is_exec_code(u32 addr, const u8* ls_ptr); // Only a hint, do not rely on it other than debugging purposes + static bool is_exec_code(u32 addr, const void* ls_ptr); // Only a hint, do not rely on it other than debugging purposes + static std::vector discover_functions(const void* ls_start, u32 /*entry*/); u32 get_ch_count(u32 ch); s64 get_ch_value(u32 ch); bool set_ch_value(u32 ch, u32 value); diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index 3355c4d4528c..6d6ef869e7f1 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -28,7 +28,7 @@ struct cfg_root : cfg::node cfg::string llvm_cpu{ this, "Use LLVM CPU" }; cfg::_int<0, 1024> llvm_threads{ this, "Max LLVM Compile Threads", 0 }; cfg::_bool ppu_llvm_greedy_mode{ this, "PPU LLVM Greedy Mode", false, false }; - cfg::_bool ppu_llvm_precompilation{ this, "PPU LLVM Precompilation", true }; + cfg::_bool llvm_precompilation{ this, "LLVM Precompilation", true }; cfg::_enum thread_scheduler{this, "Thread Scheduler Mode", thread_scheduler_mode::os}; cfg::_bool set_daz_and_ftz{ this, "Set DAZ and FTZ", false }; cfg::_enum spu_decoder{ this, "SPU Decoder", spu_decoder_type::llvm }; diff --git a/rpcs3/rpcs3qt/emu_settings_type.h b/rpcs3/rpcs3qt/emu_settings_type.h index 62d67e43fd45..a3371d57d957 100644 --- a/rpcs3/rpcs3qt/emu_settings_type.h +++ b/rpcs3/rpcs3qt/emu_settings_type.h @@ -19,7 +19,7 @@ enum class emu_settings_type SPUDebug, MFCDebug, MaxLLVMThreads, - PPULLVMPrecompilation, + LLVMPrecompilation, EnableTSX, AccurateGETLLAR, AccurateSpuDMA, @@ -204,7 +204,7 @@ inline static const QMap settings_location = { emu_settings_type::SPUDebug, { "Core", "SPU Debug"}}, { emu_settings_type::MFCDebug, { "Core", "MFC Debug"}}, { emu_settings_type::MaxLLVMThreads, { "Core", "Max LLVM Compile Threads"}}, - { emu_settings_type::PPULLVMPrecompilation, { "Core", "PPU LLVM Precompilation"}}, + { emu_settings_type::LLVMPrecompilation, { "Core", "LLVM Precompilation"}}, { emu_settings_type::EnableTSX, { "Core", "Enable TSX"}}, { emu_settings_type::AccurateGETLLAR, { "Core", "Accurate GETLLAR"}}, { emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}}, diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index f15369661781..3628fb33f01a 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -1452,8 +1452,8 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std m_emu_settings->EnhanceCheckBox(ui->fixupPPUVNAN, emu_settings_type::FixupPPUVNAN); SubscribeTooltip(ui->fixupPPUVNAN, tooltips.settings.fixup_ppuvnan); - m_emu_settings->EnhanceCheckBox(ui->ppuPrecompilation, emu_settings_type::PPULLVMPrecompilation); - SubscribeTooltip(ui->ppuPrecompilation, tooltips.settings.ppu_precompilation); + m_emu_settings->EnhanceCheckBox(ui->llvmPrecompilation, emu_settings_type::LLVMPrecompilation); + SubscribeTooltip(ui->llvmPrecompilation, tooltips.settings.llvm_precompilation); m_emu_settings->EnhanceCheckBox(ui->suspendSavestates, emu_settings_type::SuspendEmulationSavestateMode); SubscribeTooltip(ui->suspendSavestates, tooltips.settings.suspend_savestates); diff --git a/rpcs3/rpcs3qt/settings_dialog.ui b/rpcs3/rpcs3qt/settings_dialog.ui index dd9c67dd7a9b..bafa45567617 100644 --- a/rpcs3/rpcs3qt/settings_dialog.ui +++ b/rpcs3/rpcs3qt/settings_dialog.ui @@ -2394,9 +2394,9 @@ - + - PPU LLVM Precompilation + PPU/SPU LLVM Precompilation diff --git a/rpcs3/rpcs3qt/tooltips.h b/rpcs3/rpcs3qt/tooltips.h index 1982a108e0d5..6e73f2f4b2ce 100644 --- a/rpcs3/rpcs3qt/tooltips.h +++ b/rpcs3/rpcs3qt/tooltips.h @@ -75,7 +75,7 @@ class Tooltips : public QObject const QString ppu__static = tr("Interpreter (slow). Try this if PPU Recompiler (LLVM) doesn't work."); const QString ppu_dynamic = tr("Alternative interpreter (slow). May be faster than static interpreter. Try this if PPU Recompiler (LLVM) doesn't work."); const QString ppu_llvm = tr("Recompiles and caches the game's PPU code using the LLVM Recompiler once before running it for the first time.\nThis is by far the fastest option and should always be used.\nShould you face compatibility issues, fall back to one of the Interpreters and retry.\nIf unsure, use this option."); - const QString ppu_precompilation = tr("Searches the game's directory and precompiles extra PPU modules during boot.\nIf disabled, these modules will only be compiled when needed. Depending on the game, this might interrupt the gameplay unexpectedly and possibly frequently.\nOnly disable this if you want to get ingame more quickly."); + const QString llvm_precompilation = tr("Searches the game's directory and precompiles extra PPU and SPU modules during boot.\nIf disabled, these modules will only be compiled when needed. Depending on the game, this might interrupt the gameplay unexpectedly and possibly frequently.\nOnly disable this if you want to get ingame more quickly."); const QString spu__static = tr("Interpreter (slow). Try this if SPU Recompiler (LLVM) doesn't work."); const QString spu_dynamic = tr("Alternative interpreter (slow). May be faster than static interpreter. Try this if SPU Recompiler (LLVM) doesn't work."); const QString spu_asmjit = tr("Recompiles the game's SPU code using the ASMJIT Recompiler.\nThis is the fast option with very good compatibility.\nIf unsure, use this option.");