diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index 6bda27ba4c9f..7a3bdcac26fa 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -383,7 +383,7 @@ class ObjectCache final : public llvm::ObjectCache std::string name = m_path; name.append(module->getName()); fs::file(name, fs::rewrite).write(obj.getBufferStart(), obj.getBufferSize()); - LOG_SUCCESS(GENERAL, "LLVM: Created module: %s", module->getName().data()); + LOG_NOTICE(GENERAL, "LLVM: Created module: %s", module->getName().data()); } static std::unique_ptr load(const std::string& path) @@ -405,7 +405,7 @@ class ObjectCache final : public llvm::ObjectCache if (auto buf = load(path)) { - LOG_SUCCESS(GENERAL, "LLVM: Loaded module: %s", module->getName().data()); + LOG_NOTICE(GENERAL, "LLVM: Loaded module: %s", module->getName().data()); return buf; } diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 435aa7ddbf99..02ef95239295 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -1225,9 +1225,6 @@ extern void ppu_initialize() fmt::throw_exception("Failed to create cache directory: %s (%s)", _main->cache, fs::g_tls_error); } - // Initialize SPU cache - spu_cache::initialize(); - if (Emu.IsStopped()) { return; @@ -1248,6 +1245,9 @@ extern void ppu_initialize() { ppu_initialize(*ptr); } + + // Initialize SPU cache + spu_cache::initialize(); } extern void ppu_initialize(const ppu_module& info) diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index f68a6d2aa84d..9403836d18d7 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -7,6 +7,7 @@ #include "SPUThread.h" #include "SPUInterpreter.h" #include "Utilities/sysinfo.h" +#include "PPUAnalyser.h" #include #include @@ -32,6 +33,13 @@ std::unique_ptr spu_recompiler_base::make_asmjit_recompiler spu_runtime::spu_runtime() { + m_cache_path = fxm::check_unlocked()->cache; + + if (g_cfg.core.spu_debug) + { + fs::file(m_cache_path + "spu.log", fs::rewrite); + } + LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized..."); // Initialize lookup table @@ -97,7 +105,12 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) using namespace asmjit; SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode); - dis_asm.offset = reinterpret_cast(func.data() + 1) - func[0]; + dis_asm.offset = reinterpret_cast(func.data() + 1); + + if (g_cfg.core.spu_block_size != spu_block_size_type::giga) + { + dis_asm.offset -= func[0]; + } StringLogger logger; logger.addOptions(Logger::kOptionBinaryForm); @@ -163,15 +176,16 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Start compilation m_pos = func[0]; - const u32 start = m_pos; - const u32 end = m_pos + (func.size() - 1) * 4; + m_size = ::size32(func) * 4 - 4; + const u32 start = m_pos * (g_cfg.core.spu_block_size != spu_block_size_type::giga); + const u32 end = start + m_size; // Create instruction labels (TODO: some of them are unnecessary) for (u32 i = 1; i < func.size(); i++) { if (func[i]) { - instr_labels[i * 4 - 4 + m_pos] = c->newLabel(); + instr_labels[i * 4 - 4 + start] = c->newLabel(); } } @@ -210,15 +224,15 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) { // Disable check (unsafe) } - else if (func.size() - 1 == 1) + else if (m_size == 4) { - c->cmp(x86::dword_ptr(*ls, m_pos), func[1]); + c->cmp(x86::dword_ptr(*ls, start), func[1]); c->jnz(label_diff); } - else if (func.size() - 1 == 2) + else if (m_size == 8) { c->mov(*qw1, static_cast(func[2]) << 32 | func[1]); - 
c->cmp(*qw1, x86::qword_ptr(*ls, m_pos)); + c->cmp(*qw1, x86::qword_ptr(*ls, start)); c->jnz(label_diff); } else if (utils::has_512() && false) @@ -226,16 +240,15 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // AVX-512 optimized check using 512-bit registers (disabled) words_align = 64; - const u32 starta = m_pos & -64; + const u32 starta = start & -64; const u32 enda = ::align(end, 64); const u32 sizea = (enda - starta) / 64; verify(HERE), sizea; // Initialize pointers c->lea(x86::rax, x86::qword_ptr(label_code)); - c->lea(*qw1, x86::qword_ptr(*ls, starta)); u32 code_off = 0; - u32 ls_off = starta; + u32 ls_off = -8192; for (u32 j = starta; j < enda; j += 64) { @@ -246,6 +259,8 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) continue; } + const bool first = ls_off == -8192; + // Ensure small distance for disp8*N if (j - ls_off >= 8192) { @@ -279,7 +294,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) c->vmovdqa32(x86::zmm0, x86::zword_ptr(*qw1, j - ls_off)); } - if (j == starta) + if (first) { c->vpcmpud(x86::k1, x86::zmm0, x86::zword_ptr(x86::rax, code_off), 4); } @@ -291,7 +306,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = j; i < j + 64; i += 4) { - words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); } code_off += 64; @@ -305,7 +320,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // AVX-512 optimized check using 256-bit registers words_align = 32; - const u32 starta = m_pos & -32; + const u32 starta = start & -32; const u32 enda = ::align(end, 32); const u32 sizea = (enda - starta) / 32; verify(HERE), sizea; @@ -330,10 +345,10 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = starta; i < enda; i += 4) { - words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); } } - else if (sizea == 2 && (end - m_pos) <= 32) + else if (sizea == 2 && (end - start) <= 32) { const u32 cmask0 = get_code_mask(starta, starta + 32); const u32 cmask1 = get_code_mask(starta + 32, enda); @@ -347,7 +362,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = starta; i < starta + 32; i += 4) { - words.push_back(i >= m_pos ? func[(i - m_pos) / 4 + 1] : i + 32 < end ? func[(i + 32 - m_pos) / 4 + 1] : 0); + words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0); } } else @@ -356,9 +371,8 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Initialize pointers c->lea(x86::rax, x86::qword_ptr(label_code)); - c->lea(*qw1, x86::qword_ptr(*ls, starta)); u32 code_off = 0; - u32 ls_off = starta; + u32 ls_off = -4096; for (u32 j = starta; j < enda; j += 32) { @@ -369,6 +383,8 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) continue; } + const bool first = ls_off == -4096; + // Ensure small distance for disp8*N if (j - ls_off >= 4096) { @@ -398,7 +414,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) } // Perform bitwise comparison and accumulate - if (j == starta) + if (first) { c->vpxor(x86::ymm0, x86::ymm1, x86::yword_ptr(x86::rax, code_off)); } @@ -409,7 +425,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = j; i < j + 32; i += 4) { - words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? 
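/* Annotation (not part of the patch): the table built here mirrors the
   function's expected LS contents, with words outside [start, end) stored
   as zeros and masked off when loaded, so whole aligned chunks can be
   compared at once. A scalar sketch of what the generated SIMD check does
   (ls_word() is a hypothetical accessor for the big-endian word at i):

   u32 diff = 0;
   for (u32 i = start; i < end; i += 4)
   {
       diff |= ls_word(i) ^ func[(i - start) / 4 + 1]; // accumulate mismatches
   }
   if (diff != 0)
   {
       // code in LS changed: jump to label_diff and re-dispatch
   }
*/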
func[(i - start) / 4 + 1] : 0); } code_off += 32; @@ -424,7 +440,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Mainstream AVX words_align = 32; - const u32 starta = m_pos & -32; + const u32 starta = start & -32; const u32 enda = ::align(end, 32); const u32 sizea = (enda - starta) / 32; verify(HERE), sizea; @@ -449,10 +465,10 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = starta; i < enda; i += 4) { - words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); } } - else if (sizea == 2 && (end - m_pos) <= 32) + else if (sizea == 2 && (end - start) <= 32) { const u32 cmask0 = get_code_mask(starta, starta + 32); const u32 cmask1 = get_code_mask(starta + 32, enda); @@ -466,7 +482,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = starta; i < starta + 32; i += 4) { - words.push_back(i >= m_pos ? func[(i - m_pos) / 4 + 1] : i + 32 < end ? func[(i + 32 - m_pos) / 4 + 1] : 0); + words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0); } } else @@ -541,7 +557,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = j; i < j + 32; i += 4) { - words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); } code_off += 32; @@ -568,7 +584,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Compatible SSE2 words_align = 16; - const u32 starta = m_pos & -16; + const u32 starta = start & -16; const u32 enda = ::align(end, 16); const u32 sizea = (enda - starta) / 16; verify(HERE), sizea; @@ -614,10 +630,10 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) } // Determine which value will be duplicated at hole positions - const u32 w3 = func.at((j - m_pos + ~::cntlz32(cmask, true) % 4 * 4) / 4 + 1); - words.push_back(cmask & 1 ? func[(j - m_pos + 0) / 4 + 1] : w3); - words.push_back(cmask & 2 ? func[(j - m_pos + 4) / 4 + 1] : w3); - words.push_back(cmask & 4 ? func[(j - m_pos + 8) / 4 + 1] : w3); + const u32 w3 = func.at((j - start + ~::cntlz32(cmask, true) % 4 * 4) / 4 + 1); + words.push_back(cmask & 1 ? func[(j - start + 0) / 4 + 1] : w3); + words.push_back(cmask & 2 ? func[(j - start + 4) / 4 + 1] : w3); + words.push_back(cmask & 4 ? func[(j - start + 8) / 4 + 1] : w3); words.push_back(w3); // PSHUFD immediate table for all possible hole mask values, holes repeat highest valid word @@ -641,7 +657,9 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) 0b11100100, // full }; - const auto& dest = !order++ ? reg0 : reg1; + const bool first = !order++; + + const auto& dest = first ? 
reg0 : reg1; // Load aligned code block from LS if (cmask != 0xf) @@ -656,7 +674,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Perform bitwise comparison and accumulate c->xorps(dest, x86::dqword_ptr(x86::rax, code_off)); - if (j != starta && j != starta + 16) + if (first) { c->orps(reg0, dest); } @@ -690,24 +708,38 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) c->vzeroupper(); } - c->inc(SPU_OFF_64(block_counter)); + // Acknowledge success and add statistics + c->add(SPU_OFF_64(block_counter), ::size32(words) / (words_align / 4)); + + if (g_cfg.core.spu_block_size == spu_block_size_type::giga && m_pos != start) + { + // Jump to the entry point if necessary + c->jmp(instr_labels[m_pos]); + m_pos = -1; + } for (u32 i = 1; i < func.size(); i++) { const u32 pos = start + (i - 1) * 4; + const u32 op = se_storage::swap(func[i]); if (g_cfg.core.spu_debug) { // Disasm dis_asm.dump_pc = pos; dis_asm.disasm(pos); - compiler.comment(dis_asm.last_opcode.c_str()); - log += dis_asm.last_opcode; - log += '\n'; - } - // Get opcode - const u32 op = se_storage::swap(func[i]); + if (op) + { + log += '>'; + log += dis_asm.last_opcode; + log += '\n'; + } + else + { + fmt::append(log, ">[%08x] xx xx xx xx: \n", pos); + } + } if (!op) { @@ -738,6 +770,12 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) c->bind(found->second); } + if (g_cfg.core.spu_debug) + { + // Disasm inside the ASMJIT log + compiler.comment(dis_asm.last_opcode.c_str()); + } + // Execute recompiler function (this->*s_spu_decoder.decode(op))({op}); @@ -784,6 +822,10 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) c->align(kAlignData, 8); c->bind(instr_table); + // Get actual instruction table bounds + const u32 start = instr_labels.begin()->first; + const u32 end = instr_labels.rbegin()->first + 4; + for (u32 addr = start; addr < end; addr += 4) { const auto found = instr_labels.find(addr); @@ -825,6 +867,22 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Register function fn_location = fn; + if (g_cfg.core.spu_debug) + { + // Add ASMJIT logs + fmt::append(log, "Address: %p\n\n", fn); + log += logger.getString(); + log += "\n\n\n"; + + // Append log file + fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log); + } + + if (m_cache && g_cfg.core.spu_cache) + { + m_cache->add(func); + } + // Generate a dispatcher (übertrampoline) std::vector addrv{func[0]}; const auto beg = m_spurt->m_map.lower_bound(addrv); @@ -886,6 +944,12 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) it = it2; size1 = w.size - size2; + if (w.level >= w.beg->first.size()) + { + // Cannot split: smallest function is a prefix of bigger ones (TODO) + break; + } + const u32 x1 = w.beg->first.at(w.level); if (!x1) @@ -914,6 +978,20 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) } } + if (w.label.isValid()) + { + c->align(kAlignCode, 16); + c->bind(w.label); + } + + if (w.level >= w.beg->first.size()) + { + // If functions cannot be compared, assume smallest function + LOG_WARNING(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level); + c->jmp(imm_ptr(w.beg->second ? 
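/* Annotation (not part of the patch): this branch handles the case where one
   compiled function's contents are an exact prefix of a longer one's, so no
   instruction word exists at this level to tell them apart. The trampoline
   conservatively assumes the shortest candidate, conceptually:

   // jmp w.beg->second ? w.beg->second : &spu_recompiler_base::dispatch

   jumping to it if it is already compiled, and otherwise to dispatch, which
   analyses and compiles the code at runtime.
*/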
w.beg->second : &dispatch)); + continue; + } + // Value for comparison const u32 x = it->first.at(w.level); @@ -933,13 +1011,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) size2++; } - if (w.label.isValid()) - { - c->align(kAlignCode, 16); - c->bind(w.label); - } - - c->cmp(x86::dword_ptr(*ls, func[0] + (w.level - 1) * 4), x); + c->cmp(x86::dword_ptr(*ls, start + (w.level - 1) * 4), x); // Low subrange target label Label label_below; @@ -1044,22 +1116,6 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) m_spurt->m_dispatcher[func[0] / 4] = tr; } - if (g_cfg.core.spu_debug) - { - // Add ASMJIT logs - fmt::append(log, "Address: %p (%p)\n\n", fn, +m_spurt->m_dispatcher[func[0] / 4]); - log += logger.getString(); - log += "\n\n\n"; - - // Append log file - fs::file(Emu.GetCachePath() + "SPUJIT.log", fs::write + fs::append).write(log); - } - - if (m_cache && g_cfg.core.spu_cache) - { - m_cache->add(func); - } - return fn; } @@ -1131,17 +1187,6 @@ static void check_state(SPUThread* _spu, spu_function_t _ret) _ret = &check_state_ret; } - if (g_cfg.core.spu_block_size != spu_block_size_type::safe) - { - // Get stack pointer, try to use native return address (check SPU return address) - const auto x = _spu->stack_mirror[(_spu->gpr[1]._u32[3] & 0x3fff0) >> 4]; - - if (x._u32[2] == _spu->pc) - { - _ret = reinterpret_cast(x._u64[0]); - } - } - _ret(*_spu, _spu->_ptr(0), nullptr); } @@ -1195,36 +1240,12 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) { using namespace asmjit; - if (g_cfg.core.spu_block_size != spu_block_size_type::giga && !jt) - { - // Simply external call (return or indirect call) - c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); - c->xor_(qw0->r32(), qw0->r32()); - } - else - { - if (!instr_table.isValid()) - { - // Request instruction table - instr_table = c->newLabel(); - } - - const u32 start = instr_labels.begin()->first; - const u32 end = instr_labels.rbegin()->first + 4; - - // Load indirect jump address, choose between local and external - c->lea(x86::r10, x86::qword_ptr(instr_table)); - c->lea(*qw1, x86::qword_ptr(*addr, 0 - start)); - c->xor_(qw0->r32(), qw0->r32()); - c->cmp(qw1->r32(), end - start); - c->cmovae(qw1->r32(), qw0->r32()); - c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0)); - c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); - } + // Initialize third arg to zero + c->xor_(qw0->r32(), qw0->r32()); if (op.d) { - c->lock().btr(SPU_OFF_8(interrupts_enabled), 0); + c->mov(SPU_OFF_8(interrupts_enabled), 0); } else if (op.e) { @@ -1232,7 +1253,7 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) Label intr = c->newLabel(); Label fail = c->newLabel(); - c->lock().bts(SPU_OFF_8(interrupts_enabled), 0); + c->mov(SPU_OFF_8(interrupts_enabled), 1); c->mov(qw1->r32(), SPU_OFF_32(ch_event_mask)); c->test(qw1->r32(), ~SPU_EVENT_INTR_IMPLEMENTED); c->jnz(fail); @@ -1244,19 +1265,50 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) c->mov(SPU_OFF_32(pc), *addr); c->mov(addr->r64(), reinterpret_cast(vm::base(0xffdead00))); c->mov(asmjit::x86::dword_ptr(addr->r64()), "INTR"_u32); + + // Save addr in srr0 and disable interrupts c->bind(intr); - c->lock().btr(SPU_OFF_8(interrupts_enabled), 0); + c->mov(SPU_OFF_8(interrupts_enabled), 0); c->mov(SPU_OFF_32(srr0), *addr); - c->mov(*addr, qw0->r32()); - c->mov(x86::r10, x86::qword_ptr(*cpu, 
offset32(&SPUThread::jit_dispatcher))); + + // Test for BR/BRA instructions (they are equivalent at zero pc) + c->mov(*addr, x86::dword_ptr(*ls)); + c->and_(*addr, 0xfffffffd); + c->xor_(*addr, 0x30); + c->bswap(*addr); + c->test(*addr, 0xff80007f); + c->cmovnz(*addr, qw0->r32()); + c->shr(*addr, 5); c->align(kAlignCode, 16); c->bind(no_intr); } - Label label_check = c->newLabel(); - c->mov(SPU_OFF_32(pc), *addr); - c->cmp(SPU_OFF_32(state), 0); - c->jnz(label_check); + if (!jt && g_cfg.core.spu_block_size != spu_block_size_type::giga) + { + // Simply external call (return or indirect call) + c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); + } + else + { + if (!instr_table.isValid()) + { + // Request instruction table + instr_table = c->newLabel(); + } + + // Get actual instruction table bounds + const u32 start = instr_labels.begin()->first; + const u32 end = instr_labels.rbegin()->first + 4; + + // Load indirect jump address, choose between local and external + c->lea(*qw1, x86::qword_ptr(addr->r64(), 0 - start)); + c->lea(x86::r10, x86::qword_ptr(instr_table)); + c->cmp(qw1->r32(), end - start); + c->lea(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0)); + c->lea(*qw1, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); + c->cmovae(x86::r10, *qw1); + c->mov(x86::r10, x86::qword_ptr(x86::r10)); + } if (g_cfg.core.spu_block_size != spu_block_size_type::safe && ret) { @@ -1268,6 +1320,10 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) c->cmove(x86::r10, x86::qword_ptr(*qw1)); } + Label label_check = c->newLabel(); + c->mov(SPU_OFF_32(pc), *addr); + c->cmp(SPU_OFF_32(state), 0); + c->jnz(label_check); c->jmp(x86::r10); c->bind(label_check); c->mov(*ls, x86::r10); diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index 6388cb157c89..ce43792c19b3 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -19,6 +19,9 @@ class spu_runtime // All dispatchers std::array, 0x10000> m_dispatcher; + // Debug module output location + std::string m_cache_path; + friend class spu_recompiler; public: diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index dd74eb00ade0..45330ad5eb33 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -90,14 +90,8 @@ void spu_cache::initialize() return; } - if (g_cfg.core.spu_decoder == spu_decoder_type::llvm) - { - // Force Safe mode - g_cfg.core.spu_block_size.from_default(); - } - // SPU cache file (version + block size type) - const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v3.dat"; + const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v4.dat"; auto cache = std::make_shared(loc); @@ -115,11 +109,6 @@ void spu_cache::initialize() if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit) { - if (g_cfg.core.spu_debug) - { - fs::file(Emu.GetCachePath() + "SPUJIT.log", fs::rewrite); - } - compiler = spu_recompiler_base::make_asmjit_recompiler(); } @@ -138,7 +127,12 @@ void spu_cache::initialize() // Fake LS std::vector> ls(0x10000); - // Initialize progress dialog + // Initialize progress dialog (wait for previous progress done) + while (g_progr_ptotal) + { + std::this_thread::sleep_for(5ms); + } + g_progr = "Building SPU cache..."; g_progr_ptotal += func_list.size(); @@ -151,8 +145,12 @@ void spu_cache::initialize() continue; 
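/* Annotation (not part of the patch): each cached entry is a vector of u32
   where func[0] holds the entry point and func[1..] the instruction words.
   With the Giga block size the code is addressed from LS offset 0, otherwise
   from func[0]; that is what the multiply-by-condition below computes. An
   equivalent, more explicit form:

   const u32 start = g_cfg.core.spu_block_size == spu_block_size_type::giga
       ? 0          // Giga: data is laid out from LS address 0
       : func[0];   // other modes: data is laid out from the entry point
*/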
} + // Get data start + const u32 start = func[0] * (g_cfg.core.spu_block_size != spu_block_size_type::giga); + const u32 size0 = ::size32(func); + // Initialize LS with function data only - for (u32 i = 1, pos = func[0]; i < func.size(); i++, pos += 4) + for (u32 i = 1, pos = start; i < size0; i++, pos += 4) { ls[pos / 4] = se_storage::swap(func[i]); } @@ -160,15 +158,15 @@ void spu_cache::initialize() // Call analyser std::vector func2 = compiler->block(ls.data(), func[0]); - if (func2.size() != func.size()) + if (func2.size() != size0) { - LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, func.size() - 1); + LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1); } compiler->compile(std::move(func)); // Clear fake LS - for (u32 i = 1, pos = func2[0]; i < func2.size(); i++, pos += 4) + for (u32 i = 1, pos = start; i < func2.size(); i++, pos += 4) { if (se_storage::swap(func2[i]) != ls[pos / 4]) { @@ -178,6 +176,11 @@ void spu_cache::initialize() ls[pos / 4] = 0; } + if (func2.size() != size0) + { + std::memset(ls.data(), 0, 0x40000); + } + g_progr_pdone++; } @@ -236,11 +239,22 @@ void spu_recompiler_base::dispatch(SPUThread& spu, void*, u8* rip) // Compile verify(HERE), spu.jit->compile(spu.jit->block(spu._ptr(0), spu.pc)); spu.jit_dispatcher[spu.pc / 4] = spu.jit->get(spu.pc); + + // Diagnostic + if (g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + const v128 _info = spu.stack_mirror[(spu.gpr[1]._u32[3] & 0x3fff0) >> 4]; + + if (_info._u64[0] != -1) + { + LOG_TRACE(SPU, "Called from 0x%x", _info._u32[2] - 4); + } + } } void spu_recompiler_base::branch(SPUThread& spu, void*, u8* rip) { - // Compile + // Compile (TODO: optimize search of the existing functions) const auto func = verify(HERE, spu.jit->compile(spu.jit->block(spu._ptr(0), spu.pc))); spu.jit_dispatcher[spu.pc / 4] = spu.jit->get(spu.pc); @@ -282,24 +296,27 @@ void spu_recompiler_base::branch(SPUThread& spu, void*, u8* rip) #endif } -std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) +std::vector spu_recompiler_base::block(const be_t* ls, u32 entry_point) { // Result: addr + raw instruction data std::vector result; result.reserve(256); - result.push_back(lsa); + result.push_back(entry_point); // Initialize block entries m_block_info.reset(); - m_block_info.set(lsa / 4); + m_block_info.set(entry_point / 4); + m_entry_info.reset(); + m_entry_info.set(entry_point / 4); // Simple block entry workload list - std::vector wl; - wl.push_back(lsa); + std::vector workload; + workload.push_back(entry_point); - m_regmod.fill(0xff); + std::memset(m_regmod.data(), 0xff, sizeof(m_regmod)); m_targets.clear(); m_preds.clear(); + m_preds[entry_point]; // Value flags (TODO) enum class vf : u32 @@ -316,46 +333,72 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) // Associated constant values for 32-bit preferred slot std::array values; - for (u32 wi = 0; wi < wl.size();) + // SYNC instruction found + bool sync = false; + + u32 hbr_loc = 0; + u32 hbr_tg = -1; + + // Result bounds + u32 lsa = entry_point; + u32 limit = 0x40000; + + if (g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + // In Giga mode, all data starts from the address 0 + lsa = 0; + } + + for (u32 wi = 0, wa = workload[0]; wi < workload.size();) { const auto next_block = [&] { // Reset value information vflags.fill({}); + sync = false; + hbr_loc = 0; + hbr_tg = -1; wi++; + + if (wi < workload.size()) + { + wa = workload[wi]; + } }; - const u32 pos = 
wl[wi]; + const u32 pos = wa; const auto add_block = [&](u32 target) { - // Verify validity of the new target (TODO) - if (target > lsa) + // Validate new target (TODO) + if (target > lsa && target < limit) { // Check for redundancy if (!m_block_info[target / 4]) { m_block_info[target / 4] = true; - wl.push_back(target); + workload.push_back(target); } - // Add predecessor (check if already exists) - for (u32 pred : m_preds[target]) + // Add predecessor + if (m_preds[target].find_first_of(pos) == -1) { - if (pred == pos) - { - return; - } + m_preds[target].push_back(pos); } - - m_preds[target].push_back(pos); } }; + if (pos < lsa || pos >= limit) + { + // Don't analyse if already beyond the limit + next_block(); + continue; + } + const u32 data = ls[pos / 4]; const auto op = spu_opcode_t{data}; - wl[wi] += 4; + wa += 4; m_targets.erase(pos); @@ -371,7 +414,6 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) { // Stop before invalid instructions (TODO) m_targets[pos].push_back(-1); - m_block_info[pos / 4] = true; next_block(); continue; } @@ -381,11 +423,10 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) case spu_itype::STOP: case spu_itype::STOPD: { - if (data == 0 || data == 3) + if (data == 0) { // Stop before null data m_targets[pos].push_back(-1); - m_block_info[pos / 4] = true; next_block(); continue; } @@ -398,11 +439,22 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) break; } + if (type == spu_itype::SYNC) + { + // Remember + sync = true; + } + break; } case spu_itype::IRET: { + if (op.d && op.e) + { + LOG_ERROR(SPU, "[0x%x] Invalid interrupt flags (DE)", pos); + } + m_targets[pos].push_back(-1); next_block(); break; @@ -410,15 +462,22 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) case spu_itype::BI: case spu_itype::BISL: + case spu_itype::BISLED: case spu_itype::BIZ: case spu_itype::BINZ: case spu_itype::BIHZ: case spu_itype::BIHNZ: { + if (op.d && op.e) + { + LOG_ERROR(SPU, "[0x%x] Invalid interrupt flags (DE)", pos); + } + const auto af = vflags[op.ra]; const auto av = values[op.ra]; + const bool sl = type == spu_itype::BISL || type == spu_itype::BISLED; - if (type == spu_itype::BISL) + if (sl) { m_regmod[pos / 4] = op.rt; vflags[op.rt] = +vf::is_const; @@ -428,23 +487,66 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) if (test(af, vf::is_const)) { const u32 target = spu_branch_target(av); - LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target); if (target == pos + 4) { // Nop (unless BISL) - LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to next!", lsa, pos); + LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to next!", result[0], pos); + } + else + { + LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", result[0], pos, target); } m_targets[pos].push_back(target); - if (type != spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::giga) + if (!sl) + { + if (sync) + { + LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring branch to 0x%x (SYNC)", result[0], pos, target); + + if (entry_point < target) + { + limit = std::min(limit, target); + } + } + else + { + if (op.d || op.e) + { + m_entry_info[target / 4] = true; + } + + add_block(target); + } + } + + if (sl && g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + if (sync) + { + LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring call to 0x%x (SYNC)", result[0], pos, target); + + if (target > entry_point) + { + limit = std::min(limit, target); + } + } + else + { + m_entry_info[target / 4] = true; 
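/* Annotation (not part of the patch): marking m_entry_info here is what later
   partitions the compilation unit into separately callable function chunks;
   call targets and the return point after a call both become chunk entries.
   Branch targets are normalized with spu_branch_target(), which (assuming
   the helper defined elsewhere in the codebase) wraps within the 256 KB
   local storage and keeps word alignment, roughly:

   constexpr u32 spu_branch_target(u32 pc, u32 imm = 0)
   {
       return (pc + (imm << 2)) & 0x3fffc; // word-aligned, wrapped to LS size
   }
*/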
+ add_block(target); + } + } + else if (sl && target > entry_point) { - add_block(target); + limit = std::min(limit, target); } - if (type == spu_itype::BISL && g_cfg.core.spu_block_size != spu_block_size_type::safe) + if (sl && g_cfg.core.spu_block_size != spu_block_size_type::safe) { + m_entry_info[pos / 4 + 1] = true; m_targets[pos].push_back(pos + 4); add_block(pos + 4); } @@ -455,7 +557,6 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) std::basic_string jt_abs; std::basic_string jt_rel; const u32 start = pos + 4; - const u32 limit = 0x40000; u64 dabs = 0; u64 drel = 0; @@ -469,13 +570,13 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) break; } - if (target >= lsa && target < limit) + if (target >= lsa && target < 0x40000) { // Possible jump table entry (absolute) jt_abs.push_back(target); } - if (target + start >= lsa && target + start < limit) + if (target + start >= lsa && target + start < 0x40000) { // Possible jump table entry (relative) jt_rel.push_back(target + start); @@ -528,6 +629,7 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) { add_block(jt_abs[i]); result[(start - lsa) / 4 + 1 + i] = se_storage::swap(jt_abs[i]); + m_targets[start + i * 4].push_back(-1); } m_targets.emplace(pos, std::move(jt_abs)); @@ -546,14 +648,40 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) { add_block(jt_rel[i]); result[(start - lsa) / 4 + 1 + i] = se_storage::swap(jt_rel[i] - start); + m_targets[start + i * 4].push_back(-1); } m_targets.emplace(pos, std::move(jt_rel)); } } + else if (start + 12 * 4 < limit && + ls[start / 4 + 0] == 0x1ce00408 && + ls[start / 4 + 1] == 0x24000389 && + ls[start / 4 + 2] == 0x24004809 && + ls[start / 4 + 3] == 0x24008809 && + ls[start / 4 + 4] == 0x2400c809 && + ls[start / 4 + 5] == 0x24010809 && + ls[start / 4 + 6] == 0x24014809 && + ls[start / 4 + 7] == 0x24018809 && + ls[start / 4 + 8] == 0x1c200807 && + ls[start / 4 + 9] == 0x2401c809) + { + LOG_WARNING(SPU, "[0x%x] Pattern 1 detected (hbr=0x%x:0x%x)", pos, hbr_loc, hbr_tg); + + // Add 8 targets (TODO) + for (u32 addr = start + 4; addr < start + 36; addr += 4) + { + m_targets[pos].push_back(addr); + add_block(addr); + } + } + else if (hbr_loc > start && hbr_loc < limit && hbr_tg == start) + { + LOG_WARNING(SPU, "[0x%x] No patterns detected (hbr=0x%x:0x%x)", pos, hbr_loc, hbr_tg); + } } - if (type == spu_itype::BI || type == spu_itype::BISL) + if (type == spu_itype::BI || sl) { if (type == spu_itype::BI || g_cfg.core.spu_block_size == spu_block_size_type::safe) { @@ -564,6 +692,7 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) } else { + m_entry_info[pos / 4 + 1] = true; m_targets[pos].push_back(pos + 4); add_block(pos + 4); } @@ -597,14 +726,28 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) if (g_cfg.core.spu_block_size != spu_block_size_type::safe) { + m_entry_info[pos / 4 + 1] = true; m_targets[pos].push_back(pos + 4); add_block(pos + 4); } - if (g_cfg.core.spu_block_size == spu_block_size_type::giga) + if (g_cfg.core.spu_block_size == spu_block_size_type::giga && !sync) { + m_entry_info[target / 4] = true; add_block(target); } + else + { + if (g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", result[0], pos, target); + } + + if (target > entry_point) + { + limit = std::min(limit, target); + } + } next_block(); break; @@ -644,9 +787,6 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) case 
spu_itype::HGTI: case spu_itype::HLGT: case spu_itype::HLGTI: - case spu_itype::HBR: - case spu_itype::HBRA: - case spu_itype::HBRR: case spu_itype::LNOP: case spu_itype::NOP: case spu_itype::MTSPR: @@ -661,6 +801,27 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) break; } + case spu_itype::HBR: + { + hbr_loc = spu_branch_target(pos, op.roh << 7 | op.rt); + hbr_tg = test(vflags[op.ra], vf::is_const) && !op.c ? values[op.ra] & 0x3fffc : -1; + break; + } + + case spu_itype::HBRA: + { + hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt); + hbr_tg = spu_branch_target(0x0, op.i16); + break; + } + + case spu_itype::HBRR: + { + hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt); + hbr_tg = spu_branch_target(pos, op.i16); + break; + } + case spu_itype::IL: { m_regmod[pos / 4] = op.rt; @@ -812,12 +973,12 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) } } - while (g_cfg.core.spu_block_size != spu_block_size_type::giga) + while (g_cfg.core.spu_block_size != spu_block_size_type::giga || limit < 0x40000) { const u32 initial_size = result.size(); - // Check unreachable blocks in safe and mega modes (TODO) - u32 limit = lsa + result.size() * 4 - 4; + // Check unreachable blocks + limit = std::min(limit, lsa + initial_size * 4 - 4); for (auto& pair : m_preds) { @@ -839,8 +1000,8 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) { for (u32 j = workload[i];; j -= 4) { - // Go backward from an address until the entry point (=lsa) is reached - if (j == lsa) + // Go backward from an address until the entry point is reached + if (j == result[0]) { reachable = true; break; @@ -906,21 +1067,20 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) { if (result[i] == 0) { - const u32 pos = lsa + (i - 1) * 4; + const u32 pos = lsa + (i - 1) * 4; const u32 data = ls[pos / 4]; // Allow only NOP or LNOP instructions in holes if (data == 0x200000 || (data & 0xffffff80) == 0x40200000) { - if (i + 1 < result.size()) - { - result[i] = se_storage::swap(data); - continue; - } + continue; } - result.resize(valid_size + 1); - break; + if (g_cfg.core.spu_block_size != spu_block_size_type::giga) + { + result.resize(valid_size + 1); + break; + } } else { @@ -928,6 +1088,9 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) } } + // Even if NOP or LNOP, should be removed at the end + result.resize(valid_size + 1); + // Repeat if blocks were removed if (result.size() == initial_size) { @@ -935,6 +1098,188 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) } } + limit = std::min(limit, lsa + ::size32(result) * 4 - 4); + + // Cleanup block info + for (u32 i = 0; i < workload.size(); i++) + { + const u32 addr = workload[i]; + + if (addr < lsa || addr >= limit || !result[(addr - lsa) / 4 + 1]) + { + m_block_info[addr / 4] = false; + m_entry_info[addr / 4] = false; + m_preds.erase(addr); + } + } + + // Complete m_preds and associated m_targets for adjacent blocks + for (auto& pair : m_preds) + { + // Erase impossible predecessors + const auto new_end = std::remove_if(pair.second.begin(), pair.second.end(), [&](u32 addr) + { + return addr < lsa || addr >= limit; + }); + + pair.second.erase(new_end, pair.second.end()); + + // Don't add fallthrough target if all predecessors are removed + if (pair.second.empty() && !m_entry_info[pair.first / 4]) + { + // If not an entry point, remove the block completely + m_block_info[pair.first / 4] = false; + continue; + } + + // Previous instruction address + const u32 prev = (pair.first - 4) & 
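/* Annotation (not part of the patch): the 0x3fffc mask keeps the address
   word-aligned inside the 256 KB local storage, so computing the previous
   instruction of the block at LS address 0 wraps instead of underflowing.
   This pass adds the implicit fall-through edge from the previous
   instruction when no explicit target list was recorded for it. */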
0x3fffc; + + const auto tfound = m_targets.find(prev); + + // TODO: can it be empty? + if (tfound == m_targets.end() || tfound->second.empty()) + { + // TODO: check the correctness + if (prev >= lsa && prev < limit && result[(prev - lsa) / 4 + 1]) + { + // Add target and the predecessor + m_targets[prev].push_back(pair.first); + pair.second.push_back(prev); + } + } + } + + // Erase unreachable targets + for (auto& pair : m_targets) + { + // Erase unreachable targets + const auto new_end = std::remove_if(pair.second.begin(), pair.second.end(), [&](u32 addr) + { + if (addr >> 31) + { + return false; + } + + return addr < lsa || addr >= limit; + }); + + pair.second.erase(new_end, pair.second.end()); + + if (pair.second.empty()) + { + // Add default no-target + pair.second.push_back(-1); + } + } + + // Fill holes which contain only NOP and LNOP instructions + for (u32 i = 1, nnop = 0, vsize = 0; i <= result.size(); i++) + { + if (i >= result.size() || result[i]) + { + if (nnop && nnop == i - vsize - 1) + { + // Write only complete NOP sequence + for (u32 j = vsize + 1; j < i; j++) + { + result[j] = se_storage::swap(ls[lsa / 4 + j - 1]); + } + } + + nnop = 0; + vsize = i; + } + else + { + const u32 pos = lsa + (i - 1) * 4; + const u32 data = ls[pos / 4]; + + if (data == 0x200000 || (data & 0xffffff80) == 0x40200000) + { + nnop++; + } + } + } + + // Fill entry map, add entry points + while (g_cfg.core.spu_block_size != spu_block_size_type::safe) + { + workload.clear(); + workload.push_back(entry_point); + std::memset(m_entry_map.data(), 0, sizeof(m_entry_map)); + + std::basic_string new_entries; + + for (u32 wi = 0; wi < workload.size(); wi++) + { + const u32 addr = workload[wi]; + const u16 _new = m_entry_map[addr / 4]; + + if (!m_entry_info[addr / 4]) + { + // Check block predecessors + for (u32 pred : m_preds[addr]) + { + const u16 _old = m_entry_map[pred / 4]; + + if (_old && _old != _new) + { + // If block has multiple 'entry' points, it becomes an entry point itself + new_entries.push_back(addr); + } + } + } + + // Fill value + const u16 root = m_entry_info[addr / 4] ? ::narrow(addr / 4) : _new; + + for (u32 wa = addr; wa < limit && result[(wa - lsa) / 4 + 1]; wa += 4) + { + // Fill entry address for the instruction + m_entry_map[wa / 4] = root; + + // Find targets (also means end of the block) + const auto tfound = m_targets.find(wa); + + if (tfound == m_targets.end() || tfound->second.empty() || tfound->second[0] == -1) + { + continue; + } + + for (u32 target : tfound->second) + { + const u16 value = m_entry_info[target / 4] ? 
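/* Annotation (not part of the patch): m_entry_map tags every LS word with the
   entry index (address / 4, narrowed to u16) of the chunk that owns it. A
   block reachable under two different tags cannot belong to a single chunk,
   so it is pushed to new_entries and promoted to an entry itself; the outer
   while loop repeats until the partition is stable. The tag chosen right
   here is, more verbosely:

   const u16 value = m_entry_info[target / 4]
       ? ::narrow<u16>(target / 4) // target starts its own chunk
       : root;                     // target inherits the current chunk's tag
*/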
::narrow(target / 4) : root; + + if (u16& tval = m_entry_map[target / 4]) + { + if (tval != value && !m_entry_info[target / 4]) + { + new_entries.push_back(target); + } + } + else + { + tval = value; + workload.emplace_back(target); + } + } + + break; + } + } + + if (new_entries.empty()) + { + break; + } + + for (u32 entry : new_entries) + { + m_entry_info[entry / 4] = true; + } + } + if (result.size() == 1) { // Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback @@ -988,13 +1333,13 @@ class spu_llvm_runtime m_map[std::vector()] = &spu_recompiler_base::dispatch; // Clear LLVM output - m_cache_path = fxm::check_unlocked()->cache + "llvm/"; - fs::create_dir(m_cache_path); - fs::remove_all(m_cache_path, false); + m_cache_path = fxm::check_unlocked()->cache; + fs::create_dir(m_cache_path + "llvm/"); + fs::remove_all(m_cache_path + "llvm/", false); if (g_cfg.core.spu_debug) { - fs::file(m_cache_path + "../spu.log", fs::rewrite); + fs::file(m_cache_path + "spu.log", fs::rewrite); } LOG_SUCCESS(SPU, "SPU Recompiler Runtime (LLVM) initialized..."); @@ -1005,19 +1350,158 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { std::shared_ptr m_spurt; + // Current function (chunk) llvm::Function* m_function; - using m_module = void; + // Current function chunk entry point + u32 m_entry; llvm::Value* m_thread; llvm::Value* m_lsptr; - llvm::BasicBlock* m_stop; + // Pointers to registers in the thread context + std::array m_reg_addr; + + // Global variable (function table) + llvm::GlobalVariable* m_function_table{}; + + struct block_info + { + // Current block's entry block + llvm::BasicBlock* block; + + // Final block (for PHI nodes, set after completion) + llvm::BasicBlock* block_end{}; + + // Regmod compilation (TODO) + std::bitset mod; + + // List of actual predecessors + std::basic_string preds; + + // Current register values + std::array reg{}; + + // PHI nodes created for this block (if any) + std::array phi{}; + + // Store instructions + std::array store{}; + }; + + // Current block + block_info* m_block; + + // All blocks in the current function chunk + std::unordered_map> m_blocks; + + // Block list for processing + std::vector m_block_queue; + + // All function chunks in current SPU compile unit + std::unordered_map> m_functions; + + // Function chunk list for processing + std::vector m_function_queue; + + // Helper + std::vector m_scan_queue; + + // Add or get the function chunk + llvm::Function* add_function(u32 addr) + { + // Get function chunk name + const std::string name = fmt::format("spu-chunk-0x%05x", addr); + llvm::Function* result = llvm::cast(m_module->getOrInsertFunction(name, get_type(), get_type(), get_type(), get_type())); + + // Set parameters + result->setLinkage(llvm::GlobalValue::InternalLinkage); + + // Enqueue if necessary + if (m_functions.emplace(addr, result).second) + { + m_function_queue.push_back(addr); + } + + return result; + } + + void set_function(llvm::Function* func) + { + m_function = func; + m_thread = &*func->arg_begin(); + m_lsptr = &*(func->arg_begin() + 1); + + m_reg_addr.fill(nullptr); + m_block = nullptr; + m_blocks.clear(); + m_block_queue.clear(); + m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", m_function)); + } + + // Add block with current block as a predecessor + llvm::BasicBlock* add_block(u32 target) + { + // Check the predecessor + const bool pred_found = m_block_info[target / 4] && m_preds[target].find_first_of(m_pos) != -1; + + if 
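/* Annotation (not part of the patch): add_block() resolves a branch target in
   one of four ways: the first block of a chunk proceeds normally; a target
   that is another chunk's entry becomes a tail call to that chunk's
   function; a target with no recorded predecessor edge leaves through the
   per-word jit_dispatcher table; otherwise a plain BasicBlock is created or
   reused and PHI inputs are appended for the current register values. */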
(m_blocks.empty()) + { + // Special case: first block, proceed normally + } + else if (m_block_info[target / 4] && m_entry_info[target / 4] && !(pred_found && m_entry == target)) + { + // Generate a tail call to the function chunk + const auto cblock = m_ir->GetInsertBlock(); + const auto result = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->SetInsertPoint(result); + m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&SPUThread::pc)); + tail(add_function(target)); + m_ir->SetInsertPoint(cblock); + return result; + } + else if (!pred_found || !m_block_info[target / 4]) + { + if (m_block_info[target / 4]) + { + LOG_ERROR(SPU, "[0x%x] Predecessor not found for target 0x%x (chunk=0x%x, entry=0x%x, size=%u)", m_pos, target, m_entry, m_function_queue[0], m_size / 4); + } + + // Generate external indirect tail call + const auto cblock = m_ir->GetInsertBlock(); + const auto result = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->SetInsertPoint(result); + m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&SPUThread::pc)); + const auto addr = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher) + target * 2)); + const auto type = llvm::FunctionType::get(get_type(), {get_type(), get_type(), get_type()}, false)->getPointerTo()->getPointerTo(); + tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(addr, type))); + m_ir->SetInsertPoint(cblock); + return result; + } + + auto& result = m_blocks[target].block; + + if (!result) + { + result = llvm::BasicBlock::Create(m_context, fmt::format("b-0x%x", target), m_function); - std::array, 128> m_gpr; - std::array m_flush_gpr; + // Add the block to the queue + m_block_queue.push_back(target); + } + else if (m_block && m_blocks[target].block_end) + { + // Connect PHI nodes if necessary + for (u32 i = 0; i < s_reg_max; i++) + { + if (const auto phi = m_blocks[target].phi[i]) + { + phi->addIncoming(get_vr(i).value, m_block->block_end); + } + } + } - std::map m_instr_map; + return result; + } template llvm::Value* _ptr(llvm::Value* base, u32 offset, std::string name = "") @@ -1033,89 +1517,164 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return _ptr(m_thread, ::offset32(offset_args...)); } - template - auto& init_vr(u32 index) + template + llvm::Value* spu_ptr(value_t add, Args... 
offset_args) { - auto& gpr = m_gpr.at(index); + const auto off = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(offset_args...))); + const auto ptr = m_ir->CreateIntToPtr(m_ir->CreateAdd(off, add.value), get_type()->getPointerTo()); + return ptr; + } - if (!gpr.first) + llvm::Value* init_vr(u32 index) + { + if (!m_reg_addr.at(index)) { // Save and restore current insert point if necessary const auto block_cur = m_ir->GetInsertBlock(); - // Emit register pointer at the beginning of function - m_ir->SetInsertPoint(&*m_function->begin()->getFirstInsertionPt()); - gpr.first = _ptr(m_thread, ::offset32(&SPUThread::gpr, index), fmt::format("Reg$%u", index)); + // Emit register pointer at the beginning of the function chunk + m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); + m_reg_addr[index] = _ptr(m_thread, ::offset32(&SPUThread::gpr, index), fmt::format("Reg$%u", index)); m_ir->SetInsertPoint(block_cur); } - return gpr; + return m_reg_addr[index]; } template value_t get_vr(u32 index) { - auto& gpr = init_vr(index); - - if (!gpr.second) + if (!m_block->reg.at(index)) { - gpr.second = m_ir->CreateLoad(gpr.first, fmt::format("Load$%u", index)); + // Load register value if necessary + m_block->reg[index] = m_ir->CreateLoad(init_vr(index), fmt::format("Load$%u", index)); } value_t r; - r.value = m_ir->CreateBitCast(gpr.second, get_type()); + r.value = m_ir->CreateBitCast(m_block->reg[index], get_type()); return r; } template void set_vr(u32 index, T expr) { - auto& gpr = init_vr(index); + // Check + verify(HERE), m_regmod[m_pos / 4] == index; - gpr.second = expr.eval(m_ir); + // Set register value + m_block->reg.at(index) = expr.eval(m_ir); - // Remember last insertion point for flush - if (m_ir->GetInsertBlock()->empty()) - { - // Insert dummy instruction if empty - m_flush_gpr.at(index) = llvm::cast(m_ir->CreateAdd(m_thread, m_ir->getInt64(8))); - } - else + // Get register location + const auto addr = init_vr(index); + + // Erase previous dead store instruction if necessary + if (m_block->store[index]) { - m_flush_gpr.at(index) = m_ir->GetInsertBlock()->end()->getPrevNode(); + // TODO: better cross-block dead store elimination + m_block->store[index]->eraseFromParent(); } + + // Write register to the context + m_block->store[index] = m_ir->CreateStore(m_ir->CreateBitCast(m_block->reg[index], addr->getType()->getPointerElementType()), addr); } - void flush(std::pair& reg, llvm::Instruction*& flush_reg) + // Return either basic block addr with single dominating value, or negative number of PHI entries + u32 find_reg_origin(u32 addr, u32 index) { - if (reg.first && reg.second && flush_reg) + u32 result = -1; + + // Handle entry point specially + if (m_entry_info[addr / 4]) { - // Save and restore current insert point if necessary - const auto block_cur = m_ir->GetInsertBlock(); + result = addr; + } + + // Used for skipping blocks from different chunks + const u16 root = ::narrow(g_cfg.core.spu_block_size == spu_block_size_type::safe ? 
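/* Annotation (not part of the patch): as its comment above states,
   find_reg_origin() walks the predecessor graph looking for blocks that
   modify register 'index' (via m_regmod). A non-negative result is the
   address of a single dominating definition whose value can be forwarded
   directly; a negative result encodes the number of distinct origins, which
   the caller turns into a PHI node:

   const u32 src = find_reg_origin(baddr, i);
   if (src >> 31)
   {
       m_block->phi[i] = m_ir->CreatePHI(type, 0 - src); // one slot per origin
   }
*/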
0 : m_entry / 4); + + // List of predecessors to check + m_scan_queue.clear(); - // Try to emit store immediately after its last use - if (const auto next = flush_reg->getNextNode()) + const auto pfound = m_preds.find(addr); + + if (pfound != m_preds.end()) + { + for (u32 pred : pfound->second) { - m_ir->SetInsertPoint(next); + if (m_entry_map[pred / 4] == root) + { + m_scan_queue.push_back(pred); + } } - - m_ir->CreateStore(m_ir->CreateBitCast(reg.second, reg.first->getType()->getPointerElementType()), reg.first); - m_ir->SetInsertPoint(block_cur); } - // Unregister store - flush_reg = nullptr; + // TODO: allow to avoid untouched registers in some cases + bool regmod_any = result == -1; - // Invalidate current value (TODO) - reg.second = nullptr; - } + for (u32 i = 0; i < m_scan_queue.size(); i++) + { + // Find whether the block modifies the selected register + bool regmod = false; - void flush() - { - for (u32 i = 0; i < 128; i++) + for (addr = m_scan_queue[i];; addr -= 4) + { + if (index == m_regmod[addr / 4]) + { + regmod = true; + regmod_any = true; + } + + const auto pfound = m_preds.find(addr); + + if (pfound == m_preds.end()) + { + continue; + } + + if (!regmod) + { + // Enqueue predecessors if register is not modified there + for (u32 pred : pfound->second) + { + if (m_entry_map[pred / 4] != root) + { + continue; + } + + // TODO + if (std::find(m_scan_queue.cbegin(), m_scan_queue.cend(), pred) == m_scan_queue.cend()) + { + m_scan_queue.push_back(pred); + } + } + } + + break; + } + + if (regmod || m_entry_info[addr / 4]) + { + if (result == -1) + { + result = addr; + } + else if (result >> 31) + { + result--; + } + else + { + result = -2; + } + } + } + + if (!regmod_any) { - flush(m_gpr[i], m_flush_gpr[i]); + result = addr; } + + return result; } void update_pc() @@ -1123,6 +1682,22 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->CreateStore(m_ir->getInt32(m_pos), spu_ptr(&SPUThread::pc)); } + // Call cpu_thread::check_state if necessary and return or continue (full check) + void check_state(u32 addr) + { + const auto pstate = spu_ptr(&SPUThread::state); + const auto _body = llvm::BasicBlock::Create(m_context, "", m_function); + const auto check = llvm::BasicBlock::Create(m_context, "", m_function); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_ir->CreateLoad(pstate), m_ir->getInt32(0)), _body, check); + m_ir->SetInsertPoint(check); + m_ir->CreateStore(m_ir->getInt32(addr), spu_ptr(&SPUThread::pc)); + m_ir->CreateCondBr(call(&exec_check_state, m_thread), stop, _body); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); + m_ir->SetInsertPoint(_body); + } + // Perform external call template llvm::CallInst* call(RT(*_func)(FArgs...), Args... args) @@ -1253,12 +1828,25 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator fmt::append(hash, "spu-0x%05x-%s", func[0], fmt::base57(output)); } - LOG_NOTICE(SPU, "Building function 0x%x... (size %u, %s)", func[0], func.size() - 1, hash); + if (m_cache) + { + LOG_SUCCESS(SPU, "LLVM: Building %s (size %u)...", hash, func.size() - 1); + } + else + { + LOG_NOTICE(SPU, "Building function 0x%x... 
(size %u, %s)", func[0], func.size() - 1, hash); + } using namespace llvm; SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode); - dis_asm.offset = reinterpret_cast(func.data() + 1) - func[0]; + dis_asm.offset = reinterpret_cast(func.data() + 1); + + if (g_cfg.core.spu_block_size != spu_block_size_type::giga) + { + dis_asm.offset -= func[0]; + } + std::string log; if (g_cfg.core.spu_debug) @@ -1268,56 +1856,36 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Create LLVM module std::unique_ptr module = std::make_unique(hash + ".obj", m_context); - - // Initialize target module->setTargetTriple(Triple::normalize(sys::getProcessTriple())); - - // Initialize pass manager - legacy::FunctionPassManager pm(module.get()); - - // Basic optimizations - pm.add(createEarlyCSEPass()); - pm.add(createDeadStoreEliminationPass()); - pm.add(createLintPass()); // Check - - // Add function - const auto main_func = cast(module->getOrInsertFunction(hash, get_type(), get_type(), get_type())); - m_function = main_func; - m_thread = &*m_function->arg_begin(); - m_lsptr = &*(m_function->arg_begin() + 1); + m_module = module.get(); // Initialize IR Builder - IRBuilder<> irb(BasicBlock::Create(m_context, "", m_function)); + IRBuilder<> irb(m_context); m_ir = &irb; + // Add entry function (contains only state/code check) + const auto main_func = llvm::cast(m_module->getOrInsertFunction(hash, get_type(), get_type(), get_type())); + set_function(main_func); + // Start compilation m_pos = func[0]; m_size = (func.size() - 1) * 4; - const u32 start = m_pos; - const u32 end = m_pos + m_size; - - m_stop = BasicBlock::Create(m_context, "", m_function); - - // Create instruction blocks - for (u32 i = 1, pos = start; i < func.size(); i++, pos += 4) - { - if (func[i] && m_block_info[pos / 4]) - { - m_instr_map.emplace(pos, BasicBlock::Create(m_context, "", m_function)); - } - } + const u32 start = m_pos * (g_cfg.core.spu_block_size != spu_block_size_type::giga); + const u32 end = start + m_size; update_pc(); const auto label_test = BasicBlock::Create(m_context, "", m_function); const auto label_diff = BasicBlock::Create(m_context, "", m_function); const auto label_body = BasicBlock::Create(m_context, "", m_function); + const auto label_stop = BasicBlock::Create(m_context, "", m_function); // Emit state check const auto pstate = spu_ptr(&SPUThread::state); - m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(pstate), m_ir->getInt32(0)), m_stop, label_test); + m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(pstate), m_ir->getInt32(0)), label_stop, label_test); // Emit code check + u32 check_iterations = 0; m_ir->SetInsertPoint(label_test); if (!g_cfg.core.spu_verification) @@ -1327,17 +1895,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } else if (func.size() - 1 == 1) { - const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(_ptr(m_lsptr, m_pos)), m_ir->getInt32(func[1])); + const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(_ptr(m_lsptr, start)), m_ir->getInt32(func[1])); m_ir->CreateCondBr(cond, label_diff, label_body); } else if (func.size() - 1 == 2) { - const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(_ptr(m_lsptr, m_pos)), m_ir->getInt64(static_cast(func[2]) << 32 | func[1])); + const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(_ptr(m_lsptr, start)), m_ir->getInt64(static_cast(func[2]) << 32 | func[1])); m_ir->CreateCondBr(cond, label_diff, label_body); } else { - const u32 starta = m_pos & -32; + const u32 starta = start & -32; const 
u32 enda = ::align(end, 32); const u32 sizea = (enda - starta) / 32; verify(HERE), sizea; @@ -1354,7 +1922,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { const u32 k = j + i * 4; - if (k < m_pos || k >= end || !func[(k - m_pos) / 4 + 1]) + if (k < start || k >= end || !func[(k - start) / 4 + 1]) { indices[i] = 8; holes = true; @@ -1387,11 +1955,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator for (u32 i = 0; i < 8; i++) { const u32 k = j + i * 4; - words[i] = k >= m_pos && k < end ? func[(k - m_pos) / 4 + 1] : 0; + words[i] = k >= start && k < end ? func[(k - start) / 4 + 1] : 0; } vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, words)); acc = acc ? m_ir->CreateOr(acc, vls) : vls; + check_iterations++; } // Pattern for PTEST @@ -1406,126 +1975,243 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->CreateCondBr(cond, label_diff, label_body); } - // Increase block counter - m_ir->SetInsertPoint(label_body); - const auto pbcount = spu_ptr(&SPUThread::block_counter); - m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbcount), m_ir->getInt64(1)), pbcount); + // Increase block counter with statistics + m_ir->SetInsertPoint(label_body); + const auto pbcount = spu_ptr(&SPUThread::block_counter); + m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbcount), m_ir->getInt64(check_iterations)), pbcount); + + // Call the entry function chunk + const auto entry_chunk = add_function(m_pos); + m_ir->CreateCall(entry_chunk, {m_thread, m_lsptr, m_ir->getInt32(0)})->setTailCall(); + m_ir->CreateRetVoid(); + + m_ir->SetInsertPoint(label_stop); + m_ir->CreateRetVoid(); + + m_ir->SetInsertPoint(label_diff); + + if (g_cfg.core.spu_verification) + { + const auto pbfail = spu_ptr(&SPUThread::block_failure); + m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbfail), m_ir->getInt64(1)), pbfail); + tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), m_ir->getInt32(0)); + } + else + { + m_ir->CreateUnreachable(); + } + + // Create function table (uninitialized) + m_function_table = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(entry_chunk->getType(), m_size / 4), true, llvm::GlobalValue::InternalLinkage, nullptr); - // Emit instructions - for (u32 i = 1; i < func.size(); i++) + // Disassemble if necessary + if (g_cfg.core.spu_debug) { - const u32 pos = start + (i - 1) * 4; - - if (g_cfg.core.spu_debug) + for (u32 i = 1; i < func.size(); i++) { + const u32 pos = start + (i - 1) * 4; + // Disasm dis_asm.dump_pc = pos; dis_asm.disasm(pos); - log += dis_asm.last_opcode; - log += '\n'; - } - // Get opcode - const u32 op = se_storage::swap(func[i]); - - if (!op) - { - // Ignore hole - if (!m_ir->GetInsertBlock()->getTerminator()) + if (func[i]) { - flush(); - branch_fixed(spu_branch_target(pos)); - LOG_ERROR(SPU, "Unexpected fallthrough to 0x%x", pos); + log += '>'; + log += dis_asm.last_opcode; + log += '\n'; + } + else + { + fmt::append(log, ">[%08x] xx xx xx xx: \n", pos); } - - continue; } + } - // Bind instruction label if necessary (TODO) - const auto found = m_instr_map.find(pos); + // Create function chunks + for (std::size_t fi = 0; fi < m_function_queue.size(); fi++) + { + // Initialize function info + m_entry = m_function_queue[fi]; + set_function(m_functions[m_entry]); + m_ir->CreateBr(add_block(m_entry)); - if (found != m_instr_map.end()) + // Emit instructions for basic blocks + for (std::size_t bi = 0; bi < m_block_queue.size(); bi++) { - if 
(!m_ir->GetInsertBlock()->getTerminator()) + // Initialize basic block info + const u32 baddr = m_block_queue[bi]; + m_block = &m_blocks[baddr]; + m_ir->SetInsertPoint(m_block->block); + + const auto pfound = m_preds.find(baddr); + + if (pfound != m_preds.end() && !pfound->second.empty()) { - flush(); - m_ir->CreateBr(found->second); - } + // Initialize registers and build PHI nodes if necessary + for (u32 i = 0; i < s_reg_max; i++) + { + // TODO: optimize + const u32 src = find_reg_origin(baddr, i); + + if (src >> 31) + { + // TODO: type + const auto _phi = m_ir->CreatePHI(get_type(), 0 - src); + m_block->phi[i] = _phi; + m_block->reg[i] = _phi; + + for (u32 pred : pfound->second) + { + // TODO: optimize + while (!m_block_info[pred / 4]) + { + pred -= 4; + } + + const auto bfound = m_blocks.find(pred); - m_ir->SetInsertPoint(found->second); + if (bfound != m_blocks.end() && bfound->second.block_end) + { + auto& value = bfound->second.reg[i]; - // Build state check if necessary (TODO: more conditions) - bool need_check_state = false; + if (!value || value->getType() != _phi->getType()) + { + const auto regptr = init_vr(i); + const auto cblock = m_ir->GetInsertBlock(); + m_ir->SetInsertPoint(bfound->second.block_end->getTerminator()); - const auto pfound = m_preds.find(pos); + if (!value) + { + // Value hasn't been loaded yet + value = m_ir->CreateLoad(regptr); + } - if (pfound != m_preds.end()) - { + // Value possibly needs a bitcast + value = m_ir->CreateBitCast(value, _phi->getType()); + + m_ir->SetInsertPoint(cblock); + + verify(HERE), bfound->second.block_end->getTerminator(); + } + + _phi->addIncoming(value, bfound->second.block_end); + } + } + + if (baddr == m_entry) + { + // Load value at the function chunk's entry block if necessary + const auto regptr = init_vr(i); + const auto cblock = m_ir->GetInsertBlock(); + m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); + const auto value = m_ir->CreateLoad(regptr); + m_ir->SetInsertPoint(cblock); + _phi->addIncoming(value, &m_function->getEntryBlock()); + } + } + else if (src != baddr) + { + // Passthrough static value or constant + const auto bfound = m_blocks.find(src); + + // TODO: error + if (bfound != m_blocks.end()) + { + m_block->reg[i] = bfound->second.reg[i]; + } + } + } + + // Emit state check if necessary (TODO: more conditions) for (u32 pred : pfound->second) { - if (pred >= pos) + if (pred >= baddr && bi > 0) { // If this block is a target of a backward branch (possibly loop), emit a check - need_check_state = true; + check_state(baddr); break; } } } - if (need_check_state) + // Emit instructions + for (m_pos = baddr; m_pos >= start && m_pos < end && !m_ir->GetInsertBlock()->getTerminator(); m_pos += 4) { - // Call cpu_thread::check_state if necessary and return or continue (full check) - const auto _body = BasicBlock::Create(m_context, "", m_function); - const auto check = BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_ir->CreateLoad(pstate), m_ir->getInt32(0)), _body, check); - m_ir->SetInsertPoint(check); - m_ir->CreateStore(m_ir->getInt32(pos), spu_ptr(&SPUThread::pc)); - m_ir->CreateCondBr(call(&check_state, m_thread), m_stop, _body); - m_ir->SetInsertPoint(_body); + if (m_pos != baddr && m_block_info[m_pos / 4]) + { + break; + } + + const u32 op = se_storage::swap(func[(m_pos - start) / 4 + 1]); + + if (!op) + { + LOG_ERROR(SPU, "Unexpected fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", m_pos, m_entry, m_function_queue[0]); + break; + } + + // Execute recompiler 
function (TODO) + (this->*g_decoder.decode(op))({op}); } - } - if (!m_ir->GetInsertBlock()->getTerminator()) - { - // Update position - m_pos = pos; + // Finalize block with fallthrough if necessary + if (!m_ir->GetInsertBlock()->getTerminator()) + { + const u32 target = m_pos == baddr ? baddr : m_pos & 0x3fffc; + + if (m_pos != baddr) + { + m_pos -= 4; + + if (target >= start && target < end && m_targets[m_pos].find_first_of(target) == -1) + { + LOG_ERROR(SPU, "Unregistered fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", target, m_entry, m_function_queue[0]); + } + } + + m_block->block_end = m_ir->GetInsertBlock(); + m_ir->CreateBr(add_block(target)); + } - // Execute recompiler function (TODO) - (this->*g_decoder.decode(op))({op}); + verify(HERE), m_block->block_end; } } - // Make fallthrough if necessary - if (!m_ir->GetInsertBlock()->getTerminator()) - { - flush(); - branch_fixed(spu_branch_target(end)); - } + // Initialize pass manager + legacy::FunctionPassManager pm(module.get()); - // - m_ir->SetInsertPoint(m_stop); - m_ir->CreateRetVoid(); + // Basic optimizations + pm.add(createEarlyCSEPass()); + pm.add(createAggressiveDCEPass()); + pm.add(createCFGSimplificationPass()); + pm.add(createDeadStoreEliminationPass()); + //pm.add(createLintPass()); // Check - m_ir->SetInsertPoint(label_diff); + for (const auto& func : m_functions) + { + pm.run(*func.second); + } - if (g_cfg.core.spu_verification) + if (m_function_table->getNumUses()) { - const auto pbfail = spu_ptr(&SPUThread::block_failure); - m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbfail), m_ir->getInt64(1)), pbfail); - tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), m_ir->getInt32(0)); + // TODO } else { - m_ir->CreateUnreachable(); + m_function_table->eraseFromParent(); } - // Clear context - m_gpr.fill({}); - m_flush_gpr.fill(0); - m_instr_map.clear(); + // Clear context (TODO) + m_blocks.clear(); + m_block_queue.clear(); + m_functions.clear(); + m_function_queue.clear(); + m_scan_queue.clear(); + m_function_table = nullptr; // Generate a dispatcher (übertrampoline) - std::vector addrv{start}; + std::vector addrv{func[0]}; const auto beg = m_spurt->m_map.lower_bound(addrv); addrv[0] += 4; const auto _end = m_spurt->m_map.lower_bound(addrv); @@ -1533,10 +2219,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (size0 > 1) { - const auto trampoline = cast(module->getOrInsertFunction(fmt::format("tr_0x%05x_%03u", start, size0), get_type(), get_type(), get_type())); - m_function = trampoline; - m_thread = &*m_function->arg_begin(); - m_lsptr = &*(m_function->arg_begin() + 1); + const auto trampoline = cast(module->getOrInsertFunction(fmt::format("spu-0x%05x-trampoline-%03u", func[0], size0), get_type(), get_type(), get_type())); + set_function(trampoline); struct work { @@ -1554,7 +2238,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator workload.back().level = 1; workload.back().beg = beg; workload.back().end = _end; - workload.back().label = llvm::BasicBlock::Create(m_context, "", m_function); + workload.back().label = m_ir->GetInsertBlock(); for (std::size_t i = 0; i < workload.size(); i++) { @@ -1566,9 +2250,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator llvm::BasicBlock* def{}; - while (true) + while (w.level < w.beg->first.size()) { const u32 x1 = w.beg->first.at(w.level); + + if (!x1) + { + // Cannot split: some functions contain holes at this level + w.level++; + continue; + } + auto it = w.beg; 
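
Since the splitting logic here is dense, a semantic model of what the finished trampoline computes may help. The sketch below is a deliberately naive linear scan over the same kind of sorted map, with invented type aliases; the generated code reaches the same answer through a tree of word comparisons, and a zero key word (the "hole" case skipped above) constrains nothing:

    #include <cstdint>
    #include <map>
    #include <vector>

    using func_key = std::vector<std::uint32_t>; // [0] = start address, [1..] = instruction words
    using spu_function_t = void (*)();

    // Semantic model of the generated dispatcher tree: select the compiled
    // function whose non-zero key words all match the words currently in
    // local storage at the start address. Zero key words match anything.
    inline spu_function_t select(const std::map<func_key, spu_function_t>& map, const std::uint32_t* ls)
    {
        for (const auto& [key, fn] : map)
        {
            bool match = true;

            for (std::size_t i = 1; i < key.size() && match; i++)
            {
                match = !key[i] || key[i] == ls[i - 1];
            }

            if (match)
            {
                return fn; // in the real trampoline: a tail call
            }
        }

        return nullptr; // no candidate matches: fall back to the recompiler
    }

The point of the tree shape (as opposed to this linear scan) is that each level inspects one local-storage word once, so dispatch cost grows with the depth of the tree rather than with the number of compiled variants.
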
auto it2 = it; u32 x = x1; @@ -1638,6 +2330,26 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } + if (!def && targets.empty()) + { + LOG_WARNING(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level); + m_ir->SetInsertPoint(w.label); + + if (const u64 fval = reinterpret_cast(w.beg->second)) + { + const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType()); + m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall(); + } + else + { + verify(HERE, &w.beg->second == &fn_location); + m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall(); + } + + m_ir->CreateRetVoid(); + continue; + } + if (!def) { def = llvm::BasicBlock::Create(m_context, "", m_function); @@ -1659,16 +2371,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } - // Run some optimizations - //pm.run(*main_func); - spu_function_t fn{}, tr{}; raw_string_ostream out(log); if (g_cfg.core.spu_debug) { - fmt::append(log, "LLVM IR at 0x%x:\n", start); + fmt::append(log, "LLVM IR at 0x%x:\n", func[0]); out << *module; // print IR out << "\n\n"; } @@ -1676,14 +2385,20 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (verifyModule(*module, &out)) { out.flush(); - LOG_ERROR(SPU, "LLVM: Verification failed at 0x%x:\n%s", start, log); + LOG_ERROR(SPU, "LLVM: Verification failed at 0x%x:\n%s", func[0], log); + + if (g_cfg.core.spu_debug) + { + fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log); + } + fmt::raw_error("Compilation failed"); } if (g_cfg.core.spu_debug) { // Testing only - m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path); + m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/"); } else { @@ -1703,17 +2418,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator fn_location = fn; // Trampoline - m_spurt->m_dispatcher[start / 4] = tr; + m_spurt->m_dispatcher[func[0] / 4] = tr; - LOG_NOTICE(SPU, "[0x%x] Compiled: %p", start, fn); + LOG_NOTICE(SPU, "[0x%x] Compiled: %p", func[0], fn); if (tr != fn) - LOG_NOTICE(SPU, "[0x%x] T: %p", start, tr); + LOG_NOTICE(SPU, "[0x%x] T: %p", func[0], tr); if (g_cfg.core.spu_debug) { out.flush(); - fs::file(m_spurt->m_cache_path + "../spu.log", fs::write + fs::append).write(log); + fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log); } if (m_cache && g_cfg.core.spu_cache) @@ -1724,7 +2439,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return fn; } - static bool check_state(SPUThread* _spu) + static bool exec_check_state(SPUThread* _spu) { return _spu->check_state(); } @@ -1741,7 +2456,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator template void fall(spu_opcode_t op) { - flush(); update_pc(); call(&exec_fall, m_thread, m_ir->getInt32(op.opcode)); } @@ -1753,31 +2467,38 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void UNK(spu_opcode_t op_unk) { - flush(); + m_block->block_end = m_ir->GetInsertBlock(); update_pc(); tail(&exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); } - static void exec_stop(SPUThread* _spu, u32 code) + static bool exec_stop(SPUThread* _spu, u32 code) { - if (_spu->stop_and_signal(code)) - { - _spu->pc += 4; - } + return _spu->stop_and_signal(code); } void STOP(spu_opcode_t op) // { - flush(); update_pc(); - tail(&exec_stop, m_thread, m_ir->getInt32(op.opcode)); + const auto succ = call(&exec_stop, m_thread, 
m_ir->getInt32(op.opcode & 0x3fff)); + const auto next = llvm::BasicBlock::Create(m_context, "", m_function); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(succ, next, stop); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); + m_ir->SetInsertPoint(next); + + if (g_cfg.core.spu_block_size == spu_block_size_type::safe) + { + m_block->block_end = m_ir->GetInsertBlock(); + m_ir->CreateStore(m_ir->getInt32(m_pos + 4), spu_ptr(&SPUThread::pc)); + m_ir->CreateRetVoid(); + } } void STOPD(spu_opcode_t op) // { - flush(); - update_pc(); - tail(&exec_stop, m_thread, m_ir->getInt32(0x3fff)); + STOP(spu_opcode_t{0x3fff}); } static s64 exec_rdch(SPUThread* _spu, u32 ch) @@ -1787,12 +2508,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void RDCH(spu_opcode_t op) // { - flush(); update_pc(); value_t res; res.value = call(&exec_rdch, m_thread, m_ir->getInt32(op.ra)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), m_stop, next); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); m_ir->SetInsertPoint(next); set_vr(op.rt, insert(splat(0), 3, trunc(res))); } @@ -1816,11 +2539,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void WRCH(spu_opcode_t op) // { - flush(); update_pc(); const auto succ = call(&exec_wrch, m_thread, m_ir->getInt32(op.ra), extract(get_vr(op.rt), 3).value); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(succ, next, m_stop); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(succ, next, stop); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); m_ir->SetInsertPoint(next); } @@ -1838,12 +2563,19 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { // This instruction must be used following a store instruction that modifies the instruction stream. m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); + + if (g_cfg.core.spu_block_size == spu_block_size_type::safe) + { + m_block->block_end = m_ir->GetInsertBlock(); + m_ir->CreateStore(m_ir->getInt32(m_pos + 4), spu_ptr(&SPUThread::pc)); + m_ir->CreateRetVoid(); + } } void DSYNC(spu_opcode_t op) // { // This instruction forces all earlier load, store, and channel instructions to complete before proceeding. 
- m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); + SYNC(op); } void MFSPR(spu_opcode_t op) // @@ -2936,7 +3668,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HGT(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) > extract(get_vr(op.rb), 3)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -2948,7 +3679,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HEQ(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) == extract(get_vr(op.rb), 3)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -2960,7 +3690,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HLGT(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) > extract(get_vr(op.rb), 3)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -2972,7 +3701,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HGTI(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) > op.si10); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -2984,7 +3712,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HEQI(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) == op.si10); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -2996,7 +3723,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HLGTI(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) > op.si10); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -3030,158 +3756,210 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { _spu->interrupts_enabled = false; _spu->srr0 = addr; + + // Test for BR/BRA instructions (they are equivalent at zero pc) + const u32 br = _spu->_ref(0); + + if ((br & 0xfd80007f) == 0x30000000) + { + return (br >> 5) & 0x3fffc; + } + return 0; } return addr; } - void branch_indirect(spu_opcode_t op, value_t addr) + llvm::BasicBlock* add_block_indirect(spu_opcode_t op, value_t addr, bool ret = true) { - if (op.d) - { - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&SPUThread::interrupts_enabled))->setVolatile(true); - } - else if (op.e) - { - addr.value = call(&exec_check_interrupts, m_thread, addr.value); - } - + // Convert an indirect branch into a static one if possible if (const auto _int = llvm::dyn_cast(addr.value)) { - LOG_WARNING(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, _int->getZExtValue()); - return branch_fixed(_int->getZExtValue()); - } + const u32 target = ::narrow(_int->getZExtValue(), HERE); - m_ir->CreateStore(addr.value, spu_ptr(&SPUThread::pc)); + LOG_WARNING(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, target); - const auto tfound = m_targets.find(m_pos); + if (!op.e && !op.d) + { + return add_block(target); + } - if (tfound != m_targets.end() && tfound->second.size() 
>= 3) - { - const u32 start = m_instr_map.begin()->first; + if (!m_entry_info[target / 4]) + { + LOG_ERROR(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, target); + } - const std::set targets(tfound->second.begin(), tfound->second.end()); + // Fixed branch excludes the possibility it's a function return (TODO) + ret = false; + } - const auto exter = llvm::BasicBlock::Create(m_context, "", m_function); + // Load stack addr if necessary + value_t sp; - const auto sw = m_ir->CreateSwitch(m_ir->CreateLShr(addr.value, 2, "", true), exter, m_size / 4); + if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) + { + sp = eval(extract(get_vr(1), 3) & 0x3fff0); + } - for (u32 pos = start; pos < start + m_size; pos += 4) - { - const auto found = m_instr_map.find(pos); + const auto cblock = m_ir->GetInsertBlock(); + const auto result = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->SetInsertPoint(result); - if (found != m_instr_map.end() && targets.count(pos)) - { - sw->addCase(m_ir->getInt32(pos / 4), found->second); - } - else - { - sw->addCase(m_ir->getInt32(pos / 4), m_stop); - } - } + if (op.e) + { + addr.value = call(&exec_check_interrupts, m_thread, addr.value); + } - m_ir->SetInsertPoint(exter); + if (op.d) + { + m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&SPUThread::interrupts_enabled))->setVolatile(true); } + m_ir->CreateStore(addr.value, spu_ptr(&SPUThread::pc)); const auto disp = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher))); const auto type = llvm::FunctionType::get(get_type(), {get_type(), get_type(), get_type()}, false)->getPointerTo()->getPointerTo(); - tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(disp, zext(addr << 1).value), type))); - } - void branch_fixed(u32 target) - { - const auto found = m_instr_map.find(target); + if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) + { + // Compare address stored in stack mirror with addr + const auto stack0 = eval(zext(sp) + ::offset32(&SPUThread::stack_mirror)); + const auto stack1 = eval(stack0 + 8); + const auto _ret = m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(m_thread, stack0.value), type)); + const auto link = m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(m_thread, stack1.value), get_type())); + const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); + const auto done = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(m_ir->CreateICmpEQ(zext(addr).value, link), done, fail); + m_ir->SetInsertPoint(done); + + // Clear stack mirror and return by tail call to the provided return address + m_ir->CreateStore(splat(-1).value, m_ir->CreateIntToPtr(m_ir->CreateAdd(m_thread, stack0.value), get_type()->getPointerTo())); + tail(_ret); + m_ir->SetInsertPoint(fail); + } + + llvm::Value* ptr = m_ir->CreateIntToPtr(m_ir->CreateAdd(disp, zext(addr << 1).value), type); - if (found != m_instr_map.end()) + if (g_cfg.core.spu_block_size != spu_block_size_type::safe) { - m_ir->CreateBr(found->second); - return; + // Try to load chunk address from the function table + } - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&SPUThread::pc)); - const auto addr = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher) + target * 2)); - const auto type = llvm::FunctionType::get(get_type(), {get_type(), get_type(), get_type()}, false)->getPointerTo()->getPointerTo(); - const auto func = m_ir->CreateLoad(m_ir->CreateIntToPtr(addr, type)); - tail(func); + tail(m_ir->CreateLoad(ptr)); + 
m_ir->SetInsertPoint(cblock); + return result; } void BIZ(spu_opcode_t op) // { - flush(); + m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 3) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_indirect(op, addr); - m_ir->SetInsertPoint(next); + const auto target = add_block_indirect(op, addr); + m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); } void BINZ(spu_opcode_t op) // { - flush(); + m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 3) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_indirect(op, addr); - m_ir->SetInsertPoint(next); + const auto target = add_block_indirect(op, addr); + m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); } void BIHZ(spu_opcode_t op) // { - flush(); + m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 6) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_indirect(op, addr); - m_ir->SetInsertPoint(next); + const auto target = add_block_indirect(op, addr); + m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); } void BIHNZ(spu_opcode_t op) // { - flush(); + m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 6) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_indirect(op, addr); - m_ir->SetInsertPoint(next); + const auto target = add_block_indirect(op, addr); + m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); } void BI(spu_opcode_t op) // { + m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - flush(); - branch_indirect(op, addr); + + // Create jump table if necessary (TODO) + const auto tfound = m_targets.find(m_pos); + + if (!op.d && !op.e && tfound != m_targets.end() && (tfound->second.size() != 1 || tfound->second[0] != -1)) + { + // Shift aligned address for switch + const auto sw_arg = m_ir->CreateLShr(addr.value, 2, "", true); + + // Initialize jump table targets + std::map targets; + + for (u32 target : tfound->second) + { + if (m_block_info[target / 4]) + { + targets.emplace(target, add_block(target)); + } + } + + // Get jump table bounds (optimization) + const u32 start = targets.begin()->first; + const u32 end = targets.rbegin()->first + 4; + + // Emit switch instruction aiming for a jumptable in the end (indirectbr could guarantee it) + const auto sw = m_ir->CreateSwitch(sw_arg, llvm::BasicBlock::Create(m_context, "", m_function), (end - start) / 4); + + for (u32 pos = start; pos < end; pos += 4) + { + if 
(m_block_info[pos / 4] && targets.count(pos)) + { + const auto found = targets.find(pos); + + if (found != targets.end()) + { + sw->addCase(m_ir->getInt32(pos / 4), found->second); + continue; + } + } + + sw->addCase(m_ir->getInt32(pos / 4), sw->getDefaultDest()); + } + + // Exit function on unexpected target + m_ir->SetInsertPoint(sw->getDefaultDest()); + m_ir->CreateStore(addr.value, spu_ptr(&SPUThread::pc)); + m_ir->CreateRetVoid(); + } + else + { + // Simple indirect branch + m_ir->CreateBr(add_block_indirect(op, addr)); + } } void BISL(spu_opcode_t op) // { + m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - u32 values[4]{0, 0, 0, spu_branch_target(m_pos + 4)}; - value_t r; - r.value = llvm::ConstantDataVector::get(m_context, values); - set_vr(op.rt, r); - flush(); - branch_indirect(op, addr); + set_link(op); + m_ir->CreateBr(add_block_indirect(op, addr, false)); } void IRET(spu_opcode_t op) // { + m_block->block_end = m_ir->GetInsertBlock(); value_t srr0; srr0.value = m_ir->CreateLoad(spu_ptr(&SPUThread::srr0)); - flush(); - branch_indirect(op, srr0); + m_ir->CreateBr(add_block_indirect(op, srr0)); } void BISLED(spu_opcode_t op) // @@ -3193,76 +3971,48 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { const u32 target = spu_branch_target(m_pos, op.i16); - if (target == m_pos + 4) + if (target != m_pos + 4) { - return; + m_block->block_end = m_ir->GetInsertBlock(); + const auto cond = eval(extract(get_vr(op.rt), 3) == 0); + m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); } - - flush(); - const auto cond = eval(extract(get_vr(op.rt), 3) == 0); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_fixed(target); - m_ir->SetInsertPoint(next); } void BRNZ(spu_opcode_t op) // { const u32 target = spu_branch_target(m_pos, op.i16); - if (target == m_pos + 4) + if (target != m_pos + 4) { - return; + m_block->block_end = m_ir->GetInsertBlock(); + const auto cond = eval(extract(get_vr(op.rt), 3) != 0); + m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); } - - flush(); - const auto cond = eval(extract(get_vr(op.rt), 3) != 0); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_fixed(target); - m_ir->SetInsertPoint(next); } void BRHZ(spu_opcode_t op) // { const u32 target = spu_branch_target(m_pos, op.i16); - if (target == m_pos + 4) + if (target != m_pos + 4) { - return; + m_block->block_end = m_ir->GetInsertBlock(); + const auto cond = eval(extract(get_vr(op.rt), 6) == 0); + m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); } - - flush(); - const auto cond = eval(extract(get_vr(op.rt), 6) == 0); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_fixed(target); - m_ir->SetInsertPoint(next); } void BRHNZ(spu_opcode_t op) // { const u32 target = spu_branch_target(m_pos, op.i16); - if (target == m_pos + 4) + if (target != m_pos + 4) { - return; + m_block->block_end = m_ir->GetInsertBlock(); + const auto cond = 
eval(extract(get_vr(op.rt), 6) != 0); + m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); } - - flush(); - const auto cond = eval(extract(get_vr(op.rt), 6) != 0); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_fixed(target); - m_ir->SetInsertPoint(next); } void BRA(spu_opcode_t op) // @@ -3271,17 +4021,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (target != m_pos + 4) { - flush(); - branch_fixed(target); + m_block->block_end = m_ir->GetInsertBlock(); + m_ir->CreateBr(add_block(target)); } } void BRASL(spu_opcode_t op) // { - u32 values[4]{0, 0, 0, spu_branch_target(m_pos + 4)}; - value_t r; - r.value = llvm::ConstantDataVector::get(m_context, values); - set_vr(op.rt, r); + set_link(op); BRA(op); } @@ -3291,18 +4038,33 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (target != m_pos + 4) { - flush(); - branch_fixed(target); + m_block->block_end = m_ir->GetInsertBlock(); + m_ir->CreateBr(add_block(target)); } } void BRSL(spu_opcode_t op) // + { + set_link(op); + BR(op); + } + + void set_link(spu_opcode_t op) { u32 values[4]{0, 0, 0, spu_branch_target(m_pos + 4)}; value_t r; r.value = llvm::ConstantDataVector::get(m_context, values); set_vr(op.rt, r); - BR(op); + + if (g_cfg.core.spu_block_size != spu_block_size_type::safe && m_block_info[m_pos / 4 + 1] && m_entry_info[m_pos / 4 + 1]) + { + // Store the return function chunk address at the stack mirror + const auto func = add_function(m_pos + 4); + const auto stack0 = eval(zext(extract(get_vr(1), 3) & 0x3fff0) + ::offset32(&SPUThread::stack_mirror)); + const auto stack1 = eval(stack0 + 8); + m_ir->CreateStore(func, m_ir->CreateIntToPtr(m_ir->CreateAdd(m_thread, stack0.value), func->getType()->getPointerTo())); + m_ir->CreateStore(m_ir->getInt64(m_pos + 4), m_ir->CreateIntToPtr(m_ir->CreateAdd(m_thread, stack1.value), get_type())); + } } static const spu_decoder g_decoder; diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index d73401358759..18730933ff55 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -45,9 +45,15 @@ class spu_recompiler_base // List of possible targets for the instruction ({} = next instruction, {-1} = no targets) std::unordered_map, value_hash> m_targets; - // List of block predecessors (incomplete, doesn't include all fallthrough predecessors) + // List of block predecessors std::unordered_map, value_hash> m_preds; + // List of function entry points and return points (set after BRSL, BRASL, BISL, BISLED) + std::bitset<0x10000> m_entry_info; + + // Compressed address of unique entry point for each instruction + std::array m_entry_map{}; + std::shared_ptr m_cache; private: @@ -82,4 +88,7 @@ class spu_recompiler_base // Create recompiler instance (LLVM) static std::unique_ptr make_llvm_recompiler(); + + // Max number of registers (for m_regmod) + static constexpr u8 s_reg_max = 128; }; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 7391d8afd305..c3bf3ac33180 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -527,6 +527,8 @@ void SPUThread::cpu_task() jit_dispatcher[pc / 4](*this, vm::_ptr(offset), nullptr); } + // Print some stats + LOG_NOTICE(SPU, "Stats: block %u (fails: %u);", block_counter, block_failure); return; }
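
To summarize the header additions, here is a condensed sketch of the per-instruction analysis state and how call return points (after BRSL/BRASL/BISL/BISLED) could be marked as chunk entries. The element type of m_entry_map is not visible in this hunk, so u32 below is an assumption, and the helper methods are illustrative rather than the recompiler's actual interface:

    #include <array>
    #include <bitset>
    #include <cstdint>

    struct analysis_state
    {
        // One bit per possible instruction slot in the 256 KiB local storage.
        std::bitset<0x10000> entry_info;

        // Entry point owning each instruction (element type assumed).
        std::array<std::uint32_t, 0x10000> entry_map{};

        // Mark a function entry or call return point.
        void add_entry(std::uint32_t addr)
        {
            entry_info[addr / 4] = true;
            entry_map[addr / 4] = addr;
        }

        // True if a chunk may legitimately begin at this address; used above
        // to validate fixed branch targets and to place stack predictions.
        bool is_entry(std::uint32_t addr) const
        {
            return entry_info[addr / 4];
        }
    };
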