diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index 6bda27ba4c9f..7a3bdcac26fa 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -383,7 +383,7 @@ class ObjectCache final : public llvm::ObjectCache std::string name = m_path; name.append(module->getName()); fs::file(name, fs::rewrite).write(obj.getBufferStart(), obj.getBufferSize()); - LOG_SUCCESS(GENERAL, "LLVM: Created module: %s", module->getName().data()); + LOG_NOTICE(GENERAL, "LLVM: Created module: %s", module->getName().data()); } static std::unique_ptr load(const std::string& path) @@ -405,7 +405,7 @@ class ObjectCache final : public llvm::ObjectCache if (auto buf = load(path)) { - LOG_SUCCESS(GENERAL, "LLVM: Loaded module: %s", module->getName().data()); + LOG_NOTICE(GENERAL, "LLVM: Loaded module: %s", module->getName().data()); return buf; } diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 435aa7ddbf99..02ef95239295 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -1225,9 +1225,6 @@ extern void ppu_initialize() fmt::throw_exception("Failed to create cache directory: %s (%s)", _main->cache, fs::g_tls_error); } - // Initialize SPU cache - spu_cache::initialize(); - if (Emu.IsStopped()) { return; @@ -1248,6 +1245,9 @@ extern void ppu_initialize() { ppu_initialize(*ptr); } + + // Initialize SPU cache + spu_cache::initialize(); } extern void ppu_initialize(const ppu_module& info) diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index f68a6d2aa84d..9403836d18d7 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -7,6 +7,7 @@ #include "SPUThread.h" #include "SPUInterpreter.h" #include "Utilities/sysinfo.h" +#include "PPUAnalyser.h" #include #include @@ -32,6 +33,13 @@ std::unique_ptr spu_recompiler_base::make_asmjit_recompiler spu_runtime::spu_runtime() { + m_cache_path = fxm::check_unlocked()->cache; + + if (g_cfg.core.spu_debug) + { + fs::file(m_cache_path + "spu.log", fs::rewrite); + } + LOG_SUCCESS(SPU, "SPU Recompiler Runtime (ASMJIT) initialized..."); // Initialize lookup table @@ -97,7 +105,12 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) using namespace asmjit; SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode); - dis_asm.offset = reinterpret_cast(func.data() + 1) - func[0]; + dis_asm.offset = reinterpret_cast(func.data() + 1); + + if (g_cfg.core.spu_block_size != spu_block_size_type::giga) + { + dis_asm.offset -= func[0]; + } StringLogger logger; logger.addOptions(Logger::kOptionBinaryForm); @@ -163,15 +176,16 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Start compilation m_pos = func[0]; - const u32 start = m_pos; - const u32 end = m_pos + (func.size() - 1) * 4; + m_size = ::size32(func) * 4 - 4; + const u32 start = m_pos * (g_cfg.core.spu_block_size != spu_block_size_type::giga); + const u32 end = start + m_size; // Create instruction labels (TODO: some of them are unnecessary) for (u32 i = 1; i < func.size(); i++) { if (func[i]) { - instr_labels[i * 4 - 4 + m_pos] = c->newLabel(); + instr_labels[i * 4 - 4 + start] = c->newLabel(); } } @@ -210,15 +224,15 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) { // Disable check (unsafe) } - else if (func.size() - 1 == 1) + else if (m_size == 4) { - c->cmp(x86::dword_ptr(*ls, m_pos), func[1]); + c->cmp(x86::dword_ptr(*ls, start), func[1]); c->jnz(label_diff); } - else if (func.size() - 1 == 2) + else if (m_size == 8) { c->mov(*qw1, static_cast(func[2]) << 32 | func[1]); - 
c->cmp(*qw1, x86::qword_ptr(*ls, m_pos)); + c->cmp(*qw1, x86::qword_ptr(*ls, start)); c->jnz(label_diff); } else if (utils::has_512() && false) @@ -226,16 +240,15 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // AVX-512 optimized check using 512-bit registers (disabled) words_align = 64; - const u32 starta = m_pos & -64; + const u32 starta = start & -64; const u32 enda = ::align(end, 64); const u32 sizea = (enda - starta) / 64; verify(HERE), sizea; // Initialize pointers c->lea(x86::rax, x86::qword_ptr(label_code)); - c->lea(*qw1, x86::qword_ptr(*ls, starta)); u32 code_off = 0; - u32 ls_off = starta; + u32 ls_off = -8192; for (u32 j = starta; j < enda; j += 64) { @@ -246,6 +259,8 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) continue; } + const bool first = ls_off == -8192; + // Ensure small distance for disp8*N if (j - ls_off >= 8192) { @@ -279,7 +294,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) c->vmovdqa32(x86::zmm0, x86::zword_ptr(*qw1, j - ls_off)); } - if (j == starta) + if (first) { c->vpcmpud(x86::k1, x86::zmm0, x86::zword_ptr(x86::rax, code_off), 4); } @@ -291,7 +306,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = j; i < j + 64; i += 4) { - words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); } code_off += 64; @@ -305,7 +320,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // AVX-512 optimized check using 256-bit registers words_align = 32; - const u32 starta = m_pos & -32; + const u32 starta = start & -32; const u32 enda = ::align(end, 32); const u32 sizea = (enda - starta) / 32; verify(HERE), sizea; @@ -330,10 +345,10 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = starta; i < enda; i += 4) { - words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); } } - else if (sizea == 2 && (end - m_pos) <= 32) + else if (sizea == 2 && (end - start) <= 32) { const u32 cmask0 = get_code_mask(starta, starta + 32); const u32 cmask1 = get_code_mask(starta + 32, enda); @@ -347,7 +362,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = starta; i < starta + 32; i += 4) { - words.push_back(i >= m_pos ? func[(i - m_pos) / 4 + 1] : i + 32 < end ? func[(i + 32 - m_pos) / 4 + 1] : 0); + words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0); } } else @@ -356,9 +371,8 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Initialize pointers c->lea(x86::rax, x86::qword_ptr(label_code)); - c->lea(*qw1, x86::qword_ptr(*ls, starta)); u32 code_off = 0; - u32 ls_off = starta; + u32 ls_off = -4096; for (u32 j = starta; j < enda; j += 32) { @@ -369,6 +383,8 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) continue; } + const bool first = ls_off == -4096; + // Ensure small distance for disp8*N if (j - ls_off >= 4096) { @@ -398,7 +414,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) } // Perform bitwise comparison and accumulate - if (j == starta) + if (first) { c->vpxor(x86::ymm0, x86::ymm1, x86::yword_ptr(x86::rax, code_off)); } @@ -409,7 +425,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = j; i < j + 32; i += 4) { - words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? 
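/* Annotation (not part of the patch): the table built here mirrors the
   function's expected LS contents, with words outside [start, end) stored
   as zeros and masked off when loaded, so whole aligned chunks can be
   compared at once. A scalar sketch of what the generated SIMD check does
   (ls_word() is a hypothetical accessor for the big-endian word at i):

   u32 diff = 0;
   for (u32 i = start; i < end; i += 4)
   {
       diff |= ls_word(i) ^ func[(i - start) / 4 + 1]; // accumulate mismatches
   }
   if (diff != 0)
   {
       // code in LS changed: jump to label_diff and re-dispatch
   }
*/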
func[(i - start) / 4 + 1] : 0); } code_off += 32; @@ -424,7 +440,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Mainstream AVX words_align = 32; - const u32 starta = m_pos & -32; + const u32 starta = start & -32; const u32 enda = ::align(end, 32); const u32 sizea = (enda - starta) / 32; verify(HERE), sizea; @@ -449,10 +465,10 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = starta; i < enda; i += 4) { - words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); } } - else if (sizea == 2 && (end - m_pos) <= 32) + else if (sizea == 2 && (end - start) <= 32) { const u32 cmask0 = get_code_mask(starta, starta + 32); const u32 cmask1 = get_code_mask(starta + 32, enda); @@ -466,7 +482,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = starta; i < starta + 32; i += 4) { - words.push_back(i >= m_pos ? func[(i - m_pos) / 4 + 1] : i + 32 < end ? func[(i + 32 - m_pos) / 4 + 1] : 0); + words.push_back(i >= start ? func[(i - start) / 4 + 1] : i + 32 < end ? func[(i + 32 - start) / 4 + 1] : 0); } } else @@ -541,7 +557,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) for (u32 i = j; i < j + 32; i += 4) { - words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0); + words.push_back(i >= start && i < end ? func[(i - start) / 4 + 1] : 0); } code_off += 32; @@ -568,7 +584,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Compatible SSE2 words_align = 16; - const u32 starta = m_pos & -16; + const u32 starta = start & -16; const u32 enda = ::align(end, 16); const u32 sizea = (enda - starta) / 16; verify(HERE), sizea; @@ -614,10 +630,10 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) } // Determine which value will be duplicated at hole positions - const u32 w3 = func.at((j - m_pos + ~::cntlz32(cmask, true) % 4 * 4) / 4 + 1); - words.push_back(cmask & 1 ? func[(j - m_pos + 0) / 4 + 1] : w3); - words.push_back(cmask & 2 ? func[(j - m_pos + 4) / 4 + 1] : w3); - words.push_back(cmask & 4 ? func[(j - m_pos + 8) / 4 + 1] : w3); + const u32 w3 = func.at((j - start + ~::cntlz32(cmask, true) % 4 * 4) / 4 + 1); + words.push_back(cmask & 1 ? func[(j - start + 0) / 4 + 1] : w3); + words.push_back(cmask & 2 ? func[(j - start + 4) / 4 + 1] : w3); + words.push_back(cmask & 4 ? func[(j - start + 8) / 4 + 1] : w3); words.push_back(w3); // PSHUFD immediate table for all possible hole mask values, holes repeat highest valid word @@ -641,7 +657,9 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) 0b11100100, // full }; - const auto& dest = !order++ ? reg0 : reg1; + const bool first = !order++; + + const auto& dest = first ? 
reg0 : reg1; // Load aligned code block from LS if (cmask != 0xf) @@ -656,7 +674,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Perform bitwise comparison and accumulate c->xorps(dest, x86::dqword_ptr(x86::rax, code_off)); - if (j != starta && j != starta + 16) + if (first) { c->orps(reg0, dest); } @@ -690,24 +708,38 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) c->vzeroupper(); } - c->inc(SPU_OFF_64(block_counter)); + // Acknowledge success and add statistics + c->add(SPU_OFF_64(block_counter), ::size32(words) / (words_align / 4)); + + if (g_cfg.core.spu_block_size == spu_block_size_type::giga && m_pos != start) + { + // Jump to the entry point if necessary + c->jmp(instr_labels[m_pos]); + m_pos = -1; + } for (u32 i = 1; i < func.size(); i++) { const u32 pos = start + (i - 1) * 4; + const u32 op = se_storage::swap(func[i]); if (g_cfg.core.spu_debug) { // Disasm dis_asm.dump_pc = pos; dis_asm.disasm(pos); - compiler.comment(dis_asm.last_opcode.c_str()); - log += dis_asm.last_opcode; - log += '\n'; - } - // Get opcode - const u32 op = se_storage::swap(func[i]); + if (op) + { + log += '>'; + log += dis_asm.last_opcode; + log += '\n'; + } + else + { + fmt::append(log, ">[%08x] xx xx xx xx: \n", pos); + } + } if (!op) { @@ -738,6 +770,12 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) c->bind(found->second); } + if (g_cfg.core.spu_debug) + { + // Disasm inside the ASMJIT log + compiler.comment(dis_asm.last_opcode.c_str()); + } + // Execute recompiler function (this->*s_spu_decoder.decode(op))({op}); @@ -784,6 +822,10 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) c->align(kAlignData, 8); c->bind(instr_table); + // Get actual instruction table bounds + const u32 start = instr_labels.begin()->first; + const u32 end = instr_labels.rbegin()->first + 4; + for (u32 addr = start; addr < end; addr += 4) { const auto found = instr_labels.find(addr); @@ -825,6 +867,22 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) // Register function fn_location = fn; + if (g_cfg.core.spu_debug) + { + // Add ASMJIT logs + fmt::append(log, "Address: %p\n\n", fn); + log += logger.getString(); + log += "\n\n\n"; + + // Append log file + fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log); + } + + if (m_cache && g_cfg.core.spu_cache) + { + m_cache->add(func); + } + // Generate a dispatcher (übertrampoline) std::vector addrv{func[0]}; const auto beg = m_spurt->m_map.lower_bound(addrv); @@ -886,6 +944,12 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) it = it2; size1 = w.size - size2; + if (w.level >= w.beg->first.size()) + { + // Cannot split: smallest function is a prefix of bigger ones (TODO) + break; + } + const u32 x1 = w.beg->first.at(w.level); if (!x1) @@ -914,6 +978,20 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) } } + if (w.label.isValid()) + { + c->align(kAlignCode, 16); + c->bind(w.label); + } + + if (w.level >= w.beg->first.size()) + { + // If functions cannot be compared, assume smallest function + LOG_WARNING(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level); + c->jmp(imm_ptr(w.beg->second ? 
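/* Annotation (not part of the patch): this branch handles the case where one
   compiled function's contents are an exact prefix of a longer one's, so no
   instruction word exists at this level to tell them apart. The trampoline
   conservatively assumes the shortest candidate, conceptually:

   // jmp w.beg->second ? w.beg->second : &spu_recompiler_base::dispatch

   jumping to it if it is already compiled, and otherwise to dispatch, which
   analyses and compiles the code at runtime.
*/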
w.beg->second : &dispatch)); + continue; + } + // Value for comparison const u32 x = it->first.at(w.level); @@ -933,13 +1011,7 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) size2++; } - if (w.label.isValid()) - { - c->align(kAlignCode, 16); - c->bind(w.label); - } - - c->cmp(x86::dword_ptr(*ls, func[0] + (w.level - 1) * 4), x); + c->cmp(x86::dword_ptr(*ls, start + (w.level - 1) * 4), x); // Low subrange target label Label label_below; @@ -1044,22 +1116,6 @@ spu_function_t spu_recompiler::compile(std::vector&& func_rv) m_spurt->m_dispatcher[func[0] / 4] = tr; } - if (g_cfg.core.spu_debug) - { - // Add ASMJIT logs - fmt::append(log, "Address: %p (%p)\n\n", fn, +m_spurt->m_dispatcher[func[0] / 4]); - log += logger.getString(); - log += "\n\n\n"; - - // Append log file - fs::file(Emu.GetCachePath() + "SPUJIT.log", fs::write + fs::append).write(log); - } - - if (m_cache && g_cfg.core.spu_cache) - { - m_cache->add(func); - } - return fn; } @@ -1131,17 +1187,6 @@ static void check_state(SPUThread* _spu, spu_function_t _ret) _ret = &check_state_ret; } - if (g_cfg.core.spu_block_size != spu_block_size_type::safe) - { - // Get stack pointer, try to use native return address (check SPU return address) - const auto x = _spu->stack_mirror[(_spu->gpr[1]._u32[3] & 0x3fff0) >> 4]; - - if (x._u32[2] == _spu->pc) - { - _ret = reinterpret_cast(x._u64[0]); - } - } - _ret(*_spu, _spu->_ptr(0), nullptr); } @@ -1195,36 +1240,12 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) { using namespace asmjit; - if (g_cfg.core.spu_block_size != spu_block_size_type::giga && !jt) - { - // Simply external call (return or indirect call) - c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); - c->xor_(qw0->r32(), qw0->r32()); - } - else - { - if (!instr_table.isValid()) - { - // Request instruction table - instr_table = c->newLabel(); - } - - const u32 start = instr_labels.begin()->first; - const u32 end = instr_labels.rbegin()->first + 4; - - // Load indirect jump address, choose between local and external - c->lea(x86::r10, x86::qword_ptr(instr_table)); - c->lea(*qw1, x86::qword_ptr(*addr, 0 - start)); - c->xor_(qw0->r32(), qw0->r32()); - c->cmp(qw1->r32(), end - start); - c->cmovae(qw1->r32(), qw0->r32()); - c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0)); - c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); - } + // Initialize third arg to zero + c->xor_(qw0->r32(), qw0->r32()); if (op.d) { - c->lock().btr(SPU_OFF_8(interrupts_enabled), 0); + c->mov(SPU_OFF_8(interrupts_enabled), 0); } else if (op.e) { @@ -1232,7 +1253,7 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) Label intr = c->newLabel(); Label fail = c->newLabel(); - c->lock().bts(SPU_OFF_8(interrupts_enabled), 0); + c->mov(SPU_OFF_8(interrupts_enabled), 1); c->mov(qw1->r32(), SPU_OFF_32(ch_event_mask)); c->test(qw1->r32(), ~SPU_EVENT_INTR_IMPLEMENTED); c->jnz(fail); @@ -1244,19 +1265,50 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) c->mov(SPU_OFF_32(pc), *addr); c->mov(addr->r64(), reinterpret_cast(vm::base(0xffdead00))); c->mov(asmjit::x86::dword_ptr(addr->r64()), "INTR"_u32); + + // Save addr in srr0 and disable interrupts c->bind(intr); - c->lock().btr(SPU_OFF_8(interrupts_enabled), 0); + c->mov(SPU_OFF_8(interrupts_enabled), 0); c->mov(SPU_OFF_32(srr0), *addr); - c->mov(*addr, qw0->r32()); - c->mov(x86::r10, x86::qword_ptr(*cpu, 
offset32(&SPUThread::jit_dispatcher))); + + // Test for BR/BRA instructions (they are equivalent at zero pc) + c->mov(*addr, x86::dword_ptr(*ls)); + c->and_(*addr, 0xfffffffd); + c->xor_(*addr, 0x30); + c->bswap(*addr); + c->test(*addr, 0xff80007f); + c->cmovnz(*addr, qw0->r32()); + c->shr(*addr, 5); c->align(kAlignCode, 16); c->bind(no_intr); } - Label label_check = c->newLabel(); - c->mov(SPU_OFF_32(pc), *addr); - c->cmp(SPU_OFF_32(state), 0); - c->jnz(label_check); + if (!jt && g_cfg.core.spu_block_size != spu_block_size_type::giga) + { + // Simply external call (return or indirect call) + c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); + } + else + { + if (!instr_table.isValid()) + { + // Request instruction table + instr_table = c->newLabel(); + } + + // Get actual instruction table bounds + const u32 start = instr_labels.begin()->first; + const u32 end = instr_labels.rbegin()->first + 4; + + // Load indirect jump address, choose between local and external + c->lea(*qw1, x86::qword_ptr(addr->r64(), 0 - start)); + c->lea(x86::r10, x86::qword_ptr(instr_table)); + c->cmp(qw1->r32(), end - start); + c->lea(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0)); + c->lea(*qw1, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); + c->cmovae(x86::r10, *qw1); + c->mov(x86::r10, x86::qword_ptr(x86::r10)); + } if (g_cfg.core.spu_block_size != spu_block_size_type::safe && ret) { @@ -1268,6 +1320,10 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret) c->cmove(x86::r10, x86::qword_ptr(*qw1)); } + Label label_check = c->newLabel(); + c->mov(SPU_OFF_32(pc), *addr); + c->cmp(SPU_OFF_32(state), 0); + c->jnz(label_check); c->jmp(x86::r10); c->bind(label_check); c->mov(*ls, x86::r10); diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h index 6388cb157c89..ce43792c19b3 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h @@ -19,6 +19,9 @@ class spu_runtime // All dispatchers std::array, 0x10000> m_dispatcher; + // Debug module output location + std::string m_cache_path; + friend class spu_recompiler; public: diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index dd74eb00ade0..45330ad5eb33 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -90,14 +90,8 @@ void spu_cache::initialize() return; } - if (g_cfg.core.spu_decoder == spu_decoder_type::llvm) - { - // Force Safe mode - g_cfg.core.spu_block_size.from_default(); - } - // SPU cache file (version + block size type) - const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v3.dat"; + const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v4.dat"; auto cache = std::make_shared(loc); @@ -115,11 +109,6 @@ void spu_cache::initialize() if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit) { - if (g_cfg.core.spu_debug) - { - fs::file(Emu.GetCachePath() + "SPUJIT.log", fs::rewrite); - } - compiler = spu_recompiler_base::make_asmjit_recompiler(); } @@ -138,7 +127,12 @@ void spu_cache::initialize() // Fake LS std::vector> ls(0x10000); - // Initialize progress dialog + // Initialize progress dialog (wait for previous progress done) + while (g_progr_ptotal) + { + std::this_thread::sleep_for(5ms); + } + g_progr = "Building SPU cache..."; g_progr_ptotal += func_list.size(); @@ -151,8 +145,12 @@ void spu_cache::initialize() continue; 
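/* Annotation (not part of the patch): each cached entry is a vector of u32
   where func[0] holds the entry point and func[1..] the instruction words.
   With the Giga block size the code is addressed from LS offset 0, otherwise
   from func[0]; that is what the multiply-by-condition below computes. An
   equivalent, more explicit form:

   const u32 start = g_cfg.core.spu_block_size == spu_block_size_type::giga
       ? 0          // Giga: data is laid out from LS address 0
       : func[0];   // other modes: data is laid out from the entry point
*/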
} + // Get data start + const u32 start = func[0] * (g_cfg.core.spu_block_size != spu_block_size_type::giga); + const u32 size0 = ::size32(func); + // Initialize LS with function data only - for (u32 i = 1, pos = func[0]; i < func.size(); i++, pos += 4) + for (u32 i = 1, pos = start; i < size0; i++, pos += 4) { ls[pos / 4] = se_storage::swap(func[i]); } @@ -160,15 +158,15 @@ void spu_cache::initialize() // Call analyser std::vector func2 = compiler->block(ls.data(), func[0]); - if (func2.size() != func.size()) + if (func2.size() != size0) { - LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, func.size() - 1); + LOG_ERROR(SPU, "[0x%05x] SPU Analyser failed, %u vs %u", func2[0], func2.size() - 1, size0 - 1); } compiler->compile(std::move(func)); // Clear fake LS - for (u32 i = 1, pos = func2[0]; i < func2.size(); i++, pos += 4) + for (u32 i = 1, pos = start; i < func2.size(); i++, pos += 4) { if (se_storage::swap(func2[i]) != ls[pos / 4]) { @@ -178,6 +176,11 @@ void spu_cache::initialize() ls[pos / 4] = 0; } + if (func2.size() != size0) + { + std::memset(ls.data(), 0, 0x40000); + } + g_progr_pdone++; } @@ -236,11 +239,22 @@ void spu_recompiler_base::dispatch(SPUThread& spu, void*, u8* rip) // Compile verify(HERE), spu.jit->compile(spu.jit->block(spu._ptr(0), spu.pc)); spu.jit_dispatcher[spu.pc / 4] = spu.jit->get(spu.pc); + + // Diagnostic + if (g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + const v128 _info = spu.stack_mirror[(spu.gpr[1]._u32[3] & 0x3fff0) >> 4]; + + if (_info._u64[0] != -1) + { + LOG_TRACE(SPU, "Called from 0x%x", _info._u32[2] - 4); + } + } } void spu_recompiler_base::branch(SPUThread& spu, void*, u8* rip) { - // Compile + // Compile (TODO: optimize search of the existing functions) const auto func = verify(HERE, spu.jit->compile(spu.jit->block(spu._ptr(0), spu.pc))); spu.jit_dispatcher[spu.pc / 4] = spu.jit->get(spu.pc); @@ -282,24 +296,27 @@ void spu_recompiler_base::branch(SPUThread& spu, void*, u8* rip) #endif } -std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) +std::vector spu_recompiler_base::block(const be_t* ls, u32 entry_point) { // Result: addr + raw instruction data std::vector result; result.reserve(256); - result.push_back(lsa); + result.push_back(entry_point); // Initialize block entries m_block_info.reset(); - m_block_info.set(lsa / 4); + m_block_info.set(entry_point / 4); + m_entry_info.reset(); + m_entry_info.set(entry_point / 4); // Simple block entry workload list - std::vector wl; - wl.push_back(lsa); + std::vector workload; + workload.push_back(entry_point); - m_regmod.fill(0xff); + std::memset(m_regmod.data(), 0xff, sizeof(m_regmod)); m_targets.clear(); m_preds.clear(); + m_preds[entry_point]; // Value flags (TODO) enum class vf : u32 @@ -316,46 +333,72 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) // Associated constant values for 32-bit preferred slot std::array values; - for (u32 wi = 0; wi < wl.size();) + // SYNC instruction found + bool sync = false; + + u32 hbr_loc = 0; + u32 hbr_tg = -1; + + // Result bounds + u32 lsa = entry_point; + u32 limit = 0x40000; + + if (g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + // In Giga mode, all data starts from the address 0 + lsa = 0; + } + + for (u32 wi = 0, wa = workload[0]; wi < workload.size();) { const auto next_block = [&] { // Reset value information vflags.fill({}); + sync = false; + hbr_loc = 0; + hbr_tg = -1; wi++; + + if (wi < workload.size()) + { + wa = workload[wi]; + } }; - const u32 pos = 
wl[wi]; + const u32 pos = wa; const auto add_block = [&](u32 target) { - // Verify validity of the new target (TODO) - if (target > lsa) + // Validate new target (TODO) + if (target > lsa && target < limit) { // Check for redundancy if (!m_block_info[target / 4]) { m_block_info[target / 4] = true; - wl.push_back(target); + workload.push_back(target); } - // Add predecessor (check if already exists) - for (u32 pred : m_preds[target]) + // Add predecessor + if (m_preds[target].find_first_of(pos) == -1) { - if (pred == pos) - { - return; - } + m_preds[target].push_back(pos); } - - m_preds[target].push_back(pos); } }; + if (pos < lsa || pos >= limit) + { + // Don't analyse if already beyond the limit + next_block(); + continue; + } + const u32 data = ls[pos / 4]; const auto op = spu_opcode_t{data}; - wl[wi] += 4; + wa += 4; m_targets.erase(pos); @@ -371,7 +414,6 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) { // Stop before invalid instructions (TODO) m_targets[pos].push_back(-1); - m_block_info[pos / 4] = true; next_block(); continue; } @@ -381,11 +423,10 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) case spu_itype::STOP: case spu_itype::STOPD: { - if (data == 0 || data == 3) + if (data == 0) { // Stop before null data m_targets[pos].push_back(-1); - m_block_info[pos / 4] = true; next_block(); continue; } @@ -398,11 +439,22 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) break; } + if (type == spu_itype::SYNC) + { + // Remember + sync = true; + } + break; } case spu_itype::IRET: { + if (op.d && op.e) + { + LOG_ERROR(SPU, "[0x%x] Invalid interrupt flags (DE)", pos); + } + m_targets[pos].push_back(-1); next_block(); break; @@ -410,15 +462,22 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) case spu_itype::BI: case spu_itype::BISL: + case spu_itype::BISLED: case spu_itype::BIZ: case spu_itype::BINZ: case spu_itype::BIHZ: case spu_itype::BIHNZ: { + if (op.d && op.e) + { + LOG_ERROR(SPU, "[0x%x] Invalid interrupt flags (DE)", pos); + } + const auto af = vflags[op.ra]; const auto av = values[op.ra]; + const bool sl = type == spu_itype::BISL || type == spu_itype::BISLED; - if (type == spu_itype::BISL) + if (sl) { m_regmod[pos / 4] = op.rt; vflags[op.rt] = +vf::is_const; @@ -428,23 +487,66 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) if (test(af, vf::is_const)) { const u32 target = spu_branch_target(av); - LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target); if (target == pos + 4) { // Nop (unless BISL) - LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to next!", lsa, pos); + LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to next!", result[0], pos); + } + else + { + LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", result[0], pos, target); } m_targets[pos].push_back(target); - if (type != spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::giga) + if (!sl) + { + if (sync) + { + LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring branch to 0x%x (SYNC)", result[0], pos, target); + + if (entry_point < target) + { + limit = std::min(limit, target); + } + } + else + { + if (op.d || op.e) + { + m_entry_info[target / 4] = true; + } + + add_block(target); + } + } + + if (sl && g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + if (sync) + { + LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring call to 0x%x (SYNC)", result[0], pos, target); + + if (target > entry_point) + { + limit = std::min(limit, target); + } + } + else + { + m_entry_info[target / 4] = true; 
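/* Annotation (not part of the patch): marking m_entry_info here is what later
   partitions the compilation unit into separately callable function chunks;
   call targets and the return point after a call both become chunk entries.
   Branch targets are normalized with spu_branch_target(), which (assuming
   the helper defined elsewhere in the codebase) wraps within the 256 KB
   local storage and keeps word alignment, roughly:

   constexpr u32 spu_branch_target(u32 pc, u32 imm = 0)
   {
       return (pc + (imm << 2)) & 0x3fffc; // word-aligned, wrapped to LS size
   }
*/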
+ add_block(target); + } + } + else if (sl && target > entry_point) { - add_block(target); + limit = std::min(limit, target); } - if (type == spu_itype::BISL && g_cfg.core.spu_block_size != spu_block_size_type::safe) + if (sl && g_cfg.core.spu_block_size != spu_block_size_type::safe) { + m_entry_info[pos / 4 + 1] = true; m_targets[pos].push_back(pos + 4); add_block(pos + 4); } @@ -455,7 +557,6 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) std::basic_string jt_abs; std::basic_string jt_rel; const u32 start = pos + 4; - const u32 limit = 0x40000; u64 dabs = 0; u64 drel = 0; @@ -469,13 +570,13 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) break; } - if (target >= lsa && target < limit) + if (target >= lsa && target < 0x40000) { // Possible jump table entry (absolute) jt_abs.push_back(target); } - if (target + start >= lsa && target + start < limit) + if (target + start >= lsa && target + start < 0x40000) { // Possible jump table entry (relative) jt_rel.push_back(target + start); @@ -528,6 +629,7 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) { add_block(jt_abs[i]); result[(start - lsa) / 4 + 1 + i] = se_storage::swap(jt_abs[i]); + m_targets[start + i * 4].push_back(-1); } m_targets.emplace(pos, std::move(jt_abs)); @@ -546,14 +648,40 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) { add_block(jt_rel[i]); result[(start - lsa) / 4 + 1 + i] = se_storage::swap(jt_rel[i] - start); + m_targets[start + i * 4].push_back(-1); } m_targets.emplace(pos, std::move(jt_rel)); } } + else if (start + 12 * 4 < limit && + ls[start / 4 + 0] == 0x1ce00408 && + ls[start / 4 + 1] == 0x24000389 && + ls[start / 4 + 2] == 0x24004809 && + ls[start / 4 + 3] == 0x24008809 && + ls[start / 4 + 4] == 0x2400c809 && + ls[start / 4 + 5] == 0x24010809 && + ls[start / 4 + 6] == 0x24014809 && + ls[start / 4 + 7] == 0x24018809 && + ls[start / 4 + 8] == 0x1c200807 && + ls[start / 4 + 9] == 0x2401c809) + { + LOG_WARNING(SPU, "[0x%x] Pattern 1 detected (hbr=0x%x:0x%x)", pos, hbr_loc, hbr_tg); + + // Add 8 targets (TODO) + for (u32 addr = start + 4; addr < start + 36; addr += 4) + { + m_targets[pos].push_back(addr); + add_block(addr); + } + } + else if (hbr_loc > start && hbr_loc < limit && hbr_tg == start) + { + LOG_WARNING(SPU, "[0x%x] No patterns detected (hbr=0x%x:0x%x)", pos, hbr_loc, hbr_tg); + } } - if (type == spu_itype::BI || type == spu_itype::BISL) + if (type == spu_itype::BI || sl) { if (type == spu_itype::BI || g_cfg.core.spu_block_size == spu_block_size_type::safe) { @@ -564,6 +692,7 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) } else { + m_entry_info[pos / 4 + 1] = true; m_targets[pos].push_back(pos + 4); add_block(pos + 4); } @@ -597,14 +726,28 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) if (g_cfg.core.spu_block_size != spu_block_size_type::safe) { + m_entry_info[pos / 4 + 1] = true; m_targets[pos].push_back(pos + 4); add_block(pos + 4); } - if (g_cfg.core.spu_block_size == spu_block_size_type::giga) + if (g_cfg.core.spu_block_size == spu_block_size_type::giga && !sync) { + m_entry_info[target / 4] = true; add_block(target); } + else + { + if (g_cfg.core.spu_block_size == spu_block_size_type::giga) + { + LOG_NOTICE(SPU, "[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", result[0], pos, target); + } + + if (target > entry_point) + { + limit = std::min(limit, target); + } + } next_block(); break; @@ -644,9 +787,6 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) case 
spu_itype::HGTI: case spu_itype::HLGT: case spu_itype::HLGTI: - case spu_itype::HBR: - case spu_itype::HBRA: - case spu_itype::HBRR: case spu_itype::LNOP: case spu_itype::NOP: case spu_itype::MTSPR: @@ -661,6 +801,27 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) break; } + case spu_itype::HBR: + { + hbr_loc = spu_branch_target(pos, op.roh << 7 | op.rt); + hbr_tg = test(vflags[op.ra], vf::is_const) && !op.c ? values[op.ra] & 0x3fffc : -1; + break; + } + + case spu_itype::HBRA: + { + hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt); + hbr_tg = spu_branch_target(0x0, op.i16); + break; + } + + case spu_itype::HBRR: + { + hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt); + hbr_tg = spu_branch_target(pos, op.i16); + break; + } + case spu_itype::IL: { m_regmod[pos / 4] = op.rt; @@ -812,12 +973,12 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) } } - while (g_cfg.core.spu_block_size != spu_block_size_type::giga) + while (g_cfg.core.spu_block_size != spu_block_size_type::giga || limit < 0x40000) { const u32 initial_size = result.size(); - // Check unreachable blocks in safe and mega modes (TODO) - u32 limit = lsa + result.size() * 4 - 4; + // Check unreachable blocks + limit = std::min(limit, lsa + initial_size * 4 - 4); for (auto& pair : m_preds) { @@ -839,8 +1000,8 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) { for (u32 j = workload[i];; j -= 4) { - // Go backward from an address until the entry point (=lsa) is reached - if (j == lsa) + // Go backward from an address until the entry point is reached + if (j == result[0]) { reachable = true; break; @@ -906,21 +1067,20 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) { if (result[i] == 0) { - const u32 pos = lsa + (i - 1) * 4; + const u32 pos = lsa + (i - 1) * 4; const u32 data = ls[pos / 4]; // Allow only NOP or LNOP instructions in holes if (data == 0x200000 || (data & 0xffffff80) == 0x40200000) { - if (i + 1 < result.size()) - { - result[i] = se_storage::swap(data); - continue; - } + continue; } - result.resize(valid_size + 1); - break; + if (g_cfg.core.spu_block_size != spu_block_size_type::giga) + { + result.resize(valid_size + 1); + break; + } } else { @@ -928,6 +1088,9 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) } } + // Even if NOP or LNOP, should be removed at the end + result.resize(valid_size + 1); + // Repeat if blocks were removed if (result.size() == initial_size) { @@ -935,6 +1098,188 @@ std::vector spu_recompiler_base::block(const be_t* ls, u32 lsa) } } + limit = std::min(limit, lsa + ::size32(result) * 4 - 4); + + // Cleanup block info + for (u32 i = 0; i < workload.size(); i++) + { + const u32 addr = workload[i]; + + if (addr < lsa || addr >= limit || !result[(addr - lsa) / 4 + 1]) + { + m_block_info[addr / 4] = false; + m_entry_info[addr / 4] = false; + m_preds.erase(addr); + } + } + + // Complete m_preds and associated m_targets for adjacent blocks + for (auto& pair : m_preds) + { + // Erase impossible predecessors + const auto new_end = std::remove_if(pair.second.begin(), pair.second.end(), [&](u32 addr) + { + return addr < lsa || addr >= limit; + }); + + pair.second.erase(new_end, pair.second.end()); + + // Don't add fallthrough target if all predecessors are removed + if (pair.second.empty() && !m_entry_info[pair.first / 4]) + { + // If not an entry point, remove the block completely + m_block_info[pair.first / 4] = false; + continue; + } + + // Previous instruction address + const u32 prev = (pair.first - 4) & 
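/* Annotation (not part of the patch): the 0x3fffc mask keeps the address
   word-aligned inside the 256 KB local storage, so computing the previous
   instruction of the block at LS address 0 wraps instead of underflowing.
   This pass adds the implicit fall-through edge from the previous
   instruction when no explicit target list was recorded for it. */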
0x3fffc; + + const auto tfound = m_targets.find(prev); + + // TODO: can it be empty? + if (tfound == m_targets.end() || tfound->second.empty()) + { + // TODO: check the correctness + if (prev >= lsa && prev < limit && result[(prev - lsa) / 4 + 1]) + { + // Add target and the predecessor + m_targets[prev].push_back(pair.first); + pair.second.push_back(prev); + } + } + } + + // Erase unreachable targets + for (auto& pair : m_targets) + { + // Erase unreachable targets + const auto new_end = std::remove_if(pair.second.begin(), pair.second.end(), [&](u32 addr) + { + if (addr >> 31) + { + return false; + } + + return addr < lsa || addr >= limit; + }); + + pair.second.erase(new_end, pair.second.end()); + + if (pair.second.empty()) + { + // Add default no-target + pair.second.push_back(-1); + } + } + + // Fill holes which contain only NOP and LNOP instructions + for (u32 i = 1, nnop = 0, vsize = 0; i <= result.size(); i++) + { + if (i >= result.size() || result[i]) + { + if (nnop && nnop == i - vsize - 1) + { + // Write only complete NOP sequence + for (u32 j = vsize + 1; j < i; j++) + { + result[j] = se_storage::swap(ls[lsa / 4 + j - 1]); + } + } + + nnop = 0; + vsize = i; + } + else + { + const u32 pos = lsa + (i - 1) * 4; + const u32 data = ls[pos / 4]; + + if (data == 0x200000 || (data & 0xffffff80) == 0x40200000) + { + nnop++; + } + } + } + + // Fill entry map, add entry points + while (g_cfg.core.spu_block_size != spu_block_size_type::safe) + { + workload.clear(); + workload.push_back(entry_point); + std::memset(m_entry_map.data(), 0, sizeof(m_entry_map)); + + std::basic_string new_entries; + + for (u32 wi = 0; wi < workload.size(); wi++) + { + const u32 addr = workload[wi]; + const u16 _new = m_entry_map[addr / 4]; + + if (!m_entry_info[addr / 4]) + { + // Check block predecessors + for (u32 pred : m_preds[addr]) + { + const u16 _old = m_entry_map[pred / 4]; + + if (_old && _old != _new) + { + // If block has multiple 'entry' points, it becomes an entry point itself + new_entries.push_back(addr); + } + } + } + + // Fill value + const u16 root = m_entry_info[addr / 4] ? ::narrow(addr / 4) : _new; + + for (u32 wa = addr; wa < limit && result[(wa - lsa) / 4 + 1]; wa += 4) + { + // Fill entry address for the instruction + m_entry_map[wa / 4] = root; + + // Find targets (also means end of the block) + const auto tfound = m_targets.find(wa); + + if (tfound == m_targets.end() || tfound->second.empty() || tfound->second[0] == -1) + { + continue; + } + + for (u32 target : tfound->second) + { + const u16 value = m_entry_info[target / 4] ? 
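/* Annotation (not part of the patch): m_entry_map tags every LS word with the
   entry index (address / 4, narrowed to u16) of the chunk that owns it. A
   block reachable under two different tags cannot belong to a single chunk,
   so it is pushed to new_entries and promoted to an entry itself; the outer
   while loop repeats until the partition is stable. The tag chosen right
   here is, more verbosely:

   const u16 value = m_entry_info[target / 4]
       ? ::narrow<u16>(target / 4) // target starts its own chunk
       : root;                     // target inherits the current chunk's tag
*/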
::narrow(target / 4) : root; + + if (u16& tval = m_entry_map[target / 4]) + { + if (tval != value && !m_entry_info[target / 4]) + { + new_entries.push_back(target); + } + } + else + { + tval = value; + workload.emplace_back(target); + } + } + + break; + } + } + + if (new_entries.empty()) + { + break; + } + + for (u32 entry : new_entries) + { + m_entry_info[entry / 4] = true; + } + } + if (result.size() == 1) { // Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback @@ -988,13 +1333,13 @@ class spu_llvm_runtime m_map[std::vector()] = &spu_recompiler_base::dispatch; // Clear LLVM output - m_cache_path = fxm::check_unlocked()->cache + "llvm/"; - fs::create_dir(m_cache_path); - fs::remove_all(m_cache_path, false); + m_cache_path = fxm::check_unlocked()->cache; + fs::create_dir(m_cache_path + "llvm/"); + fs::remove_all(m_cache_path + "llvm/", false); if (g_cfg.core.spu_debug) { - fs::file(m_cache_path + "../spu.log", fs::rewrite); + fs::file(m_cache_path + "spu.log", fs::rewrite); } LOG_SUCCESS(SPU, "SPU Recompiler Runtime (LLVM) initialized..."); @@ -1005,19 +1350,158 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { std::shared_ptr m_spurt; + // Current function (chunk) llvm::Function* m_function; - using m_module = void; + // Current function chunk entry point + u32 m_entry; llvm::Value* m_thread; llvm::Value* m_lsptr; - llvm::BasicBlock* m_stop; + // Pointers to registers in the thread context + std::array m_reg_addr; + + // Global variable (function table) + llvm::GlobalVariable* m_function_table{}; + + struct block_info + { + // Current block's entry block + llvm::BasicBlock* block; + + // Final block (for PHI nodes, set after completion) + llvm::BasicBlock* block_end{}; + + // Regmod compilation (TODO) + std::bitset mod; + + // List of actual predecessors + std::basic_string preds; + + // Current register values + std::array reg{}; + + // PHI nodes created for this block (if any) + std::array phi{}; + + // Store instructions + std::array store{}; + }; + + // Current block + block_info* m_block; + + // All blocks in the current function chunk + std::unordered_map> m_blocks; + + // Block list for processing + std::vector m_block_queue; + + // All function chunks in current SPU compile unit + std::unordered_map> m_functions; + + // Function chunk list for processing + std::vector m_function_queue; + + // Helper + std::vector m_scan_queue; + + // Add or get the function chunk + llvm::Function* add_function(u32 addr) + { + // Get function chunk name + const std::string name = fmt::format("spu-chunk-0x%05x", addr); + llvm::Function* result = llvm::cast(m_module->getOrInsertFunction(name, get_type(), get_type(), get_type(), get_type())); + + // Set parameters + result->setLinkage(llvm::GlobalValue::InternalLinkage); + + // Enqueue if necessary + if (m_functions.emplace(addr, result).second) + { + m_function_queue.push_back(addr); + } + + return result; + } + + void set_function(llvm::Function* func) + { + m_function = func; + m_thread = &*func->arg_begin(); + m_lsptr = &*(func->arg_begin() + 1); + + m_reg_addr.fill(nullptr); + m_block = nullptr; + m_blocks.clear(); + m_block_queue.clear(); + m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", m_function)); + } + + // Add block with current block as a predecessor + llvm::BasicBlock* add_block(u32 target) + { + // Check the predecessor + const bool pred_found = m_block_info[target / 4] && m_preds[target].find_first_of(m_pos) != -1; + + if 
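/* Annotation (not part of the patch): add_block() resolves a branch target in
   one of four ways: the first block of a chunk proceeds normally; a target
   that is another chunk's entry becomes a tail call to that chunk's
   function; a target with no recorded predecessor edge leaves through the
   per-word jit_dispatcher table; otherwise a plain BasicBlock is created or
   reused and PHI inputs are appended for the current register values. */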
(m_blocks.empty()) + { + // Special case: first block, proceed normally + } + else if (m_block_info[target / 4] && m_entry_info[target / 4] && !(pred_found && m_entry == target)) + { + // Generate a tail call to the function chunk + const auto cblock = m_ir->GetInsertBlock(); + const auto result = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->SetInsertPoint(result); + m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&SPUThread::pc)); + tail(add_function(target)); + m_ir->SetInsertPoint(cblock); + return result; + } + else if (!pred_found || !m_block_info[target / 4]) + { + if (m_block_info[target / 4]) + { + LOG_ERROR(SPU, "[0x%x] Predecessor not found for target 0x%x (chunk=0x%x, entry=0x%x, size=%u)", m_pos, target, m_entry, m_function_queue[0], m_size / 4); + } + + // Generate external indirect tail call + const auto cblock = m_ir->GetInsertBlock(); + const auto result = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->SetInsertPoint(result); + m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&SPUThread::pc)); + const auto addr = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher) + target * 2)); + const auto type = llvm::FunctionType::get(get_type(), {get_type(), get_type(), get_type()}, false)->getPointerTo()->getPointerTo(); + tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(addr, type))); + m_ir->SetInsertPoint(cblock); + return result; + } + + auto& result = m_blocks[target].block; + + if (!result) + { + result = llvm::BasicBlock::Create(m_context, fmt::format("b-0x%x", target), m_function); - std::array, 128> m_gpr; - std::array m_flush_gpr; + // Add the block to the queue + m_block_queue.push_back(target); + } + else if (m_block && m_blocks[target].block_end) + { + // Connect PHI nodes if necessary + for (u32 i = 0; i < s_reg_max; i++) + { + if (const auto phi = m_blocks[target].phi[i]) + { + phi->addIncoming(get_vr(i).value, m_block->block_end); + } + } + } - std::map m_instr_map; + return result; + } template llvm::Value* _ptr(llvm::Value* base, u32 offset, std::string name = "") @@ -1033,89 +1517,164 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return _ptr(m_thread, ::offset32(offset_args...)); } - template - auto& init_vr(u32 index) + template + llvm::Value* spu_ptr(value_t add, Args... 
offset_args) { - auto& gpr = m_gpr.at(index); + const auto off = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(offset_args...))); + const auto ptr = m_ir->CreateIntToPtr(m_ir->CreateAdd(off, add.value), get_type()->getPointerTo()); + return ptr; + } - if (!gpr.first) + llvm::Value* init_vr(u32 index) + { + if (!m_reg_addr.at(index)) { // Save and restore current insert point if necessary const auto block_cur = m_ir->GetInsertBlock(); - // Emit register pointer at the beginning of function - m_ir->SetInsertPoint(&*m_function->begin()->getFirstInsertionPt()); - gpr.first = _ptr(m_thread, ::offset32(&SPUThread::gpr, index), fmt::format("Reg$%u", index)); + // Emit register pointer at the beginning of the function chunk + m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); + m_reg_addr[index] = _ptr(m_thread, ::offset32(&SPUThread::gpr, index), fmt::format("Reg$%u", index)); m_ir->SetInsertPoint(block_cur); } - return gpr; + return m_reg_addr[index]; } template value_t get_vr(u32 index) { - auto& gpr = init_vr(index); - - if (!gpr.second) + if (!m_block->reg.at(index)) { - gpr.second = m_ir->CreateLoad(gpr.first, fmt::format("Load$%u", index)); + // Load register value if necessary + m_block->reg[index] = m_ir->CreateLoad(init_vr(index), fmt::format("Load$%u", index)); } value_t r; - r.value = m_ir->CreateBitCast(gpr.second, get_type()); + r.value = m_ir->CreateBitCast(m_block->reg[index], get_type()); return r; } template void set_vr(u32 index, T expr) { - auto& gpr = init_vr(index); + // Check + verify(HERE), m_regmod[m_pos / 4] == index; - gpr.second = expr.eval(m_ir); + // Set register value + m_block->reg.at(index) = expr.eval(m_ir); - // Remember last insertion point for flush - if (m_ir->GetInsertBlock()->empty()) - { - // Insert dummy instruction if empty - m_flush_gpr.at(index) = llvm::cast(m_ir->CreateAdd(m_thread, m_ir->getInt64(8))); - } - else + // Get register location + const auto addr = init_vr(index); + + // Erase previous dead store instruction if necessary + if (m_block->store[index]) { - m_flush_gpr.at(index) = m_ir->GetInsertBlock()->end()->getPrevNode(); + // TODO: better cross-block dead store elimination + m_block->store[index]->eraseFromParent(); } + + // Write register to the context + m_block->store[index] = m_ir->CreateStore(m_ir->CreateBitCast(m_block->reg[index], addr->getType()->getPointerElementType()), addr); } - void flush(std::pair& reg, llvm::Instruction*& flush_reg) + // Return either basic block addr with single dominating value, or negative number of PHI entries + u32 find_reg_origin(u32 addr, u32 index) { - if (reg.first && reg.second && flush_reg) + u32 result = -1; + + // Handle entry point specially + if (m_entry_info[addr / 4]) { - // Save and restore current insert point if necessary - const auto block_cur = m_ir->GetInsertBlock(); + result = addr; + } + + // Used for skipping blocks from different chunks + const u16 root = ::narrow(g_cfg.core.spu_block_size == spu_block_size_type::safe ? 
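/* Annotation (not part of the patch): as its comment above states,
   find_reg_origin() walks the predecessor graph looking for blocks that
   modify register 'index' (via m_regmod). A non-negative result is the
   address of a single dominating definition whose value can be forwarded
   directly; a negative result encodes the number of distinct origins, which
   the caller turns into a PHI node:

   const u32 src = find_reg_origin(baddr, i);
   if (src >> 31)
   {
       m_block->phi[i] = m_ir->CreatePHI(type, 0 - src); // one slot per origin
   }
*/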
0 : m_entry / 4); + + // List of predecessors to check + m_scan_queue.clear(); - // Try to emit store immediately after its last use - if (const auto next = flush_reg->getNextNode()) + const auto pfound = m_preds.find(addr); + + if (pfound != m_preds.end()) + { + for (u32 pred : pfound->second) { - m_ir->SetInsertPoint(next); + if (m_entry_map[pred / 4] == root) + { + m_scan_queue.push_back(pred); + } } - - m_ir->CreateStore(m_ir->CreateBitCast(reg.second, reg.first->getType()->getPointerElementType()), reg.first); - m_ir->SetInsertPoint(block_cur); } - // Unregister store - flush_reg = nullptr; + // TODO: allow to avoid untouched registers in some cases + bool regmod_any = result == -1; - // Invalidate current value (TODO) - reg.second = nullptr; - } + for (u32 i = 0; i < m_scan_queue.size(); i++) + { + // Find whether the block modifies the selected register + bool regmod = false; - void flush() - { - for (u32 i = 0; i < 128; i++) + for (addr = m_scan_queue[i];; addr -= 4) + { + if (index == m_regmod[addr / 4]) + { + regmod = true; + regmod_any = true; + } + + const auto pfound = m_preds.find(addr); + + if (pfound == m_preds.end()) + { + continue; + } + + if (!regmod) + { + // Enqueue predecessors if register is not modified there + for (u32 pred : pfound->second) + { + if (m_entry_map[pred / 4] != root) + { + continue; + } + + // TODO + if (std::find(m_scan_queue.cbegin(), m_scan_queue.cend(), pred) == m_scan_queue.cend()) + { + m_scan_queue.push_back(pred); + } + } + } + + break; + } + + if (regmod || m_entry_info[addr / 4]) + { + if (result == -1) + { + result = addr; + } + else if (result >> 31) + { + result--; + } + else + { + result = -2; + } + } + } + + if (!regmod_any) { - flush(m_gpr[i], m_flush_gpr[i]); + result = addr; } + + return result; } void update_pc() @@ -1123,6 +1682,22 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->CreateStore(m_ir->getInt32(m_pos), spu_ptr(&SPUThread::pc)); } + // Call cpu_thread::check_state if necessary and return or continue (full check) + void check_state(u32 addr) + { + const auto pstate = spu_ptr(&SPUThread::state); + const auto _body = llvm::BasicBlock::Create(m_context, "", m_function); + const auto check = llvm::BasicBlock::Create(m_context, "", m_function); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_ir->CreateLoad(pstate), m_ir->getInt32(0)), _body, check); + m_ir->SetInsertPoint(check); + m_ir->CreateStore(m_ir->getInt32(addr), spu_ptr(&SPUThread::pc)); + m_ir->CreateCondBr(call(&exec_check_state, m_thread), stop, _body); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); + m_ir->SetInsertPoint(_body); + } + // Perform external call template llvm::CallInst* call(RT(*_func)(FArgs...), Args... args) @@ -1253,12 +1828,25 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator fmt::append(hash, "spu-0x%05x-%s", func[0], fmt::base57(output)); } - LOG_NOTICE(SPU, "Building function 0x%x... (size %u, %s)", func[0], func.size() - 1, hash); + if (m_cache) + { + LOG_SUCCESS(SPU, "LLVM: Building %s (size %u)...", hash, func.size() - 1); + } + else + { + LOG_NOTICE(SPU, "Building function 0x%x... 
(size %u, %s)", func[0], func.size() - 1, hash); + } using namespace llvm; SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode); - dis_asm.offset = reinterpret_cast(func.data() + 1) - func[0]; + dis_asm.offset = reinterpret_cast(func.data() + 1); + + if (g_cfg.core.spu_block_size != spu_block_size_type::giga) + { + dis_asm.offset -= func[0]; + } + std::string log; if (g_cfg.core.spu_debug) @@ -1268,56 +1856,36 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Create LLVM module std::unique_ptr module = std::make_unique(hash + ".obj", m_context); - - // Initialize target module->setTargetTriple(Triple::normalize(sys::getProcessTriple())); - - // Initialize pass manager - legacy::FunctionPassManager pm(module.get()); - - // Basic optimizations - pm.add(createEarlyCSEPass()); - pm.add(createDeadStoreEliminationPass()); - pm.add(createLintPass()); // Check - - // Add function - const auto main_func = cast(module->getOrInsertFunction(hash, get_type(), get_type(), get_type())); - m_function = main_func; - m_thread = &*m_function->arg_begin(); - m_lsptr = &*(m_function->arg_begin() + 1); + m_module = module.get(); // Initialize IR Builder - IRBuilder<> irb(BasicBlock::Create(m_context, "", m_function)); + IRBuilder<> irb(m_context); m_ir = &irb; + // Add entry function (contains only state/code check) + const auto main_func = llvm::cast(m_module->getOrInsertFunction(hash, get_type(), get_type(), get_type())); + set_function(main_func); + // Start compilation m_pos = func[0]; m_size = (func.size() - 1) * 4; - const u32 start = m_pos; - const u32 end = m_pos + m_size; - - m_stop = BasicBlock::Create(m_context, "", m_function); - - // Create instruction blocks - for (u32 i = 1, pos = start; i < func.size(); i++, pos += 4) - { - if (func[i] && m_block_info[pos / 4]) - { - m_instr_map.emplace(pos, BasicBlock::Create(m_context, "", m_function)); - } - } + const u32 start = m_pos * (g_cfg.core.spu_block_size != spu_block_size_type::giga); + const u32 end = start + m_size; update_pc(); const auto label_test = BasicBlock::Create(m_context, "", m_function); const auto label_diff = BasicBlock::Create(m_context, "", m_function); const auto label_body = BasicBlock::Create(m_context, "", m_function); + const auto label_stop = BasicBlock::Create(m_context, "", m_function); // Emit state check const auto pstate = spu_ptr(&SPUThread::state); - m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(pstate), m_ir->getInt32(0)), m_stop, label_test); + m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(pstate), m_ir->getInt32(0)), label_stop, label_test); // Emit code check + u32 check_iterations = 0; m_ir->SetInsertPoint(label_test); if (!g_cfg.core.spu_verification) @@ -1327,17 +1895,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } else if (func.size() - 1 == 1) { - const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(_ptr(m_lsptr, m_pos)), m_ir->getInt32(func[1])); + const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(_ptr(m_lsptr, start)), m_ir->getInt32(func[1])); m_ir->CreateCondBr(cond, label_diff, label_body); } else if (func.size() - 1 == 2) { - const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(_ptr(m_lsptr, m_pos)), m_ir->getInt64(static_cast(func[2]) << 32 | func[1])); + const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(_ptr(m_lsptr, start)), m_ir->getInt64(static_cast(func[2]) << 32 | func[1])); m_ir->CreateCondBr(cond, label_diff, label_body); } else { - const u32 starta = m_pos & -32; + const u32 starta = start & -32; const 
u32 enda = ::align(end, 32); const u32 sizea = (enda - starta) / 32; verify(HERE), sizea; @@ -1354,7 +1922,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { const u32 k = j + i * 4; - if (k < m_pos || k >= end || !func[(k - m_pos) / 4 + 1]) + if (k < start || k >= end || !func[(k - start) / 4 + 1]) { indices[i] = 8; holes = true; @@ -1387,11 +1955,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator for (u32 i = 0; i < 8; i++) { const u32 k = j + i * 4; - words[i] = k >= m_pos && k < end ? func[(k - m_pos) / 4 + 1] : 0; + words[i] = k >= start && k < end ? func[(k - start) / 4 + 1] : 0; } vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, words)); acc = acc ? m_ir->CreateOr(acc, vls) : vls; + check_iterations++; } // Pattern for PTEST @@ -1406,126 +1975,243 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->CreateCondBr(cond, label_diff, label_body); } - // Increase block counter - m_ir->SetInsertPoint(label_body); - const auto pbcount = spu_ptr(&SPUThread::block_counter); - m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbcount), m_ir->getInt64(1)), pbcount); + // Increase block counter with statistics + m_ir->SetInsertPoint(label_body); + const auto pbcount = spu_ptr(&SPUThread::block_counter); + m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbcount), m_ir->getInt64(check_iterations)), pbcount); + + // Call the entry function chunk + const auto entry_chunk = add_function(m_pos); + m_ir->CreateCall(entry_chunk, {m_thread, m_lsptr, m_ir->getInt32(0)})->setTailCall(); + m_ir->CreateRetVoid(); + + m_ir->SetInsertPoint(label_stop); + m_ir->CreateRetVoid(); + + m_ir->SetInsertPoint(label_diff); + + if (g_cfg.core.spu_verification) + { + const auto pbfail = spu_ptr(&SPUThread::block_failure); + m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbfail), m_ir->getInt64(1)), pbfail); + tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), m_ir->getInt32(0)); + } + else + { + m_ir->CreateUnreachable(); + } + + // Create function table (uninitialized) + m_function_table = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(entry_chunk->getType(), m_size / 4), true, llvm::GlobalValue::InternalLinkage, nullptr); - // Emit instructions - for (u32 i = 1; i < func.size(); i++) + // Disassemble if necessary + if (g_cfg.core.spu_debug) { - const u32 pos = start + (i - 1) * 4; - - if (g_cfg.core.spu_debug) + for (u32 i = 1; i < func.size(); i++) { + const u32 pos = start + (i - 1) * 4; + // Disasm dis_asm.dump_pc = pos; dis_asm.disasm(pos); - log += dis_asm.last_opcode; - log += '\n'; - } - // Get opcode - const u32 op = se_storage::swap(func[i]); - - if (!op) - { - // Ignore hole - if (!m_ir->GetInsertBlock()->getTerminator()) + if (func[i]) { - flush(); - branch_fixed(spu_branch_target(pos)); - LOG_ERROR(SPU, "Unexpected fallthrough to 0x%x", pos); + log += '>'; + log += dis_asm.last_opcode; + log += '\n'; + } + else + { + fmt::append(log, ">[%08x] xx xx xx xx: \n", pos); } - - continue; } + } - // Bind instruction label if necessary (TODO) - const auto found = m_instr_map.find(pos); + // Create function chunks + for (std::size_t fi = 0; fi < m_function_queue.size(); fi++) + { + // Initialize function info + m_entry = m_function_queue[fi]; + set_function(m_functions[m_entry]); + m_ir->CreateBr(add_block(m_entry)); - if (found != m_instr_map.end()) + // Emit instructions for basic blocks + for (std::size_t bi = 0; bi < m_block_queue.size(); bi++) { - if 
(!m_ir->GetInsertBlock()->getTerminator()) + // Initialize basic block info + const u32 baddr = m_block_queue[bi]; + m_block = &m_blocks[baddr]; + m_ir->SetInsertPoint(m_block->block); + + const auto pfound = m_preds.find(baddr); + + if (pfound != m_preds.end() && !pfound->second.empty()) { - flush(); - m_ir->CreateBr(found->second); - } + // Initialize registers and build PHI nodes if necessary + for (u32 i = 0; i < s_reg_max; i++) + { + // TODO: optimize + const u32 src = find_reg_origin(baddr, i); + + if (src >> 31) + { + // TODO: type + const auto _phi = m_ir->CreatePHI(get_type(), 0 - src); + m_block->phi[i] = _phi; + m_block->reg[i] = _phi; + + for (u32 pred : pfound->second) + { + // TODO: optimize + while (!m_block_info[pred / 4]) + { + pred -= 4; + } + + const auto bfound = m_blocks.find(pred); - m_ir->SetInsertPoint(found->second); + if (bfound != m_blocks.end() && bfound->second.block_end) + { + auto& value = bfound->second.reg[i]; - // Build state check if necessary (TODO: more conditions) - bool need_check_state = false; + if (!value || value->getType() != _phi->getType()) + { + const auto regptr = init_vr(i); + const auto cblock = m_ir->GetInsertBlock(); + m_ir->SetInsertPoint(bfound->second.block_end->getTerminator()); - const auto pfound = m_preds.find(pos); + if (!value) + { + // Value hasn't been loaded yet + value = m_ir->CreateLoad(regptr); + } - if (pfound != m_preds.end()) - { + // Value possibly needs a bitcast + value = m_ir->CreateBitCast(value, _phi->getType()); + + m_ir->SetInsertPoint(cblock); + + verify(HERE), bfound->second.block_end->getTerminator(); + } + + _phi->addIncoming(value, bfound->second.block_end); + } + } + + if (baddr == m_entry) + { + // Load value at the function chunk's entry block if necessary + const auto regptr = init_vr(i); + const auto cblock = m_ir->GetInsertBlock(); + m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); + const auto value = m_ir->CreateLoad(regptr); + m_ir->SetInsertPoint(cblock); + _phi->addIncoming(value, &m_function->getEntryBlock()); + } + } + else if (src != baddr) + { + // Passthrough static value or constant + const auto bfound = m_blocks.find(src); + + // TODO: error + if (bfound != m_blocks.end()) + { + m_block->reg[i] = bfound->second.reg[i]; + } + } + } + + // Emit state check if necessary (TODO: more conditions) for (u32 pred : pfound->second) { - if (pred >= pos) + if (pred >= baddr && bi > 0) { // If this block is a target of a backward branch (possibly loop), emit a check - need_check_state = true; + check_state(baddr); break; } } } - if (need_check_state) + // Emit instructions + for (m_pos = baddr; m_pos >= start && m_pos < end && !m_ir->GetInsertBlock()->getTerminator(); m_pos += 4) { - // Call cpu_thread::check_state if necessary and return or continue (full check) - const auto _body = BasicBlock::Create(m_context, "", m_function); - const auto check = BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_ir->CreateLoad(pstate), m_ir->getInt32(0)), _body, check); - m_ir->SetInsertPoint(check); - m_ir->CreateStore(m_ir->getInt32(pos), spu_ptr(&SPUThread::pc)); - m_ir->CreateCondBr(call(&check_state, m_thread), m_stop, _body); - m_ir->SetInsertPoint(_body); + if (m_pos != baddr && m_block_info[m_pos / 4]) + { + break; + } + + const u32 op = se_storage::swap(func[(m_pos - start) / 4 + 1]); + + if (!op) + { + LOG_ERROR(SPU, "Unexpected fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", m_pos, m_entry, m_function_queue[0]); + break; + } + + // Execute recompiler 
function (TODO) + (this->*g_decoder.decode(op))({op}); } - } - if (!m_ir->GetInsertBlock()->getTerminator()) - { - // Update position - m_pos = pos; + // Finalize block with fallthrough if necessary + if (!m_ir->GetInsertBlock()->getTerminator()) + { + const u32 target = m_pos == baddr ? baddr : m_pos & 0x3fffc; + + if (m_pos != baddr) + { + m_pos -= 4; + + if (target >= start && target < end && m_targets[m_pos].find_first_of(target) == -1) + { + LOG_ERROR(SPU, "Unregistered fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", target, m_entry, m_function_queue[0]); + } + } + + m_block->block_end = m_ir->GetInsertBlock(); + m_ir->CreateBr(add_block(target)); + } - // Execute recompiler function (TODO) - (this->*g_decoder.decode(op))({op}); + verify(HERE), m_block->block_end; } } - // Make fallthrough if necessary - if (!m_ir->GetInsertBlock()->getTerminator()) - { - flush(); - branch_fixed(spu_branch_target(end)); - } + // Initialize pass manager + legacy::FunctionPassManager pm(module.get()); - // - m_ir->SetInsertPoint(m_stop); - m_ir->CreateRetVoid(); + // Basic optimizations + pm.add(createEarlyCSEPass()); + pm.add(createAggressiveDCEPass()); + pm.add(createCFGSimplificationPass()); + pm.add(createDeadStoreEliminationPass()); + //pm.add(createLintPass()); // Check - m_ir->SetInsertPoint(label_diff); + for (const auto& func : m_functions) + { + pm.run(*func.second); + } - if (g_cfg.core.spu_verification) + if (m_function_table->getNumUses()) { - const auto pbfail = spu_ptr(&SPUThread::block_failure); - m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbfail), m_ir->getInt64(1)), pbfail); - tail(&spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), m_ir->getInt32(0)); + // TODO } else { - m_ir->CreateUnreachable(); + m_function_table->eraseFromParent(); } - // Clear context - m_gpr.fill({}); - m_flush_gpr.fill(0); - m_instr_map.clear(); + // Clear context (TODO) + m_blocks.clear(); + m_block_queue.clear(); + m_functions.clear(); + m_function_queue.clear(); + m_scan_queue.clear(); + m_function_table = nullptr; // Generate a dispatcher (übertrampoline) - std::vector addrv{start}; + std::vector addrv{func[0]}; const auto beg = m_spurt->m_map.lower_bound(addrv); addrv[0] += 4; const auto _end = m_spurt->m_map.lower_bound(addrv); @@ -1533,10 +2219,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (size0 > 1) { - const auto trampoline = cast(module->getOrInsertFunction(fmt::format("tr_0x%05x_%03u", start, size0), get_type(), get_type(), get_type())); - m_function = trampoline; - m_thread = &*m_function->arg_begin(); - m_lsptr = &*(m_function->arg_begin() + 1); + const auto trampoline = cast(module->getOrInsertFunction(fmt::format("spu-0x%05x-trampoline-%03u", func[0], size0), get_type(), get_type(), get_type())); + set_function(trampoline); struct work { @@ -1554,7 +2238,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator workload.back().level = 1; workload.back().beg = beg; workload.back().end = _end; - workload.back().label = llvm::BasicBlock::Create(m_context, "", m_function); + workload.back().label = m_ir->GetInsertBlock(); for (std::size_t i = 0; i < workload.size(); i++) { @@ -1566,9 +2250,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator llvm::BasicBlock* def{}; - while (true) + while (w.level < w.beg->first.size()) { const u32 x1 = w.beg->first.at(w.level); + + if (!x1) + { + // Cannot split: some functions contain holes at this level + w.level++; + continue; + } + auto it = w.beg; 
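
Since the splitting logic here is dense, a semantic model of what the finished trampoline computes may help. The sketch below is a deliberately naive linear scan over the same kind of sorted map, with invented type aliases; the generated code reaches the same answer through a tree of word comparisons, and a zero key word (the "hole" case skipped above) constrains nothing:

    #include <cstdint>
    #include <map>
    #include <vector>

    using func_key = std::vector<std::uint32_t>; // [0] = start address, [1..] = instruction words
    using spu_function_t = void (*)();

    // Semantic model of the generated dispatcher tree: select the compiled
    // function whose non-zero key words all match the words currently in
    // local storage at the start address. Zero key words match anything.
    inline spu_function_t select(const std::map<func_key, spu_function_t>& map, const std::uint32_t* ls)
    {
        for (const auto& [key, fn] : map)
        {
            bool match = true;

            for (std::size_t i = 1; i < key.size() && match; i++)
            {
                match = !key[i] || key[i] == ls[i - 1];
            }

            if (match)
            {
                return fn; // in the real trampoline: a tail call
            }
        }

        return nullptr; // no candidate matches: fall back to the recompiler
    }

The point of the tree shape (as opposed to this linear scan) is that each level inspects one local-storage word once, so dispatch cost grows with the depth of the tree rather than with the number of compiled variants.
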
auto it2 = it; u32 x = x1; @@ -1638,6 +2330,26 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } + if (!def && targets.empty()) + { + LOG_WARNING(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level); + m_ir->SetInsertPoint(w.label); + + if (const u64 fval = reinterpret_cast(w.beg->second)) + { + const auto ptr = m_ir->CreateIntToPtr(m_ir->getInt64(fval), main_func->getType()); + m_ir->CreateCall(ptr, {m_thread, m_lsptr})->setTailCall(); + } + else + { + verify(HERE, &w.beg->second == &fn_location); + m_ir->CreateCall(main_func, {m_thread, m_lsptr})->setTailCall(); + } + + m_ir->CreateRetVoid(); + continue; + } + if (!def) { def = llvm::BasicBlock::Create(m_context, "", m_function); @@ -1659,16 +2371,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } } - // Run some optimizations - //pm.run(*main_func); - spu_function_t fn{}, tr{}; raw_string_ostream out(log); if (g_cfg.core.spu_debug) { - fmt::append(log, "LLVM IR at 0x%x:\n", start); + fmt::append(log, "LLVM IR at 0x%x:\n", func[0]); out << *module; // print IR out << "\n\n"; } @@ -1676,14 +2385,20 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (verifyModule(*module, &out)) { out.flush(); - LOG_ERROR(SPU, "LLVM: Verification failed at 0x%x:\n%s", start, log); + LOG_ERROR(SPU, "LLVM: Verification failed at 0x%x:\n%s", func[0], log); + + if (g_cfg.core.spu_debug) + { + fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log); + } + fmt::raw_error("Compilation failed"); } if (g_cfg.core.spu_debug) { // Testing only - m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path); + m_spurt->m_jit.add(std::move(module), m_spurt->m_cache_path + "llvm/"); } else { @@ -1703,17 +2418,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator fn_location = fn; // Trampoline - m_spurt->m_dispatcher[start / 4] = tr; + m_spurt->m_dispatcher[func[0] / 4] = tr; - LOG_NOTICE(SPU, "[0x%x] Compiled: %p", start, fn); + LOG_NOTICE(SPU, "[0x%x] Compiled: %p", func[0], fn); if (tr != fn) - LOG_NOTICE(SPU, "[0x%x] T: %p", start, tr); + LOG_NOTICE(SPU, "[0x%x] T: %p", func[0], tr); if (g_cfg.core.spu_debug) { out.flush(); - fs::file(m_spurt->m_cache_path + "../spu.log", fs::write + fs::append).write(log); + fs::file(m_spurt->m_cache_path + "spu.log", fs::write + fs::append).write(log); } if (m_cache && g_cfg.core.spu_cache) @@ -1724,7 +2439,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator return fn; } - static bool check_state(SPUThread* _spu) + static bool exec_check_state(SPUThread* _spu) { return _spu->check_state(); } @@ -1741,7 +2456,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator template void fall(spu_opcode_t op) { - flush(); update_pc(); call(&exec_fall, m_thread, m_ir->getInt32(op.opcode)); } @@ -1753,31 +2467,38 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void UNK(spu_opcode_t op_unk) { - flush(); + m_block->block_end = m_ir->GetInsertBlock(); update_pc(); tail(&exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); } - static void exec_stop(SPUThread* _spu, u32 code) + static bool exec_stop(SPUThread* _spu, u32 code) { - if (_spu->stop_and_signal(code)) - { - _spu->pc += 4; - } + return _spu->stop_and_signal(code); } void STOP(spu_opcode_t op) // { - flush(); update_pc(); - tail(&exec_stop, m_thread, m_ir->getInt32(op.opcode)); + const auto succ = call(&exec_stop, m_thread, 
m_ir->getInt32(op.opcode & 0x3fff)); + const auto next = llvm::BasicBlock::Create(m_context, "", m_function); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(succ, next, stop); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); + m_ir->SetInsertPoint(next); + + if (g_cfg.core.spu_block_size == spu_block_size_type::safe) + { + m_block->block_end = m_ir->GetInsertBlock(); + m_ir->CreateStore(m_ir->getInt32(m_pos + 4), spu_ptr(&SPUThread::pc)); + m_ir->CreateRetVoid(); + } } void STOPD(spu_opcode_t op) // { - flush(); - update_pc(); - tail(&exec_stop, m_thread, m_ir->getInt32(0x3fff)); + STOP(spu_opcode_t{0x3fff}); } static s64 exec_rdch(SPUThread* _spu, u32 ch) @@ -1787,12 +2508,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void RDCH(spu_opcode_t op) // { - flush(); update_pc(); value_t res; res.value = call(&exec_rdch, m_thread, m_ir->getInt32(op.ra)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), m_stop, next); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(m_ir->CreateICmpSLT(res.value, m_ir->getInt64(0)), stop, next); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); m_ir->SetInsertPoint(next); set_vr(op.rt, insert(splat(0), 3, trunc(res))); } @@ -1816,11 +2539,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void WRCH(spu_opcode_t op) // { - flush(); update_pc(); const auto succ = call(&exec_wrch, m_thread, m_ir->getInt32(op.ra), extract(get_vr(op.rt), 3).value); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(succ, next, m_stop); + const auto stop = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(succ, next, stop); + m_ir->SetInsertPoint(stop); + m_ir->CreateRetVoid(); m_ir->SetInsertPoint(next); } @@ -1838,12 +2563,19 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { // This instruction must be used following a store instruction that modifies the instruction stream. m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); + + if (g_cfg.core.spu_block_size == spu_block_size_type::safe) + { + m_block->block_end = m_ir->GetInsertBlock(); + m_ir->CreateStore(m_ir->getInt32(m_pos + 4), spu_ptr(&SPUThread::pc)); + m_ir->CreateRetVoid(); + } } void DSYNC(spu_opcode_t op) // { // This instruction forces all earlier load, store, and channel instructions to complete before proceeding. 
- m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); + SYNC(op); } void MFSPR(spu_opcode_t op) // @@ -2936,7 +3668,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HGT(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) > extract(get_vr(op.rb), 3)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -2948,7 +3679,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HEQ(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) == extract(get_vr(op.rb), 3)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -2960,7 +3690,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HLGT(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) > extract(get_vr(op.rb), 3)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -2972,7 +3701,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HGTI(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) > op.si10); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -2984,7 +3712,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HEQI(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) == op.si10); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -2996,7 +3723,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void HLGTI(spu_opcode_t op) // { - flush(); const auto cond = eval(extract(get_vr(op.ra), 3) > op.si10); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); @@ -3030,158 +3756,210 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { _spu->interrupts_enabled = false; _spu->srr0 = addr; + + // Test for BR/BRA instructions (they are equivalent at zero pc) + const u32 br = _spu->_ref(0); + + if ((br & 0xfd80007f) == 0x30000000) + { + return (br >> 5) & 0x3fffc; + } + return 0; } return addr; } - void branch_indirect(spu_opcode_t op, value_t addr) + llvm::BasicBlock* add_block_indirect(spu_opcode_t op, value_t addr, bool ret = true) { - if (op.d) - { - m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&SPUThread::interrupts_enabled))->setVolatile(true); - } - else if (op.e) - { - addr.value = call(&exec_check_interrupts, m_thread, addr.value); - } - + // Convert an indirect branch into a static one if possible if (const auto _int = llvm::dyn_cast(addr.value)) { - LOG_WARNING(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, _int->getZExtValue()); - return branch_fixed(_int->getZExtValue()); - } + const u32 target = ::narrow(_int->getZExtValue(), HERE); - m_ir->CreateStore(addr.value, spu_ptr(&SPUThread::pc)); + LOG_WARNING(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, target); - const auto tfound = m_targets.find(m_pos); + if (!op.e && !op.d) + { + return add_block(target); + } - if (tfound != m_targets.end() && tfound->second.size() 
>= 3) - { - const u32 start = m_instr_map.begin()->first; + if (!m_entry_info[target / 4]) + { + LOG_ERROR(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, target); + } - const std::set targets(tfound->second.begin(), tfound->second.end()); + // Fixed branch excludes the possibility it's a function return (TODO) + ret = false; + } - const auto exter = llvm::BasicBlock::Create(m_context, "", m_function); + // Load stack addr if necessary + value_t sp; - const auto sw = m_ir->CreateSwitch(m_ir->CreateLShr(addr.value, 2, "", true), exter, m_size / 4); + if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) + { + sp = eval(extract(get_vr(1), 3) & 0x3fff0); + } - for (u32 pos = start; pos < start + m_size; pos += 4) - { - const auto found = m_instr_map.find(pos); + const auto cblock = m_ir->GetInsertBlock(); + const auto result = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->SetInsertPoint(result); - if (found != m_instr_map.end() && targets.count(pos)) - { - sw->addCase(m_ir->getInt32(pos / 4), found->second); - } - else - { - sw->addCase(m_ir->getInt32(pos / 4), m_stop); - } - } + if (op.e) + { + addr.value = call(&exec_check_interrupts, m_thread, addr.value); + } - m_ir->SetInsertPoint(exter); + if (op.d) + { + m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&SPUThread::interrupts_enabled))->setVolatile(true); } + m_ir->CreateStore(addr.value, spu_ptr(&SPUThread::pc)); const auto disp = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher))); const auto type = llvm::FunctionType::get(get_type(), {get_type(), get_type(), get_type()}, false)->getPointerTo()->getPointerTo(); - tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(disp, zext(addr << 1).value), type))); - } - void branch_fixed(u32 target) - { - const auto found = m_instr_map.find(target); + if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) + { + // Compare address stored in stack mirror with addr + const auto stack0 = eval(zext(sp) + ::offset32(&SPUThread::stack_mirror)); + const auto stack1 = eval(stack0 + 8); + const auto _ret = m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(m_thread, stack0.value), type)); + const auto link = m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(m_thread, stack1.value), get_type())); + const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); + const auto done = llvm::BasicBlock::Create(m_context, "", m_function); + m_ir->CreateCondBr(m_ir->CreateICmpEQ(zext(addr).value, link), done, fail); + m_ir->SetInsertPoint(done); + + // Clear stack mirror and return by tail call to the provided return address + m_ir->CreateStore(splat(-1).value, m_ir->CreateIntToPtr(m_ir->CreateAdd(m_thread, stack0.value), get_type()->getPointerTo())); + tail(_ret); + m_ir->SetInsertPoint(fail); + } + + llvm::Value* ptr = m_ir->CreateIntToPtr(m_ir->CreateAdd(disp, zext(addr << 1).value), type); - if (found != m_instr_map.end()) + if (g_cfg.core.spu_block_size != spu_block_size_type::safe) { - m_ir->CreateBr(found->second); - return; + // Try to load chunk address from the function table + } - m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&SPUThread::pc)); - const auto addr = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher) + target * 2)); - const auto type = llvm::FunctionType::get(get_type(), {get_type(), get_type(), get_type()}, false)->getPointerTo()->getPointerTo(); - const auto func = m_ir->CreateLoad(m_ir->CreateIntToPtr(addr, type)); - tail(func); + tail(m_ir->CreateLoad(ptr)); + 
m_ir->SetInsertPoint(cblock); + return result; } void BIZ(spu_opcode_t op) // { - flush(); + m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 3) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_indirect(op, addr); - m_ir->SetInsertPoint(next); + const auto target = add_block_indirect(op, addr); + m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); } void BINZ(spu_opcode_t op) // { - flush(); + m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 3) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_indirect(op, addr); - m_ir->SetInsertPoint(next); + const auto target = add_block_indirect(op, addr); + m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); } void BIHZ(spu_opcode_t op) // { - flush(); + m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 6) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_indirect(op, addr); - m_ir->SetInsertPoint(next); + const auto target = add_block_indirect(op, addr); + m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); } void BIHNZ(spu_opcode_t op) // { - flush(); + m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(get_vr(op.rt), 6) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_indirect(op, addr); - m_ir->SetInsertPoint(next); + const auto target = add_block_indirect(op, addr); + m_ir->CreateCondBr(cond.value, target, add_block(m_pos + 4)); } void BI(spu_opcode_t op) // { + m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - flush(); - branch_indirect(op, addr); + + // Create jump table if necessary (TODO) + const auto tfound = m_targets.find(m_pos); + + if (!op.d && !op.e && tfound != m_targets.end() && (tfound->second.size() != 1 || tfound->second[0] != -1)) + { + // Shift aligned address for switch + const auto sw_arg = m_ir->CreateLShr(addr.value, 2, "", true); + + // Initialize jump table targets + std::map targets; + + for (u32 target : tfound->second) + { + if (m_block_info[target / 4]) + { + targets.emplace(target, add_block(target)); + } + } + + // Get jump table bounds (optimization) + const u32 start = targets.begin()->first; + const u32 end = targets.rbegin()->first + 4; + + // Emit switch instruction aiming for a jumptable in the end (indirectbr could guarantee it) + const auto sw = m_ir->CreateSwitch(sw_arg, llvm::BasicBlock::Create(m_context, "", m_function), (end - start) / 4); + + for (u32 pos = start; pos < end; pos += 4) + { + if 
(m_block_info[pos / 4] && targets.count(pos)) + { + const auto found = targets.find(pos); + + if (found != targets.end()) + { + sw->addCase(m_ir->getInt32(pos / 4), found->second); + continue; + } + } + + sw->addCase(m_ir->getInt32(pos / 4), sw->getDefaultDest()); + } + + // Exit function on unexpected target + m_ir->SetInsertPoint(sw->getDefaultDest()); + m_ir->CreateStore(addr.value, spu_ptr(&SPUThread::pc)); + m_ir->CreateRetVoid(); + } + else + { + // Simple indirect branch + m_ir->CreateBr(add_block_indirect(op, addr)); + } } void BISL(spu_opcode_t op) // { + m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); - u32 values[4]{0, 0, 0, spu_branch_target(m_pos + 4)}; - value_t r; - r.value = llvm::ConstantDataVector::get(m_context, values); - set_vr(op.rt, r); - flush(); - branch_indirect(op, addr); + set_link(op); + m_ir->CreateBr(add_block_indirect(op, addr, false)); } void IRET(spu_opcode_t op) // { + m_block->block_end = m_ir->GetInsertBlock(); value_t srr0; srr0.value = m_ir->CreateLoad(spu_ptr(&SPUThread::srr0)); - flush(); - branch_indirect(op, srr0); + m_ir->CreateBr(add_block_indirect(op, srr0)); } void BISLED(spu_opcode_t op) // @@ -3193,76 +3971,48 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { const u32 target = spu_branch_target(m_pos, op.i16); - if (target == m_pos + 4) + if (target != m_pos + 4) { - return; + m_block->block_end = m_ir->GetInsertBlock(); + const auto cond = eval(extract(get_vr(op.rt), 3) == 0); + m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); } - - flush(); - const auto cond = eval(extract(get_vr(op.rt), 3) == 0); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_fixed(target); - m_ir->SetInsertPoint(next); } void BRNZ(spu_opcode_t op) // { const u32 target = spu_branch_target(m_pos, op.i16); - if (target == m_pos + 4) + if (target != m_pos + 4) { - return; + m_block->block_end = m_ir->GetInsertBlock(); + const auto cond = eval(extract(get_vr(op.rt), 3) != 0); + m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); } - - flush(); - const auto cond = eval(extract(get_vr(op.rt), 3) != 0); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_fixed(target); - m_ir->SetInsertPoint(next); } void BRHZ(spu_opcode_t op) // { const u32 target = spu_branch_target(m_pos, op.i16); - if (target == m_pos + 4) + if (target != m_pos + 4) { - return; + m_block->block_end = m_ir->GetInsertBlock(); + const auto cond = eval(extract(get_vr(op.rt), 6) == 0); + m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); } - - flush(); - const auto cond = eval(extract(get_vr(op.rt), 6) == 0); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_fixed(target); - m_ir->SetInsertPoint(next); } void BRHNZ(spu_opcode_t op) // { const u32 target = spu_branch_target(m_pos, op.i16); - if (target == m_pos + 4) + if (target != m_pos + 4) { - return; + m_block->block_end = m_ir->GetInsertBlock(); + const auto cond = 
eval(extract(get_vr(op.rt), 6) != 0); + m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4)); } - - flush(); - const auto cond = eval(extract(get_vr(op.rt), 6) != 0); - const auto next = llvm::BasicBlock::Create(m_context, "", m_function); - const auto jump = llvm::BasicBlock::Create(m_context, "", m_function); - m_ir->CreateCondBr(cond.value, jump, next); - m_ir->SetInsertPoint(jump); - branch_fixed(target); - m_ir->SetInsertPoint(next); } void BRA(spu_opcode_t op) // @@ -3271,17 +4021,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (target != m_pos + 4) { - flush(); - branch_fixed(target); + m_block->block_end = m_ir->GetInsertBlock(); + m_ir->CreateBr(add_block(target)); } } void BRASL(spu_opcode_t op) // { - u32 values[4]{0, 0, 0, spu_branch_target(m_pos + 4)}; - value_t r; - r.value = llvm::ConstantDataVector::get(m_context, values); - set_vr(op.rt, r); + set_link(op); BRA(op); } @@ -3291,18 +4038,33 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator if (target != m_pos + 4) { - flush(); - branch_fixed(target); + m_block->block_end = m_ir->GetInsertBlock(); + m_ir->CreateBr(add_block(target)); } } void BRSL(spu_opcode_t op) // + { + set_link(op); + BR(op); + } + + void set_link(spu_opcode_t op) { u32 values[4]{0, 0, 0, spu_branch_target(m_pos + 4)}; value_t r; r.value = llvm::ConstantDataVector::get(m_context, values); set_vr(op.rt, r); - BR(op); + + if (g_cfg.core.spu_block_size != spu_block_size_type::safe && m_block_info[m_pos / 4 + 1] && m_entry_info[m_pos / 4 + 1]) + { + // Store the return function chunk address at the stack mirror + const auto func = add_function(m_pos + 4); + const auto stack0 = eval(zext(extract(get_vr(1), 3) & 0x3fff0) + ::offset32(&SPUThread::stack_mirror)); + const auto stack1 = eval(stack0 + 8); + m_ir->CreateStore(func, m_ir->CreateIntToPtr(m_ir->CreateAdd(m_thread, stack0.value), func->getType()->getPointerTo())); + m_ir->CreateStore(m_ir->getInt64(m_pos + 4), m_ir->CreateIntToPtr(m_ir->CreateAdd(m_thread, stack1.value), get_type())); + } } static const spu_decoder g_decoder; diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index d73401358759..18730933ff55 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -45,9 +45,15 @@ class spu_recompiler_base // List of possible targets for the instruction ({} = next instruction, {-1} = no targets) std::unordered_map, value_hash> m_targets; - // List of block predecessors (incomplete, doesn't include all fallthrough predecessors) + // List of block predecessors std::unordered_map, value_hash> m_preds; + // List of function entry points and return points (set after BRSL, BRASL, BISL, BISLED) + std::bitset<0x10000> m_entry_info; + + // Compressed address of unique entry point for each instruction + std::array m_entry_map{}; + std::shared_ptr m_cache; private: @@ -82,4 +88,7 @@ class spu_recompiler_base // Create recompiler instance (LLVM) static std::unique_ptr make_llvm_recompiler(); + + // Max number of registers (for m_regmod) + static constexpr u8 s_reg_max = 128; }; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index 7391d8afd305..c3bf3ac33180 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -527,6 +527,8 @@ void SPUThread::cpu_task() jit_dispatcher[pc / 4](*this, vm::_ptr(offset), nullptr); } + // Print some stats + LOG_NOTICE(SPU, "Stats: block %u (fails: %u);", block_counter, block_failure); return; }
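
To summarize the header additions, here is a condensed sketch of the per-instruction analysis state and how call return points (after BRSL/BRASL/BISL/BISLED) could be marked as chunk entries. The element type of m_entry_map is not visible in this hunk, so u32 below is an assumption, and the helper methods are illustrative rather than the recompiler's actual interface:

    #include <array>
    #include <bitset>
    #include <cstdint>

    struct analysis_state
    {
        // One bit per possible instruction slot in the 256 KiB local storage.
        std::bitset<0x10000> entry_info;

        // Entry point owning each instruction (element type assumed).
        std::array<std::uint32_t, 0x10000> entry_map{};

        // Mark a function entry or call return point.
        void add_entry(std::uint32_t addr)
        {
            entry_info[addr / 4] = true;
            entry_map[addr / 4] = addr;
        }

        // True if a chunk may legitimately begin at this address; used above
        // to validate fixed branch targets and to place stack predictions.
        bool is_entry(std::uint32_t addr) const
        {
            return entry_info[addr / 4];
        }
    };
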