Mirror of https://github.com/hrydgard/ppsspp.git (synced 2024-11-23 13:30:02 +00:00)
Add new IR optimization pass, OptimizeLoadsAfterStores
Commit bd0beb68a4 (parent da88011805)
@@ -284,6 +284,7 @@ void IRFrontend::DoJit(u32 em_address, std::vector<IRInst> &instructions, u32 &m
 		&PropagateConstants,
 		&PurgeTemps,
 		&ReduceVec4Flush,
+		&OptimizeLoadsAfterStores,
 		// &ReorderLoadStore,
 		// &MergeLoadStore,
 		// &ThreeOpToTwoOp,
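For orientation, here is a hedged toy sketch of how a function-pointer pass list like the one above gets applied: each pass reads the current instruction stream and writes a rewritten one, so the one-line addition of &OptimizeLoadsAfterStores is presumably all the wiring the new pass needs. Writer, PassFunc and ApplyPasses below are simplified stand-ins, not PPSSPP's real IRWriter/pass-runner API.

// Toy model of a pass pipeline driven by an array of function pointers.
#include <cstdio>
#include <utility>
#include <vector>

struct Inst { int op = 0; };

struct Writer {
	std::vector<Inst> insts;
	void Write(const Inst &i) { insts.push_back(i); }
};

typedef bool (*PassFunc)(const Writer &in, Writer &out);

// Example pass: copies everything through unchanged.
bool CopyPass(const Writer &in, Writer &out) {
	for (const Inst &i : in.insts)
		out.Write(i);
	return false;  // mirrors the "logBlocks" style of return used by the real passes
}

// Run each pass in sequence, feeding the output of one into the next.
void ApplyPasses(const PassFunc *passes, int count, Writer &code) {
	for (int p = 0; p < count; p++) {
		Writer next;
		passes[p](code, next);
		code = std::move(next);
	}
}

int main() {
	const PassFunc passes[] = { &CopyPass, &CopyPass };
	Writer code;
	code.Write(Inst{1});
	code.Write(Inst{2});
	ApplyPasses(passes, 2, code);
	printf("%d instructions after passes\n", (int)code.insts.size());
}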
@@ -255,15 +255,19 @@ void IRJit::RunLoopUntil(u64 globalticks) {
 		u32 opcode = inst & 0xFF000000;
 		if (opcode == MIPS_EMUHACK_OPCODE) {
 			u32 offset = inst & 0x00FFFFFF;  // Alternatively, inst - opcode
+			const IRInst *instPtr = blocks_.GetArenaPtr() + offset;
+			_dbg_assert_(instPtr->op == IROp::Downcount);
+			mips->downcount -= instPtr->constant;
+			instPtr++;
 #ifdef IR_PROFILING
 			IRBlock *block = blocks_.GetBlock(blocks_.GetBlockNumFromOffset(offset));
 			TimeSpan span;
-			mips->pc = IRInterpret(mips, blocks_.GetArenaPtr() + offset);
+			mips->pc = IRInterpret(mips, instPtr);
 			int64_t elapsedNanos = span.ElapsedNanos();
 			block->profileStats_.executions += 1;
 			block->profileStats_.totalNanos += elapsedNanos;
 #else
-			mips->pc = IRInterpret(mips, blocks_.GetArenaPtr() + offset);
+			mips->pc = IRInterpret(mips, instPtr);
 #endif
 			// Note: this will "jump to zero" on a badly constructed block missing exits.
 			if (!Memory::IsValid4AlignedAddress(mips->pc)) {
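A minimal standalone sketch of the dispatch change above, relying only on what the hunk itself asserts (each block's IR begins with a Downcount instruction): the dispatcher now subtracts that instruction's constant itself and hands the interpreter the instruction right after it. The IROp/IRInst/MIPSState types and the toy IRInterpret below are reduced stand-ins, not PPSSPP's real definitions.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

enum class IROp : uint8_t { Downcount, ExitToConst /* toy subset */ };

struct IRInst {
	IROp op;
	uint32_t constant;
};

struct MIPSState {
	int downcount = 1000;
	uint32_t pc = 0;
};

// Toy interpreter: just returns the target of the first exit it sees.
uint32_t IRInterpret(MIPSState *mips, const IRInst *inst) {
	(void)mips;
	while (inst->op != IROp::ExitToConst)
		inst++;
	return inst->constant;
}

int main() {
	std::vector<IRInst> arena = {
		{ IROp::Downcount, 12 },           // block "charges" 12 cycles up front
		{ IROp::ExitToConst, 0x08804000 }  // then exits to a constant PC
	};

	MIPSState mipsState;
	MIPSState *mips = &mipsState;

	const IRInst *instPtr = arena.data();       // blocks_.GetArenaPtr() + offset in the real code
	assert(instPtr->op == IROp::Downcount);     // the _dbg_assert_ in the real code
	mips->downcount -= (int)instPtr->constant;  // the dispatcher consumes the Downcount itself
	instPtr++;
	mips->pc = IRInterpret(mips, instPtr);      // the interpreter starts right after it

	printf("pc=%08x downcount=%d\n", mips->pc, mips->downcount);
}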
@@ -2150,6 +2150,45 @@ bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts) {
 	return logBlocks;
 }
 
+// This optimizes away redundant loads-after-stores, which are surprisingly not that uncommon.
+bool OptimizeLoadsAfterStores(const IRWriter &in, IRWriter &out, const IROptions &opts) {
+	CONDITIONAL_DISABLE;
+	// This tells us to skip an AND op that has been optimized out.
+	// Maybe we could skip multiple, but that'd slow things down and is pretty uncommon.
+	int nextSkip = -1;
+
+	bool logBlocks = false;
+	for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) {
+		IRInst inst = in.GetInstructions()[i];
+
+		// Just copy the last instruction.
+		if (i == n - 1) {
+			out.Write(inst);
+			break;
+		}
+
+		out.Write(inst);
+
+		IRInst next = in.GetInstructions()[i + 1];
+		switch (inst.op) {
+		case IROp::Store32:
+			if (next.op == IROp::Load32 &&
+				next.constant == inst.constant &&
+				next.dest == inst.src3 &&
+				next.src1 == inst.src1) {
+				// The upcoming load is completely redundant.
+				// Skip it.
+				i++;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	return logBlocks;
+}
+
 bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts) {
 	CONDITIONAL_DISABLE;
 	// This tells us to skip an AND op that has been optimized out.
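To illustrate the pattern the new pass targets: a Store32 immediately followed by a Load32 of the same base register and offset, into the register whose value was just stored, reloads a value that register already holds, so the load can be dropped. The sketch below mirrors the filter above on a toy instruction list; IRInst here is a simplified stand-in and OptimizeLoadsAfterStoresToy is a hypothetical helper, not the real IRWriter-based pass.

#include <cstdint>
#include <cstdio>
#include <vector>

enum class IROp : uint8_t { Store32, Load32, Add /* toy subset */ };

struct IRInst {
	IROp op;
	uint8_t dest;       // for loads: destination register
	uint8_t src1;       // base address register
	uint8_t src3;       // for stores: the value register
	uint32_t constant;  // address offset
};

std::vector<IRInst> OptimizeLoadsAfterStoresToy(const std::vector<IRInst> &in) {
	std::vector<IRInst> out;
	for (size_t i = 0; i < in.size(); i++) {
		const IRInst &inst = in[i];
		out.push_back(inst);
		if (i + 1 < in.size() && inst.op == IROp::Store32) {
			const IRInst &next = in[i + 1];
			if (next.op == IROp::Load32 && next.constant == inst.constant &&
				next.dest == inst.src3 && next.src1 == inst.src1) {
				i++;  // the reload of the value we just stored is redundant; skip it
			}
		}
	}
	return out;
}

int main() {
	// Roughly: sw r5, 16(r29); lw r5, 16(r29) -- the second op reloads what was just stored.
	std::vector<IRInst> block = {
		{ IROp::Store32, 0, 29, 5, 16 },
		{ IROp::Load32,  5, 29, 0, 16 },
		{ IROp::Add,     7,  5, 0,  0 },
	};
	std::vector<IRInst> optimized = OptimizeLoadsAfterStoresToy(block);
	printf("before: %d insts, after: %d insts\n", (int)block.size(), (int)optimized.size());
}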
@@ -17,4 +17,5 @@ bool MergeLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts);
 bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &opts);
 bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts);
 
+bool OptimizeLoadsAfterStores(const IRWriter &in, IRWriter &out, const IROptions &opts);
 bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts);