From 85ee7c85c1095d7fd0da4d41f3e7f3983bb7aff4 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 6 Aug 2023 12:05:51 -0700
Subject: [PATCH 1/5] irjit: Allow masked vneg.q.

---
 Core/MIPS/IR/IRCompFPU.cpp  | 3 ++-
 Core/MIPS/IR/IRCompVFPU.cpp | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp
index b0edeea0cf..3c3ed87d03 100644
--- a/Core/MIPS/IR/IRCompFPU.cpp
+++ b/Core/MIPS/IR/IRCompFPU.cpp
@@ -226,7 +226,8 @@ void IRFrontend::Comp_mxc1(MIPSOpcode op) {
 			UpdateRoundingMode();
 			ApplyRoundingMode();
 		} else {
-			Comp_Generic(op);
+			// Maybe not strictly invalid?  But likely invalid.
+			INVALIDOP;
 		}
 		return;
 	default:
diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp
index c902951d96..c80fa6e1ff 100644
--- a/Core/MIPS/IR/IRCompVFPU.cpp
+++ b/Core/MIPS/IR/IRCompVFPU.cpp
@@ -782,6 +782,10 @@ namespace MIPSComp {
 		if (optype == 0) {
 			if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op))
 				DISABLE;
+		} else if (optype == 1 || optype == 2) {
+			// D prefix is fine for these, and used sometimes.
+			if (js.HasUnknownPrefix() || js.HasSPrefix())
+				DISABLE;
 		} else {
 			// Many of these apply the D prefix strangely or override parts of the S prefix.
 			if (!js.HasNoPrefix())

From 79ca880ac70ba2f3848329be6e266c2f5e416fec Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 6 Aug 2023 12:06:16 -0700
Subject: [PATCH 2/5] irjit: Implement vqmul, add Vec4Blend.

Should be useful more places.
---
 Core/MIPS/IR/IRCompVFPU.cpp      | 69 ++++++++++++++++++++++++--------
 Core/MIPS/IR/IRInst.cpp          |  9 ++++-
 Core/MIPS/IR/IRInst.h            |  3 +-
 Core/MIPS/IR/IRInterpreter.cpp   |  8 +++-
 Core/MIPS/IR/IRNativeCommon.cpp  |  1 +
 Core/MIPS/IR/IRPassSimplify.cpp  |  1 +
 Core/MIPS/RiscV/RiscVCompVec.cpp |  8 ++++
 7 files changed, 78 insertions(+), 21 deletions(-)

diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp
index c80fa6e1ff..fe97084d65 100644
--- a/Core/MIPS/IR/IRCompVFPU.cpp
+++ b/Core/MIPS/IR/IRCompVFPU.cpp
@@ -1727,16 +1727,16 @@ namespace MIPSComp {
 		GetVectorRegs(tregs, sz, _VT);
 		GetVectorRegs(dregs, sz, _VD);
 
-		u8 tempregs[4]{};
-		for (int i = 0; i < n; ++i) {
-			if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {
-				tempregs[i] = IRVTEMP_PFX_T + i;   // using IRTEMP0 for other things
-			} else {
-				tempregs[i] = dregs[i];
-			}
-		}
-
 		if (sz == V_Triple) {
+			u8 tempregs[4]{};
+			for (int i = 0; i < n; ++i) {
+				if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) {
+					tempregs[i] = IRVTEMP_PFX_T + i;   // using IRTEMP0 for other things
+				} else {
+					tempregs[i] = dregs[i];
+				}
+			}
+
 			int temp0 = IRVTEMP_0;
 			int temp1 = IRVTEMP_0 + 1;
 			// Compute X
@@ -1753,15 +1753,50 @@ namespace MIPSComp {
 			ir.Write(IROp::FMul, temp0, sregs[0], tregs[1]);
 			ir.Write(IROp::FMul, temp1, sregs[1], tregs[0]);
 			ir.Write(IROp::FSub, tempregs[2], temp0, temp1);
-		} else if (sz == V_Quad) {
-			DISABLE;
-		} else {
-			DISABLE;
-		}
 
-		for (int i = 0; i < n; i++) {
-			if (tempregs[i] != dregs[i])
-				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
+			for (int i = 0; i < n; i++) {
+				if (tempregs[i] != dregs[i])
+					ir.Write(IROp::FMov, dregs[i], tempregs[i]);
+			}
+		} else if (sz == V_Quad) {
+			// Rather than using vdots, we organize this as SIMD multiplies and adds.
+			// That means flipping the logic column-wise.  Also, luckily no prefix temps used.
+			if (!IsConsecutive4(sregs) || !IsConsecutive4(tregs) || !IsConsecutive4(dregs)) {
+				DISABLE;
+			}
+
+			auto shuffleImm = [](int x, int y, int z, int w) { return x | (y << 2) | (z << 4) | (w << 6); };
+			auto blendConst = [](int x, int y, int z, int w) { return x | (y << 1) | (z << 2) | (w << 3); };
+
+			// Prepare some negatives.
+			ir.Write(IROp::Vec4Neg, IRVTEMP_0, tregs[0]);
+
+			// tmp = S[x,x,x,x] * T[w,-z,y,-x]
+			ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(0, 0, 0, 0));
+			ir.Write(IRInst{ IROp::Vec4Blend, IRVTEMP_PFX_T, tregs[0], IRVTEMP_0, blendConst(1, 0, 1, 0) });
+			ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_T, shuffleImm(3, 2, 1, 0));
+			ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_D, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
+
+			// tmp += S[y,y,y,y] * T[z,w,-x,-y]
+			ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(1, 1, 1, 1));
+			ir.Write(IRInst{ IROp::Vec4Blend, IRVTEMP_PFX_T, tregs[0], IRVTEMP_0, blendConst(1, 1, 0, 0) });
+			ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_T, shuffleImm(2, 3, 0, 1));
+			ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
+			ir.Write(IROp::Vec4Add, IRVTEMP_PFX_D, IRVTEMP_PFX_D, IRVTEMP_PFX_S);
+
+			// tmp += S[z,z,z,z] * T[-y,x,w,-z]
+			ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(2, 2, 2, 2));
+			ir.Write(IRInst{ IROp::Vec4Blend, IRVTEMP_PFX_T, tregs[0], IRVTEMP_0, blendConst(0, 1, 1, 0) });
+			ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_T, shuffleImm(1, 0, 3, 2));
+			ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, IRVTEMP_PFX_T);
+			ir.Write(IROp::Vec4Add, IRVTEMP_PFX_D, IRVTEMP_PFX_D, IRVTEMP_PFX_S);
+
+			// tmp += S[w,w,w,w] * T[x,y,z,w]
+			ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(3, 3, 3, 3));
+			ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, tregs[0]);
+			ir.Write(IROp::Vec4Add, dregs[0], IRVTEMP_PFX_D, IRVTEMP_PFX_S);
+		} else {
+			INVALIDOP;
 		}
 	}
 
diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp
index d0e0fe1d74..5cdfc3824a 100644
--- a/Core/MIPS/IR/IRInst.cpp
+++ b/Core/MIPS/IR/IRInst.cpp
@@ -126,6 +126,7 @@ static const IRMeta irMeta[] = {
 	{ IROp::FCmpVfpuAggregate, "FCmpVfpuAggregate", "I" },
 	{ IROp::Vec4Init, "Vec4Init", "Vv" },
 	{ IROp::Vec4Shuffle, "Vec4Shuffle", "VVs" },
+	{ IROp::Vec4Blend, "Vec4Blend", "VVVC" },
 	{ IROp::Vec4Mov, "Vec4Mov", "VV" },
 	{ IROp::Vec4Add, "Vec4Add", "VVV" },
 	{ IROp::Vec4Sub, "Vec4Sub", "VVV" },
@@ -328,14 +329,20 @@ void DisassembleIR(char *buf, size_t bufsize, IRInst inst) {
 	char bufDst[16];
 	char bufSrc1[16];
 	char bufSrc2[16];
+	// Only really used for constant.
+	char bufSrc3[16];
 	DisassembleParam(bufDst, sizeof(bufDst) - 2, inst.dest, meta->types[0], inst.constant);
 	DisassembleParam(bufSrc1, sizeof(bufSrc1) - 2, inst.src1, meta->types[1], inst.constant);
 	DisassembleParam(bufSrc2, sizeof(bufSrc2), inst.src2, meta->types[2], inst.constant);
+	DisassembleParam(bufSrc3, sizeof(bufSrc3), inst.src3, meta->types[3], inst.constant);
 	if (meta->types[1] && meta->types[0] != '_') {
 		strcat(bufDst, ", ");
 	}
 	if (meta->types[2] && meta->types[1] != '_') {
 		strcat(bufSrc1, ", ");
 	}
-	snprintf(buf, bufsize, "%s %s%s%s", meta->name, bufDst, bufSrc1, bufSrc2);
+	if (meta->types[3] && meta->types[2] != '_') {
+		strcat(bufSrc2, ", ");
+	}
+	snprintf(buf, bufsize, "%s %s%s%s%s", meta->name, bufDst, bufSrc1, bufSrc2, bufSrc3);
 }
diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h
index 795dc75233..f626204791 100644
--- a/Core/MIPS/IR/IRInst.h
+++ b/Core/MIPS/IR/IRInst.h
@@ -164,6 +164,7 @@ enum class IROp : u8 {
 	// support SIMD.
 	Vec4Init,
 	Vec4Shuffle,
+	Vec4Blend,
 	Vec4Mov,
 	Vec4Add,
 	Vec4Sub,
@@ -330,7 +331,7 @@ enum IRFlags {
 struct IRMeta {
 	IROp op;
 	const char *name;
-	const char types[4];  // GGG
+	const char types[5];  // GGG
 	u32 flags;
 };
 
diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp
index aee5c500e8..407f790255 100644
--- a/Core/MIPS/IR/IRInterpreter.cpp
+++ b/Core/MIPS/IR/IRInterpreter.cpp
@@ -304,13 +304,17 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
 		}
 
 		case IROp::Vec4Shuffle:
-		{
 			// Can't use the SSE shuffle here because it takes an immediate. pshufb with a table would work though,
 			// or a big switch - there are only 256 shuffles possible (4^4)
 			for (int i = 0; i < 4; i++)
 				mips->f[inst->dest + i] = mips->f[inst->src1 + ((inst->src2 >> (i * 2)) & 3)];
 			break;
-		}
+
+		case IROp::Vec4Blend:
+			// Could use _mm_blendv_ps (SSE4+BMI), vbslq_f32 (ARM), __riscv_vmerge_vvm (RISC-V)
+			for (int i = 0; i < 4; i++)
+				mips->f[inst->dest + i] = ((inst->constant >> i) & 1) ? mips->f[inst->src2 + i] : mips->f[inst->src1 + i];
+			break;
 
 		case IROp::Vec4Mov:
 		{
diff --git a/Core/MIPS/IR/IRNativeCommon.cpp b/Core/MIPS/IR/IRNativeCommon.cpp
index eb56319d75..39d8cca511 100644
--- a/Core/MIPS/IR/IRNativeCommon.cpp
+++ b/Core/MIPS/IR/IRNativeCommon.cpp
@@ -299,6 +299,7 @@ void IRNativeBackend::CompileIRInst(IRInst inst) {
 
 	case IROp::Vec4Init:
 	case IROp::Vec4Shuffle:
+	case IROp::Vec4Blend:
 	case IROp::Vec4Mov:
 		CompIR_VecAssign(inst);
 		break;
diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp
index c595d26ef9..b5258bd8be 100644
--- a/Core/MIPS/IR/IRPassSimplify.cpp
+++ b/Core/MIPS/IR/IRPassSimplify.cpp
@@ -750,6 +750,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out, const IROptions &opts
 		case IROp::Vec4Dot:
 		case IROp::Vec4Scale:
 		case IROp::Vec4Shuffle:
+		case IROp::Vec4Blend:
 		case IROp::Vec4Neg:
 		case IROp::Vec4Abs:
 		case IROp::Vec4Pack31To8:
diff --git a/Core/MIPS/RiscV/RiscVCompVec.cpp b/Core/MIPS/RiscV/RiscVCompVec.cpp
index 08739152ae..0f3db33c1c 100644
--- a/Core/MIPS/RiscV/RiscVCompVec.cpp
+++ b/Core/MIPS/RiscV/RiscVCompVec.cpp
@@ -117,6 +117,14 @@ void RiscVJitBackend::CompIR_VecAssign(IRInst inst) {
 		}
 		break;
 
+	case IROp::Vec4Blend:
+		fpr.Map4DirtyInIn(inst.dest, inst.src1, inst.src2);
+		for (int i = 0; i < 4; ++i) {
+			int which = (inst.constant >> i) & 1;
+			FMV(32, fpr.R(inst.dest + i), fpr.R((which ? inst.src2 : inst.src1) + i));
+		}
+		break;
+
 	case IROp::Vec4Mov:
 		fpr.Map4DirtyIn(inst.dest, inst.src1);
 		for (int i = 0; i < 4; ++i)

From 2b964fd3b058fc30a2597da122615669b22efac2 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 6 Aug 2023 12:50:14 -0700
Subject: [PATCH 3/5] irjit: Handle more common Vec4 prefix cases.

---
 Core/MIPS/IR/IRCompVFPU.cpp | 48 +++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp
index fe97084d65..1435ef07c4 100644
--- a/Core/MIPS/IR/IRCompVFPU.cpp
+++ b/Core/MIPS/IR/IRCompVFPU.cpp
@@ -212,6 +212,54 @@ namespace MIPSComp {
 				ir.Write(IROp::Vec4Shuffle, vregs[0], origV[0], prefix);
 				return;
 			}
+
+			if ((prefix & 0x000FF000) == 0x0000F000) {
+				// Handle some easy and common cases.
+				Vec4Init init = Vec4Init::AllZERO;
+				bool useInit;
+				switch (prefix & 0xFFF) {
+				case 0x00: useInit = true; init = Vec4Init::AllZERO; break;
+				case 0x01: useInit = true; init = Vec4Init::Set_1000; break;
+				case 0x04: useInit = true; init = Vec4Init::Set_0100; break;
+				case 0x10: useInit = true; init = Vec4Init::Set_0010; break;
+				case 0x40: useInit = true; init = Vec4Init::Set_0001; break;
+				case 0x55: useInit = true; init = Vec4Init::AllONE; break;
+				default: useInit = false; break;
+				}
+
+				if (useInit) {
+					InitRegs(vregs, tempReg);
+					ir.Write(IROp::Vec4Init, vregs[0], (int)init);
+					return;
+				}
+			}
+
+			// Check if we're just zeroing certain lanes - this is common.
+			u32 zeroedLanes = 0;
+			for (int i = 0; i < 4; ++i) {
+				int regnum = (prefix >> (i * 2)) & 3;
+				int abs = (prefix >> (8 + i)) & 1;
+				int negate = (prefix >> (16 + i)) & 1;
+				int constants = (prefix >> (12 + i)) & 1;
+
+				if (!constants && regnum == i && !abs && !negate)
+					continue;
+				if (constants && regnum == 0 && abs == 0 && !negate) {
+					zeroedLanes |= 1 << i;
+					continue;
+				}
+
+				// Nope, it has something else going on.
+				zeroedLanes = -1;
+				break;
+			}
+
+			if (zeroedLanes != -1) {
+				InitRegs(vregs, tempReg);
+				ir.Write(IROp::Vec4Init, vregs[0], (int)Vec4Init::AllZERO);
+				ir.Write({ IROp::Vec4Blend, vregs[0], origV[0], vregs[0], zeroedLanes });
+				return;
+			}
 		}
 
 		// Alright, fall back to the generic approach.

From 6a1dbd4cde8c937c0c5ce7d6c98e8799aa72328d Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 6 Aug 2023 13:07:17 -0700
Subject: [PATCH 4/5] irjit: Allow Vec4 to be used with masks.

---
 Core/MIPS/IR/IRCompVFPU.cpp | 71 +++++++++++++++++++++++++------------
 Core/MIPS/IR/IRFrontend.h   |  3 +-
 2 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp
index 1435ef07c4..56d3a5d477 100644
--- a/Core/MIPS/IR/IRCompVFPU.cpp
+++ b/Core/MIPS/IR/IRCompVFPU.cpp
@@ -330,6 +330,13 @@ namespace MIPSComp {
 		if (js.prefixD == 0)
 			return;
 
+		if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0) {
+			// Use temps for all, we'll blend in the end (keeping in Vec4.)
+			for (int i = 0; i < 4; ++i)
+				regs[i] = IRVTEMP_PFX_D + i;
+			return;
+		}
+
 		for (int i = 0; i < n; i++) {
 			// Hopefully this is rare, we'll just write it into a dumping ground reg.
 			if (js.VfpuWriteMask(i))
@@ -343,11 +350,13 @@ namespace MIPSComp {
 
 	// "D" prefix is really a post process. No need to allocate a temporary register (except
 	// dummies to simulate writemask, which is done in GetVectorRegsPrefixD
-	void IRFrontend::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
+	void IRFrontend::ApplyPrefixD(u8 *vregs, VectorSize sz, int vectorReg) {
 		_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
 		if (!js.prefixD)
 			return;
 
+		ApplyPrefixDMask(vregs, sz, vectorReg);
+
 		int n = GetNumVectorElements(sz);
 		for (int i = 0; i < n; i++) {
 			if (js.VfpuWriteMask(i))
@@ -362,6 +371,20 @@ namespace MIPSComp {
 		}
 	}
 
+	void IRFrontend::ApplyPrefixDMask(u8 *vregs, VectorSize sz, int vectorReg) {
+		if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0) {
+			u8 origV[4];
+			GetVectorRegs(origV, sz, vectorReg);
+
+			// Just keep the original values where it was masked.
+			ir.Write({ IROp::Vec4Blend, origV[0], vregs[0], origV[0], js.VfpuWriteMask() });
+
+			// So that saturate works, change it back.
+			for (int i = 0; i < 4; ++i)
+				vregs[i] = origV[i];
+		}
+	}
+
 	void IRFrontend::Comp_SV(MIPSOpcode op) {
 		CONDITIONAL_DISABLE(LSU_VFPU);
 		s32 offset = (signed short)(op & 0xFFFC);
@@ -458,7 +481,7 @@ namespace MIPSComp {
 				ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f));
 			}
 		}
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, vd);
 	}
 
 	void IRFrontend::Comp_VIdt(MIPSOpcode op) {
@@ -497,7 +520,7 @@ namespace MIPSComp {
 			}
 		}
 
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, vd);
 	}
 
 	void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) {
@@ -579,7 +602,7 @@ namespace MIPSComp {
 		}
 
 		ir.Write(IROp::FMov, dregs[0], IRVTEMP_0);
-		ApplyPrefixD(dregs, V_Single);
+		ApplyPrefixD(dregs, V_Single, vd);
 	}
 
 	alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };
@@ -619,7 +642,7 @@ namespace MIPSComp {
 			break;
 		}
 
-		ApplyPrefixD(dregs, V_Single);
+		ApplyPrefixD(dregs, V_Single, _VD);
 	}
 
 	void IRFrontend::Comp_VDot(MIPSOpcode op) {
@@ -646,7 +669,7 @@ namespace MIPSComp {
 
 		if (IsVec4(sz, sregs) && IsVec4(sz, tregs) && IsOverlapSafe(dregs[0], n, sregs, n, tregs)) {
 			ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]);
-			ApplyPrefixD(dregs, V_Single);
+			ApplyPrefixD(dregs, V_Single, vd);
 			return;
 		}
 
@@ -657,7 +680,7 @@ namespace MIPSComp {
 			ir.Write(IROp::FMul, temp1, sregs[i], tregs[i]);
 			ir.Write(IROp::FAdd, i == (n - 1) ? dregs[0] : temp0, temp0, temp1);
 		}
-		ApplyPrefixD(dregs, V_Single);
+		ApplyPrefixD(dregs, V_Single, vd);
 	}
 
 	void IRFrontend::Comp_VecDo3(MIPSOpcode op) {
@@ -772,7 +795,7 @@ namespace MIPSComp {
 			} else {
 				DISABLE;
 			}
-			ApplyPrefixD(dregs, sz);
+			ApplyPrefixD(dregs, sz, _VD);
 			return;
 		}
 
@@ -821,7 +844,7 @@ namespace MIPSComp {
 			}
 		}
 
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, _VD);
 	}
 
 	void IRFrontend::Comp_VV2Op(MIPSOpcode op) {
@@ -899,7 +922,7 @@ namespace MIPSComp {
 				ir.Write(IROp::Vec4Neg, dregs[0], sregs[0]);
 				break;
 			}
-			ApplyPrefixD(dregs, sz);
+			ApplyPrefixD(dregs, sz, vd);
 			return;
 		}
 
@@ -967,7 +990,7 @@ namespace MIPSComp {
 			}
 		}
 
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, vd);
 	}
 
 	void IRFrontend::Comp_Vi2f(MIPSOpcode op) {
@@ -994,7 +1017,7 @@ namespace MIPSComp {
 			else
 				ir.Write(IROp::FCvtScaledSW, dregs[i], sregs[i], imm);
 		}
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, _VD);
 	}
 
 	void IRFrontend::Comp_Vh2f(MIPSOpcode op) {
@@ -1060,6 +1083,8 @@ namespace MIPSComp {
 				}
 			}
 		}
+
+		ApplyPrefixDMask(dregs, sz, _VD);
 	}
 
 	void IRFrontend::Comp_Mftv(MIPSOpcode op) {
@@ -1310,7 +1335,7 @@ namespace MIPSComp {
 		if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) {
 			if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) {
 				ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg);
-				ApplyPrefixD(dregs, sz);
+				ApplyPrefixD(dregs, sz, vd);
 				return;
 			}
 		}
@@ -1326,7 +1351,7 @@ namespace MIPSComp {
 			}
 		}
 
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, vd);
 	}
 
 	/*
@@ -1653,7 +1678,7 @@ namespace MIPSComp {
 			}
 		}
 
-		ApplyPrefixD(dregs, outsize);
+		ApplyPrefixD(dregs, outsize, _VD);
 	}
 
 	void IRFrontend::Comp_Vx2i(MIPSOpcode op) {
@@ -1752,7 +1777,7 @@ namespace MIPSComp {
 				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
 			}
 		}
-		ApplyPrefixD(dregs, outsize);
+		ApplyPrefixD(dregs, outsize, _VD);
 	}
 
 	void IRFrontend::Comp_VCrossQuat(MIPSOpcode op) {
@@ -1909,7 +1934,7 @@ namespace MIPSComp {
 				ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (i) | ((!tf) << 7));
 			}
 		}
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, _VD);
 	}
 
 	void IRFrontend::Comp_Viim(MIPSOpcode op) {
@@ -1924,7 +1949,7 @@ namespace MIPSComp {
 		u8 dreg;
 		GetVectorRegsPrefixD(&dreg, V_Single, _VT);
 		ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat((float)imm));
-		ApplyPrefixD(&dreg, V_Single);
+		ApplyPrefixD(&dreg, V_Single, _VT);
 	}
 
 	void IRFrontend::Comp_Vfim(MIPSOpcode op) {
@@ -1942,7 +1967,7 @@ namespace MIPSComp {
 		u8 dreg;
 		GetVectorRegsPrefixD(&dreg, V_Single, _VT);
 		ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat(fval.f));
-		ApplyPrefixD(&dreg, V_Single);
+		ApplyPrefixD(&dreg, V_Single, _VT);
 	}
 
 	void IRFrontend::Comp_Vcst(MIPSOpcode op) {
@@ -1964,7 +1989,7 @@ namespace MIPSComp {
 		for (int i = 0; i < n; i++) {
 			ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum]));
 		}
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, vd);
 	}
 
 	// Very heavily used by FF:CC. Should be replaced by a fast approximation instead of
@@ -2075,7 +2100,7 @@ namespace MIPSComp {
 			}
 		}
 
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, _VD);
 	}
 
 	void IRFrontend::Comp_Vocp(MIPSOpcode op) {
@@ -2121,7 +2146,7 @@ namespace MIPSComp {
 			}
 		}
 
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, _VD);
 	}
 
 	void IRFrontend::Comp_ColorConv(MIPSOpcode op) {
@@ -2190,6 +2215,6 @@ namespace MIPSComp {
 				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
 		}
 
-		ApplyPrefixD(dregs, sz);
+		ApplyPrefixD(dregs, sz, _VD);
 	}
 }
diff --git a/Core/MIPS/IR/IRFrontend.h b/Core/MIPS/IR/IRFrontend.h
index b53e053d73..4ae7b4a095 100644
--- a/Core/MIPS/IR/IRFrontend.h
+++ b/Core/MIPS/IR/IRFrontend.h
@@ -126,7 +126,8 @@ private:
 	void CompShiftVar(MIPSOpcode op, IROp shiftType);
 
 	void ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz, int tempReg);
-	void ApplyPrefixD(const u8 *vregs, VectorSize sz);
+	void ApplyPrefixD(u8 *vregs, VectorSize sz, int vectorReg);
+	void ApplyPrefixDMask(u8 *vregs, VectorSize sz, int vectorReg);
 	void GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg);
 	void GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg);
 	void GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg);

From 3dc71cff750c898ce537f7e4672e874034daf903 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 6 Aug 2023 13:32:31 -0700
Subject: [PATCH 5/5] irjit: Keep a couple more ops in Vec4.

---
 Core/MIPS/IR/IRCompVFPU.cpp | 48 +++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp
index 56d3a5d477..b04bbe5f6f 100644
--- a/Core/MIPS/IR/IRCompVFPU.cpp
+++ b/Core/MIPS/IR/IRCompVFPU.cpp
@@ -1917,6 +1917,10 @@ namespace MIPSComp {
 		int tf = (op >> 19) & 1;
 		int imm3 = (op >> 16) & 7;
 
+		if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) {
+			// TODO: Could do a VfpuCC variant of Vec4Blend.
+		}
+
 		for (int i = 0; i < n; ++i) {
 			// Simplification: Disable if overlap unsafe
 			if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
@@ -1986,8 +1990,18 @@ namespace MIPSComp {
 
 		u8 dregs[4];
 		GetVectorRegsPrefixD(dregs, sz, vd);
-		for (int i = 0; i < n; i++) {
-			ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum]));
+
+		if (IsVec4(sz, dregs)) {
+			ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum]));
+			ir.Write(IROp::Vec4Shuffle, dregs[0], IRVTEMP_0, 0);
+		} else {
+			for (int i = 0; i < n; i++) {
+				// Most of the time, materializing a float is slower than copying from another float.
+				if (i == 0)
+					ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum]));
+				else
+					ir.Write(IROp::FMov, dregs[i], dregs[0]);
+			}
 		}
 		ApplyPrefixD(dregs, sz, vd);
 	}
@@ -2128,21 +2142,25 @@ namespace MIPSComp {
 		GetVectorRegsPrefixT(tregs, sz, _VS);
 		GetVectorRegsPrefixD(dregs, sz, _VD);
 
-		u8 tempregs[4];
-		for (int i = 0; i < n; ++i) {
-			if (!IsOverlapSafe(dregs[i], n, sregs)) {
-				tempregs[i] = IRVTEMP_0 + i;
-			} else {
-				tempregs[i] = dregs[i];
+		if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {
+			ir.Write(IROp::Vec4Add, dregs[0], tregs[0], sregs[0]);
+		} else {
+			u8 tempregs[4];
+			for (int i = 0; i < n; ++i) {
+				if (!IsOverlapSafe(dregs[i], n, sregs)) {
+					tempregs[i] = IRVTEMP_0 + i;
+				} else {
+					tempregs[i] = dregs[i];
+				}
 			}
-		}
 
-		for (int i = 0; i < n; ++i) {
-			ir.Write(IROp::FAdd, tempregs[i], tregs[i], sregs[i]);
-		}
-		for (int i = 0; i < n; ++i) {
-			if (dregs[i] != tempregs[i]) {
-				ir.Write(IROp::FMov, dregs[i], tempregs[i]);
+			for (int i = 0; i < n; ++i) {
+				ir.Write(IROp::FAdd, tempregs[i], tregs[i], sregs[i]);
+			}
+			for (int i = 0; i < n; ++i) {
+				if (dregs[i] != tempregs[i]) {
+					ir.Write(IROp::FMov, dregs[i], tempregs[i]);
+				}
 			}
 		}