From 85ee7c85c1095d7fd0da4d41f3e7f3983bb7aff4 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 6 Aug 2023 12:05:51 -0700 Subject: [PATCH 1/5] irjit: Allow masked vneg.q. --- Core/MIPS/IR/IRCompFPU.cpp | 3 ++- Core/MIPS/IR/IRCompVFPU.cpp | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Core/MIPS/IR/IRCompFPU.cpp b/Core/MIPS/IR/IRCompFPU.cpp index b0edeea0cf..3c3ed87d03 100644 --- a/Core/MIPS/IR/IRCompFPU.cpp +++ b/Core/MIPS/IR/IRCompFPU.cpp @@ -226,7 +226,8 @@ void IRFrontend::Comp_mxc1(MIPSOpcode op) { UpdateRoundingMode(); ApplyRoundingMode(); } else { - Comp_Generic(op); + // Maybe not strictly invalid? But likely invalid. + INVALIDOP; } return; default: diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index c902951d96..c80fa6e1ff 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -782,6 +782,10 @@ namespace MIPSComp { if (optype == 0) { if (js.HasUnknownPrefix() || !IsPrefixWithinSize(js.prefixS, op)) DISABLE; + } else if (optype == 1 || optype == 2) { + // D prefix is fine for these, and used sometimes. + if (js.HasUnknownPrefix() || js.HasSPrefix()) + DISABLE; } else { // Many of these apply the D prefix strangely or override parts of the S prefix. if (!js.HasNoPrefix()) From 79ca880ac70ba2f3848329be6e266c2f5e416fec Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 6 Aug 2023 12:06:16 -0700 Subject: [PATCH 2/5] irjit: Implement vqmul, add Vec4Blend. Should be useful more places. --- Core/MIPS/IR/IRCompVFPU.cpp | 69 ++++++++++++++++++++++++-------- Core/MIPS/IR/IRInst.cpp | 9 ++++- Core/MIPS/IR/IRInst.h | 3 +- Core/MIPS/IR/IRInterpreter.cpp | 8 +++- Core/MIPS/IR/IRNativeCommon.cpp | 1 + Core/MIPS/IR/IRPassSimplify.cpp | 1 + Core/MIPS/RiscV/RiscVCompVec.cpp | 8 ++++ 7 files changed, 78 insertions(+), 21 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index c80fa6e1ff..fe97084d65 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -1727,16 +1727,16 @@ namespace MIPSComp { GetVectorRegs(tregs, sz, _VT); GetVectorRegs(dregs, sz, _VD); - u8 tempregs[4]{}; - for (int i = 0; i < n; ++i) { - if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) { - tempregs[i] = IRVTEMP_PFX_T + i; // using IRTEMP0 for other things - } else { - tempregs[i] = dregs[i]; - } - } - if (sz == V_Triple) { + u8 tempregs[4]{}; + for (int i = 0; i < n; ++i) { + if (!IsOverlapSafe(dregs[i], n, sregs, n, tregs)) { + tempregs[i] = IRVTEMP_PFX_T + i; // using IRTEMP0 for other things + } else { + tempregs[i] = dregs[i]; + } + } + int temp0 = IRVTEMP_0; int temp1 = IRVTEMP_0 + 1; // Compute X @@ -1753,15 +1753,50 @@ namespace MIPSComp { ir.Write(IROp::FMul, temp0, sregs[0], tregs[1]); ir.Write(IROp::FMul, temp1, sregs[1], tregs[0]); ir.Write(IROp::FSub, tempregs[2], temp0, temp1); - } else if (sz == V_Quad) { - DISABLE; - } else { - DISABLE; - } - for (int i = 0; i < n; i++) { - if (tempregs[i] != dregs[i]) - ir.Write(IROp::FMov, dregs[i], tempregs[i]); + for (int i = 0; i < n; i++) { + if (tempregs[i] != dregs[i]) + ir.Write(IROp::FMov, dregs[i], tempregs[i]); + } + } else if (sz == V_Quad) { + // Rather than using vdots, we organize this as SIMD multiplies and adds. + // That means flipping the logic column-wise. Also, luckily no prefix temps used. + if (!IsConsecutive4(sregs) || !IsConsecutive4(tregs) || !IsConsecutive4(dregs)) { + DISABLE; + } + + auto shuffleImm = [](int x, int y, int z, int w) { return x | (y << 2) | (z << 4) | (w << 6); }; + auto blendConst = [](int x, int y, int z, int w) { return x | (y << 1) | (z << 2) | (w << 3); }; + + // Prepare some negatives. + ir.Write(IROp::Vec4Neg, IRVTEMP_0, tregs[0]); + + // tmp = S[x,x,x,x] * T[w,-z,y,-x] + ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(0, 0, 0, 0)); + ir.Write(IRInst{ IROp::Vec4Blend, IRVTEMP_PFX_T, tregs[0], IRVTEMP_0, blendConst(1, 0, 1, 0) }); + ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_T, shuffleImm(3, 2, 1, 0)); + ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_D, IRVTEMP_PFX_S, IRVTEMP_PFX_T); + + // tmp += S[y,y,y,y] * T[z,w,-x,-y] + ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(1, 1, 1, 1)); + ir.Write(IRInst{ IROp::Vec4Blend, IRVTEMP_PFX_T, tregs[0], IRVTEMP_0, blendConst(1, 1, 0, 0) }); + ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_T, shuffleImm(2, 3, 0, 1)); + ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, IRVTEMP_PFX_T); + ir.Write(IROp::Vec4Add, IRVTEMP_PFX_D, IRVTEMP_PFX_D, IRVTEMP_PFX_S); + + // tmp += S[z,z,z,z] * T[-y,x,w,-z] + ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(2, 2, 2, 2)); + ir.Write(IRInst{ IROp::Vec4Blend, IRVTEMP_PFX_T, tregs[0], IRVTEMP_0, blendConst(0, 1, 1, 0) }); + ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, IRVTEMP_PFX_T, shuffleImm(1, 0, 3, 2)); + ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, IRVTEMP_PFX_T); + ir.Write(IROp::Vec4Add, IRVTEMP_PFX_D, IRVTEMP_PFX_D, IRVTEMP_PFX_S); + + // tmp += S[w,w,w,w] * T[x,y,z,w] + ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], shuffleImm(3, 3, 3, 3)); + ir.Write(IROp::Vec4Mul, IRVTEMP_PFX_S, IRVTEMP_PFX_S, tregs[0]); + ir.Write(IROp::Vec4Add, dregs[0], IRVTEMP_PFX_D, IRVTEMP_PFX_S); + } else { + INVALIDOP; } } diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index d0e0fe1d74..5cdfc3824a 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -126,6 +126,7 @@ static const IRMeta irMeta[] = { { IROp::FCmpVfpuAggregate, "FCmpVfpuAggregate", "I" }, { IROp::Vec4Init, "Vec4Init", "Vv" }, { IROp::Vec4Shuffle, "Vec4Shuffle", "VVs" }, + { IROp::Vec4Blend, "Vec4Blend", "VVVC" }, { IROp::Vec4Mov, "Vec4Mov", "VV" }, { IROp::Vec4Add, "Vec4Add", "VVV" }, { IROp::Vec4Sub, "Vec4Sub", "VVV" }, @@ -328,14 +329,20 @@ void DisassembleIR(char *buf, size_t bufsize, IRInst inst) { char bufDst[16]; char bufSrc1[16]; char bufSrc2[16]; + // Only really used for constant. + char bufSrc3[16]; DisassembleParam(bufDst, sizeof(bufDst) - 2, inst.dest, meta->types[0], inst.constant); DisassembleParam(bufSrc1, sizeof(bufSrc1) - 2, inst.src1, meta->types[1], inst.constant); DisassembleParam(bufSrc2, sizeof(bufSrc2), inst.src2, meta->types[2], inst.constant); + DisassembleParam(bufSrc3, sizeof(bufSrc3), inst.src3, meta->types[3], inst.constant); if (meta->types[1] && meta->types[0] != '_') { strcat(bufDst, ", "); } if (meta->types[2] && meta->types[1] != '_') { strcat(bufSrc1, ", "); } - snprintf(buf, bufsize, "%s %s%s%s", meta->name, bufDst, bufSrc1, bufSrc2); + if (meta->types[3] && meta->types[2] != '_') { + strcat(bufSrc2, ", "); + } + snprintf(buf, bufsize, "%s %s%s%s%s", meta->name, bufDst, bufSrc1, bufSrc2, bufSrc3); } diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h index 795dc75233..f626204791 100644 --- a/Core/MIPS/IR/IRInst.h +++ b/Core/MIPS/IR/IRInst.h @@ -164,6 +164,7 @@ enum class IROp : u8 { // support SIMD. Vec4Init, Vec4Shuffle, + Vec4Blend, Vec4Mov, Vec4Add, Vec4Sub, @@ -330,7 +331,7 @@ enum IRFlags { struct IRMeta { IROp op; const char *name; - const char types[4]; // GGG + const char types[5]; // GGG u32 flags; }; diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index aee5c500e8..407f790255 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -304,13 +304,17 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { } case IROp::Vec4Shuffle: - { // Can't use the SSE shuffle here because it takes an immediate. pshufb with a table would work though, // or a big switch - there are only 256 shuffles possible (4^4) for (int i = 0; i < 4; i++) mips->f[inst->dest + i] = mips->f[inst->src1 + ((inst->src2 >> (i * 2)) & 3)]; break; - } + + case IROp::Vec4Blend: + // Could use _mm_blendv_ps (SSE4+BMI), vbslq_f32 (ARM), __riscv_vmerge_vvm (RISC-V) + for (int i = 0; i < 4; i++) + mips->f[inst->dest + i] = ((inst->constant >> i) & 1) ? mips->f[inst->src2 + i] : mips->f[inst->src1 + i]; + break; case IROp::Vec4Mov: { diff --git a/Core/MIPS/IR/IRNativeCommon.cpp b/Core/MIPS/IR/IRNativeCommon.cpp index eb56319d75..39d8cca511 100644 --- a/Core/MIPS/IR/IRNativeCommon.cpp +++ b/Core/MIPS/IR/IRNativeCommon.cpp @@ -299,6 +299,7 @@ void IRNativeBackend::CompileIRInst(IRInst inst) { case IROp::Vec4Init: case IROp::Vec4Shuffle: + case IROp::Vec4Blend: case IROp::Vec4Mov: CompIR_VecAssign(inst); break; diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp index c595d26ef9..b5258bd8be 100644 --- a/Core/MIPS/IR/IRPassSimplify.cpp +++ b/Core/MIPS/IR/IRPassSimplify.cpp @@ -750,6 +750,7 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out, const IROptions &opts case IROp::Vec4Dot: case IROp::Vec4Scale: case IROp::Vec4Shuffle: + case IROp::Vec4Blend: case IROp::Vec4Neg: case IROp::Vec4Abs: case IROp::Vec4Pack31To8: diff --git a/Core/MIPS/RiscV/RiscVCompVec.cpp b/Core/MIPS/RiscV/RiscVCompVec.cpp index 08739152ae..0f3db33c1c 100644 --- a/Core/MIPS/RiscV/RiscVCompVec.cpp +++ b/Core/MIPS/RiscV/RiscVCompVec.cpp @@ -117,6 +117,14 @@ void RiscVJitBackend::CompIR_VecAssign(IRInst inst) { } break; + case IROp::Vec4Blend: + fpr.Map4DirtyInIn(inst.dest, inst.src1, inst.src2); + for (int i = 0; i < 4; ++i) { + int which = (inst.constant >> i) & 1; + FMV(32, fpr.R(inst.dest + i), fpr.R((which ? inst.src2 : inst.src1) + i)); + } + break; + case IROp::Vec4Mov: fpr.Map4DirtyIn(inst.dest, inst.src1); for (int i = 0; i < 4; ++i) From 2b964fd3b058fc30a2597da122615669b22efac2 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 6 Aug 2023 12:50:14 -0700 Subject: [PATCH 3/5] irjit: Handle more common Vec4 prefix cases. --- Core/MIPS/IR/IRCompVFPU.cpp | 48 +++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index fe97084d65..1435ef07c4 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -212,6 +212,54 @@ namespace MIPSComp { ir.Write(IROp::Vec4Shuffle, vregs[0], origV[0], prefix); return; } + + if ((prefix & 0x000FF000) == 0x0000F000) { + // Handle some easy and common cases. + Vec4Init init = Vec4Init::AllZERO; + bool useInit; + switch (prefix & 0xFFF) { + case 0x00: useInit = true; init = Vec4Init::AllZERO; break; + case 0x01: useInit = true; init = Vec4Init::Set_1000; break; + case 0x04: useInit = true; init = Vec4Init::Set_0100; break; + case 0x10: useInit = true; init = Vec4Init::Set_0010; break; + case 0x40: useInit = true; init = Vec4Init::Set_0001; break; + case 0x55: useInit = true; init = Vec4Init::AllONE; break; + default: useInit = false; break; + } + + if (useInit) { + InitRegs(vregs, tempReg); + ir.Write(IROp::Vec4Init, vregs[0], (int)init); + return; + } + } + + // Check if we're just zeroing certain lanes - this is common. + u32 zeroedLanes = 0; + for (int i = 0; i < 4; ++i) { + int regnum = (prefix >> (i * 2)) & 3; + int abs = (prefix >> (8 + i)) & 1; + int negate = (prefix >> (16 + i)) & 1; + int constants = (prefix >> (12 + i)) & 1; + + if (!constants && regnum == i && !abs && !negate) + continue; + if (constants && regnum == 0 && abs == 0 && !negate) { + zeroedLanes |= 1 << i; + continue; + } + + // Nope, it has something else going on. + zeroedLanes = -1; + break; + } + + if (zeroedLanes != -1) { + InitRegs(vregs, tempReg); + ir.Write(IROp::Vec4Init, vregs[0], (int)Vec4Init::AllZERO); + ir.Write({ IROp::Vec4Blend, vregs[0], origV[0], vregs[0], zeroedLanes }); + return; + } } // Alright, fall back to the generic approach. From 6a1dbd4cde8c937c0c5ce7d6c98e8799aa72328d Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 6 Aug 2023 13:07:17 -0700 Subject: [PATCH 4/5] irjit: Allow Vec4 to be used with masks. --- Core/MIPS/IR/IRCompVFPU.cpp | 71 +++++++++++++++++++++++++------------ Core/MIPS/IR/IRFrontend.h | 3 +- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 1435ef07c4..56d3a5d477 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -330,6 +330,13 @@ namespace MIPSComp { if (js.prefixD == 0) return; + if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0) { + // Use temps for all, we'll blend in the end (keeping in Vec4.) + for (int i = 0; i < 4; ++i) + regs[i] = IRVTEMP_PFX_D + i; + return; + } + for (int i = 0; i < n; i++) { // Hopefully this is rare, we'll just write it into a dumping ground reg. if (js.VfpuWriteMask(i)) @@ -343,11 +350,13 @@ namespace MIPSComp { // "D" prefix is really a post process. No need to allocate a temporary register (except // dummies to simulate writemask, which is done in GetVectorRegsPrefixD - void IRFrontend::ApplyPrefixD(const u8 *vregs, VectorSize sz) { + void IRFrontend::ApplyPrefixD(u8 *vregs, VectorSize sz, int vectorReg) { _assert_(js.prefixDFlag & JitState::PREFIX_KNOWN); if (!js.prefixD) return; + ApplyPrefixDMask(vregs, sz, vectorReg); + int n = GetNumVectorElements(sz); for (int i = 0; i < n; i++) { if (js.VfpuWriteMask(i)) @@ -362,6 +371,20 @@ namespace MIPSComp { } } + void IRFrontend::ApplyPrefixDMask(u8 *vregs, VectorSize sz, int vectorReg) { + if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0) { + u8 origV[4]; + GetVectorRegs(origV, sz, vectorReg); + + // Just keep the original values where it was masked. + ir.Write({ IROp::Vec4Blend, origV[0], vregs[0], origV[0], js.VfpuWriteMask() }); + + // So that saturate works, change it back. + for (int i = 0; i < 4; ++i) + vregs[i] = origV[i]; + } + } + void IRFrontend::Comp_SV(MIPSOpcode op) { CONDITIONAL_DISABLE(LSU_VFPU); s32 offset = (signed short)(op & 0xFFFC); @@ -458,7 +481,7 @@ namespace MIPSComp { ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(type == 6 ? 0.0f : 1.0f)); } } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, vd); } void IRFrontend::Comp_VIdt(MIPSOpcode op) { @@ -497,7 +520,7 @@ namespace MIPSComp { } } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, vd); } void IRFrontend::Comp_VMatrixInit(MIPSOpcode op) { @@ -579,7 +602,7 @@ namespace MIPSComp { } ir.Write(IROp::FMov, dregs[0], IRVTEMP_0); - ApplyPrefixD(dregs, V_Single); + ApplyPrefixD(dregs, V_Single, vd); } alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f }; @@ -619,7 +642,7 @@ namespace MIPSComp { break; } - ApplyPrefixD(dregs, V_Single); + ApplyPrefixD(dregs, V_Single, _VD); } void IRFrontend::Comp_VDot(MIPSOpcode op) { @@ -646,7 +669,7 @@ namespace MIPSComp { if (IsVec4(sz, sregs) && IsVec4(sz, tregs) && IsOverlapSafe(dregs[0], n, sregs, n, tregs)) { ir.Write(IROp::Vec4Dot, dregs[0], sregs[0], tregs[0]); - ApplyPrefixD(dregs, V_Single); + ApplyPrefixD(dregs, V_Single, vd); return; } @@ -657,7 +680,7 @@ namespace MIPSComp { ir.Write(IROp::FMul, temp1, sregs[i], tregs[i]); ir.Write(IROp::FAdd, i == (n - 1) ? dregs[0] : temp0, temp0, temp1); } - ApplyPrefixD(dregs, V_Single); + ApplyPrefixD(dregs, V_Single, vd); } void IRFrontend::Comp_VecDo3(MIPSOpcode op) { @@ -772,7 +795,7 @@ namespace MIPSComp { } else { DISABLE; } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, _VD); return; } @@ -821,7 +844,7 @@ namespace MIPSComp { } } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, _VD); } void IRFrontend::Comp_VV2Op(MIPSOpcode op) { @@ -899,7 +922,7 @@ namespace MIPSComp { ir.Write(IROp::Vec4Neg, dregs[0], sregs[0]); break; } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, vd); return; } @@ -967,7 +990,7 @@ namespace MIPSComp { } } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, vd); } void IRFrontend::Comp_Vi2f(MIPSOpcode op) { @@ -994,7 +1017,7 @@ namespace MIPSComp { else ir.Write(IROp::FCvtScaledSW, dregs[i], sregs[i], imm); } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, _VD); } void IRFrontend::Comp_Vh2f(MIPSOpcode op) { @@ -1060,6 +1083,8 @@ namespace MIPSComp { } } } + + ApplyPrefixDMask(dregs, sz, _VD); } void IRFrontend::Comp_Mftv(MIPSOpcode op) { @@ -1310,7 +1335,7 @@ namespace MIPSComp { if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) { if (!overlap || (vs == vd && IsOverlapSafe(treg, n, dregs))) { ir.Write(IROp::Vec4Scale, dregs[0], sregs[0], treg); - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, vd); return; } } @@ -1326,7 +1351,7 @@ namespace MIPSComp { } } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, vd); } /* @@ -1653,7 +1678,7 @@ namespace MIPSComp { } } - ApplyPrefixD(dregs, outsize); + ApplyPrefixD(dregs, outsize, _VD); } void IRFrontend::Comp_Vx2i(MIPSOpcode op) { @@ -1752,7 +1777,7 @@ namespace MIPSComp { ir.Write(IROp::FMov, dregs[i], tempregs[i]); } } - ApplyPrefixD(dregs, outsize); + ApplyPrefixD(dregs, outsize, _VD); } void IRFrontend::Comp_VCrossQuat(MIPSOpcode op) { @@ -1909,7 +1934,7 @@ namespace MIPSComp { ir.Write(IROp::FCmovVfpuCC, dregs[i], sregs[i], (i) | ((!tf) << 7)); } } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, _VD); } void IRFrontend::Comp_Viim(MIPSOpcode op) { @@ -1924,7 +1949,7 @@ namespace MIPSComp { u8 dreg; GetVectorRegsPrefixD(&dreg, V_Single, _VT); ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat((float)imm)); - ApplyPrefixD(&dreg, V_Single); + ApplyPrefixD(&dreg, V_Single, _VT); } void IRFrontend::Comp_Vfim(MIPSOpcode op) { @@ -1942,7 +1967,7 @@ namespace MIPSComp { u8 dreg; GetVectorRegsPrefixD(&dreg, V_Single, _VT); ir.Write(IROp::SetConstF, dreg, ir.AddConstantFloat(fval.f)); - ApplyPrefixD(&dreg, V_Single); + ApplyPrefixD(&dreg, V_Single, _VT); } void IRFrontend::Comp_Vcst(MIPSOpcode op) { @@ -1964,7 +1989,7 @@ namespace MIPSComp { for (int i = 0; i < n; i++) { ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum])); } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, vd); } // Very heavily used by FF:CC. Should be replaced by a fast approximation instead of @@ -2075,7 +2100,7 @@ namespace MIPSComp { } } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, _VD); } void IRFrontend::Comp_Vocp(MIPSOpcode op) { @@ -2121,7 +2146,7 @@ namespace MIPSComp { } } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, _VD); } void IRFrontend::Comp_ColorConv(MIPSOpcode op) { @@ -2190,6 +2215,6 @@ namespace MIPSComp { ir.Write(IROp::FMov, dregs[i], tempregs[i]); } - ApplyPrefixD(dregs, sz); + ApplyPrefixD(dregs, sz, _VD); } } diff --git a/Core/MIPS/IR/IRFrontend.h b/Core/MIPS/IR/IRFrontend.h index b53e053d73..4ae7b4a095 100644 --- a/Core/MIPS/IR/IRFrontend.h +++ b/Core/MIPS/IR/IRFrontend.h @@ -126,7 +126,8 @@ private: void CompShiftVar(MIPSOpcode op, IROp shiftType); void ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz, int tempReg); - void ApplyPrefixD(const u8 *vregs, VectorSize sz); + void ApplyPrefixD(u8 *vregs, VectorSize sz, int vectorReg); + void ApplyPrefixDMask(u8 *vregs, VectorSize sz, int vectorReg); void GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg); void GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg); void GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg); From 3dc71cff750c898ce537f7e4672e874034daf903 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 6 Aug 2023 13:32:31 -0700 Subject: [PATCH 5/5] irjit: Keep a couple more ops in Vec4. --- Core/MIPS/IR/IRCompVFPU.cpp | 48 +++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/Core/MIPS/IR/IRCompVFPU.cpp b/Core/MIPS/IR/IRCompVFPU.cpp index 56d3a5d477..b04bbe5f6f 100644 --- a/Core/MIPS/IR/IRCompVFPU.cpp +++ b/Core/MIPS/IR/IRCompVFPU.cpp @@ -1917,6 +1917,10 @@ namespace MIPSComp { int tf = (op >> 19) & 1; int imm3 = (op >> 16) & 7; + if (IsVec4(sz, sregs) && IsVec4(sz, dregs)) { + // TODO: Could do a VfpuCC variant of Vec4Blend. + } + for (int i = 0; i < n; ++i) { // Simplification: Disable if overlap unsafe if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) { @@ -1986,8 +1990,18 @@ namespace MIPSComp { u8 dregs[4]; GetVectorRegsPrefixD(dregs, sz, vd); - for (int i = 0; i < n; i++) { - ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum])); + + if (IsVec4(sz, dregs)) { + ir.Write(IROp::SetConstF, IRVTEMP_0, ir.AddConstantFloat(cst_constants[conNum])); + ir.Write(IROp::Vec4Shuffle, dregs[0], IRVTEMP_0, 0); + } else { + for (int i = 0; i < n; i++) { + // Most of the time, materializing a float is slower than copying from another float. + if (i == 0) + ir.Write(IROp::SetConstF, dregs[i], ir.AddConstantFloat(cst_constants[conNum])); + else + ir.Write(IROp::FMov, dregs[i], dregs[0]); + } } ApplyPrefixD(dregs, sz, vd); } @@ -2128,21 +2142,25 @@ namespace MIPSComp { GetVectorRegsPrefixT(tregs, sz, _VS); GetVectorRegsPrefixD(dregs, sz, _VD); - u8 tempregs[4]; - for (int i = 0; i < n; ++i) { - if (!IsOverlapSafe(dregs[i], n, sregs)) { - tempregs[i] = IRVTEMP_0 + i; - } else { - tempregs[i] = dregs[i]; + if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) { + ir.Write(IROp::Vec4Add, dregs[0], tregs[0], sregs[0]); + } else { + u8 tempregs[4]; + for (int i = 0; i < n; ++i) { + if (!IsOverlapSafe(dregs[i], n, sregs)) { + tempregs[i] = IRVTEMP_0 + i; + } else { + tempregs[i] = dregs[i]; + } } - } - for (int i = 0; i < n; ++i) { - ir.Write(IROp::FAdd, tempregs[i], tregs[i], sregs[i]); - } - for (int i = 0; i < n; ++i) { - if (dregs[i] != tempregs[i]) { - ir.Write(IROp::FMov, dregs[i], tempregs[i]); + for (int i = 0; i < n; ++i) { + ir.Write(IROp::FAdd, tempregs[i], tregs[i], sregs[i]); + } + for (int i = 0; i < n; ++i) { + if (dregs[i] != tempregs[i]) { + ir.Write(IROp::FMov, dregs[i], tempregs[i]); + } } }