From 86396ba39b023a255e1f8f0dca0ac95fdc2d85c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?=
Date: Fri, 7 Jul 2017 14:50:50 +0200
Subject: [PATCH] Turning off the "close memory finder" lets us find more
 RIP addressing...

---
 Core/MIPS/x86/CompFPU.cpp       | 10 ++++----
 Core/MIPS/x86/CompVFPU.cpp      | 41 +++++++++++++++++++++++++--------
 GPU/Common/VertexDecoderX86.cpp | 24 +++++++++++--------
 3 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/Core/MIPS/x86/CompFPU.cpp b/Core/MIPS/x86/CompFPU.cpp
index d276b926d6..5b64512477 100644
--- a/Core/MIPS/x86/CompFPU.cpp
+++ b/Core/MIPS/x86/CompFPU.cpp
@@ -277,14 +277,15 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 	case 5: //F(fd) = fabsf(F(fs)); break; //abs
 		fpr.SpillLock(fd, fs);
 		fpr.MapReg(fd, fd == fs, true);
+		MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssNoSignMask[0]));
 		if (fd != fs && fpr.IsMapped(fs)) {
-			MOVAPS(fpr.RX(fd), M(ssNoSignMask));
+			MOVAPS(fpr.RX(fd), MatR(TEMPREG));
 			ANDPS(fpr.RX(fd), fpr.R(fs));
 		} else {
 			if (fd != fs) {
 				MOVSS(fpr.RX(fd), fpr.R(fs));
 			}
-			ANDPS(fpr.RX(fd), M(ssNoSignMask));
+			ANDPS(fpr.RX(fd), MatR(TEMPREG));
 		}
 		break;
 
@@ -299,14 +300,15 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 	case 7: //F(fd) = -F(fs); break; //neg
 		fpr.SpillLock(fd, fs);
 		fpr.MapReg(fd, fd == fs, true);
+		MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssSignBits2[0]));
 		if (fd != fs && fpr.IsMapped(fs)) {
-			MOVAPS(fpr.RX(fd), M(ssSignBits2));
+			MOVAPS(fpr.RX(fd), MatR(TEMPREG));
 			XORPS(fpr.RX(fd), fpr.R(fs));
 		} else {
 			if (fd != fs) {
 				MOVSS(fpr.RX(fd), fpr.R(fs));
 			}
-			XORPS(fpr.RX(fd), M(ssSignBits2));
+			XORPS(fpr.RX(fd), MatR(TEMPREG));
 		}
 		break;
 
diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp
index 04ef64808e..6deaa3f872 100644
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@@ -575,7 +575,12 @@ void Jit::Comp_VIdt(MIPSOpcode op) {
 	GetVectorRegsPrefixD(dregs, sz, _VD);
 	if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
 		int row = vd & (n - 1);
-		MOVAPS(fpr.VSX(dregs), M(identityMatrix[row]));
+		if (RipAccessible(identityMatrix)) {
+			MOVAPS(fpr.VSX(dregs), M(identityMatrix[row]));  // rip accessible
+		} else {
+			MOV(PTRBITS, R(TEMPREG), ImmPtr(&identityMatrix[row]));
+			MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));
+		}
 		ApplyPrefixD(dregs, sz);
 		fpr.ReleaseSpillLocks();
 		return;
@@ -1603,7 +1608,12 @@ void Jit::Comp_Vh2f(MIPSOpcode op) {
 	SSE_CONST4(magic, (254 - 15) << 23);
 	SSE_CONST4(was_infnan, 0x7bff);
 	SSE_CONST4(exp_infnan, 255 << 23);
-
+
+	// TODO: Fix properly
+	if (!RipAccessible(mask_nosign)) {
+		DISABLE;
+	}
+
 #undef SSE_CONST4
 	VectorSize sz = GetVecSize(op);
 	VectorSize outsize;
@@ -1639,14 +1649,14 @@ void Jit::Comp_Vh2f(MIPSOpcode op) {
 	// OK, 16 bits in each word.
 	// Let's go. Deep magic here.
 	MOVAPS(XMM1, R(XMM0));
-	ANDPS(XMM0, M(mask_nosign)); // xmm0 = expmant
+	ANDPS(XMM0, M(&mask_nosign[0])); // xmm0 = expmant
 	XORPS(XMM1, R(XMM0)); // xmm1 = justsign = expmant ^ xmm0
 	MOVAPS(tempR, R(XMM0));
-	PCMPGTD(tempR, M(was_infnan)); // xmm2 = b_wasinfnan
+	PCMPGTD(tempR, M(&was_infnan[0])); // xmm2 = b_wasinfnan
 	PSLLD(XMM0, 13);
 	MULPS(XMM0, M(magic)); /// xmm0 = scaled
 	PSLLD(XMM1, 16); // xmm1 = sign
-	ANDPS(tempR, M(exp_infnan));
+	ANDPS(tempR, M(&exp_infnan[0]));
 	ORPS(XMM1, R(tempR));
 	ORPS(XMM0, R(XMM1));
 
@@ -1732,7 +1742,7 @@ void Jit::Comp_Vx2i(MIPSOpcode op) {
 		MOVSS(XMM0, fpr.V(sregs[0]));
 		if (cpu_info.bSSSE3) {
 			// Not really different speed. Generates a bit less code.
-			PSHUFB(XMM0, M(vuc2i_shuffle));
+			PSHUFB(XMM0, M(&vuc2i_shuffle[0]));
 		} else {
 			// First, we change 0xDDCCBBAA to 0xDDDDCCCCBBBBAAAA.
 			PUNPCKLBW(XMM0, R(XMM0));
@@ -1742,7 +1752,7 @@
 	} else {
 		if (cpu_info.bSSSE3) {
 			MOVSS(XMM0, fpr.V(sregs[0]));
-			PSHUFB(XMM0, M(vc2i_shuffle));
+			PSHUFB(XMM0, M(&vc2i_shuffle[0]));
 		} else {
 			PXOR(XMM1, R(XMM1));
 			MOVSS(XMM0, fpr.V(sregs[0]));
@@ -1861,8 +1871,14 @@ void Jit::Comp_Vf2i(MIPSOpcode op) {
 		}
 	}
 
-	if (*mult != 1.0f)
-		MOVSD(XMM1, M(mult));
+	if (*mult != 1.0f) {
+		if (RipAccessible(mult)) {
+			MOVSD(XMM1, M(mult));  // rip accessible
+		} else {
+			MOV(PTRBITS, R(TEMPREG), ImmPtr(mult));
+			MOVSD(XMM1, MatR(TEMPREG));
+		}
+	}
 
 	fpr.MapRegsV(tempregs, sz, MAP_DIRTY | MAP_NOINIT);
 	for (int i = 0; i < n; i++) {
@@ -3453,7 +3469,12 @@ void Jit::CompVrotShuffle(u8 *dregs, int imm, int n, bool negSin) {
 		case 'S':
 			MOVSS(fpr.V(dregs[i]), XMM0);
 			if (negSin) {
-				XORPS(fpr.VX(dregs[i]), M(&signBitLower));
+				if (RipAccessible(&signBitLower)) {
+					XORPS(fpr.VX(dregs[i]), M(&signBitLower));  // rip accessible
+				} else {
+					MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
+					XORPS(fpr.VX(dregs[i]), MatR(TEMPREG));
+				}
 			}
 			break;
 		case '0':
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 2493685cff..0d2364c605 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -882,18 +882,18 @@ void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
 	MOV(32, R(tempReg2), R(tempReg1));
 	SHR(32, R(tempReg2), Imm8(16));
 
-	auto updateSide = [&](X64Reg r, CCFlags skipCC, u16 *value) {
-		CMP(16, R(r), M(value));
+	MOV(PTRBITS, R(tempReg3), ImmPtr(&gstate_c.vertBounds));
+	auto updateSide = [&](X64Reg r, CCFlags skipCC, int offset) {
+		CMP(16, R(r), MDisp(tempReg3, offset));
 		FixupBranch skip = J_CC(skipCC);
-		MOV(16, M(value), R(r));
+		MOV(16, MDisp(tempReg3, offset), R(r));
 		SetJumpTarget(skip);
 	};
 
-	// TODO: Can this actually be fast? Hmm, floats aren't better.
-	updateSide(tempReg1, CC_GE, &gstate_c.vertBounds.minU);
-	updateSide(tempReg1, CC_LE, &gstate_c.vertBounds.maxU);
-	updateSide(tempReg2, CC_GE, &gstate_c.vertBounds.minV);
-	updateSide(tempReg2, CC_LE, &gstate_c.vertBounds.maxV);
+	updateSide(tempReg1, CC_GE, offsetof(KnownVertexBounds, minU));
+	updateSide(tempReg1, CC_LE, offsetof(KnownVertexBounds, maxU));
+	updateSide(tempReg2, CC_GE, offsetof(KnownVertexBounds, minV));
+	updateSide(tempReg2, CC_LE, offsetof(KnownVertexBounds, maxV));
 }
 
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
@@ -923,7 +923,6 @@ void VertexDecoderJitCache::Jit_Color8888() {
 	SetJumpTarget(skip);
 }
 
-static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, };
 static const u32 MEMORY_ALIGNED16(color4444mask[4]) = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };
 
 void VertexDecoderJitCache::Jit_Color4444() {
@@ -931,7 +930,12 @@ void VertexDecoderJitCache::Jit_Color4444() {
 	MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->coloff));
 	// Spread to RGBA -> R00GB00A.
 	PUNPCKLBW(fpScratchReg, R(fpScratchReg));
-	PAND(fpScratchReg, M(color4444mask));
+	if (RipAccessible(&color4444mask[0])) {
+		PAND(fpScratchReg, M(&color4444mask[0]));
+	} else {
+		MOV(PTRBITS, R(tempReg1), ImmPtr(&color4444mask));
+		PAND(fpScratchReg, MatR(tempReg1));
+	}
 	MOVSS(fpScratchReg2, R(fpScratchReg));
 	MOVSS(fpScratchReg3, R(fpScratchReg));
 	// Create 0R000B00 and 00G000A0.
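
Note on the recurring pattern in this patch: every constant access now either
addresses the constant RIP-relatively (when RipAccessible() reports that it is
within reach of the emitted code) or falls back to loading its address with
MOV(PTRBITS, R(TEMPREG), ImmPtr(...)) and addressing through MatR(TEMPREG).
The sketch below illustrates the distance check such a helper has to perform.
It is a minimal illustration only, not PPSSPP's actual RipAccessible()
implementation, and the emitPos parameter is a hypothetical stand-in for the
emitter's current write pointer.

#include <cstddef>
#include <cstdint>

// On x86-64, a RIP-relative operand encodes a signed 32-bit displacement
// relative to the end of the current instruction, so a constant is only
// directly addressable when it lies within roughly +/-2 GB of the code
// being emitted.
static bool RipAccessibleSketch(const void *emitPos, const void *target) {
#if defined(__x86_64__) || defined(_M_X64)
	ptrdiff_t distance = (const uint8_t *)target - (const uint8_t *)emitPos;
	// Leave a little headroom so the displacement still fits once the
	// instruction's own length is added to RIP.
	const ptrdiff_t kMaxDisp = 0x7FFFFFF0;
	return distance > -kMaxDisp && distance < kMaxDisp;
#else
	// 32-bit x86 reaches any address with an absolute disp32, so the
	// direct form is always available.
	(void)emitPos;
	(void)target;
	return true;
#endif
}

The fallback costs one extra immediate-pointer MOV per constant but places no
constraint on where the JIT's data lives, so the direct RIP-relative form
becomes an opportunistic optimization rather than a layout requirement.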