Turning off the "close memory finder" lets us find more RIP addressing...

Henrik Rydgård 2017-07-07 14:50:50 +02:00
parent 8872057a2d
commit 86396ba39b
3 changed files with 51 additions and 24 deletions
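Background for the pattern repeated below: the emitter's M() operand encodes a constant's address RIP-relatively, which is only possible when the data lies within a signed 32-bit displacement of the generated code. Where that can't be guaranteed, the address is materialized in TEMPREG with ImmPtr() and the operand becomes MatR(TEMPREG). A standalone sketch of such a reachability test (name and signature hypothetical; PPSSPP's actual RipAccessible may differ):

```cpp
#include <cstdint>

// Sketch: a RIP-relative disp32 reaches +/- 2 GB from the instruction, so a
// constant is directly addressable only if it lies within that window of the
// code currently being emitted.
static bool RipAccessibleFrom(const void *emitPtr, const void *dataPtr) {
    int64_t diff = (int64_t)(intptr_t)dataPtr - (int64_t)(intptr_t)emitPtr;
    return diff >= INT32_MIN && diff <= INT32_MAX;
}
```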

View File

@@ -277,14 +277,15 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 case 5: //F(fd) = fabsf(F(fs)); break; //abs
     fpr.SpillLock(fd, fs);
     fpr.MapReg(fd, fd == fs, true);
+    MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssNoSignMask[0]));
     if (fd != fs && fpr.IsMapped(fs)) {
-        MOVAPS(fpr.RX(fd), M(ssNoSignMask));
+        MOVAPS(fpr.RX(fd), MatR(TEMPREG));
         ANDPS(fpr.RX(fd), fpr.R(fs));
     } else {
         if (fd != fs) {
             MOVSS(fpr.RX(fd), fpr.R(fs));
         }
-        ANDPS(fpr.RX(fd), M(ssNoSignMask));
+        ANDPS(fpr.RX(fd), MatR(TEMPREG));
     }
     break;
@@ -299,14 +300,15 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 case 7: //F(fd) = -F(fs); break; //neg
     fpr.SpillLock(fd, fs);
     fpr.MapReg(fd, fd == fs, true);
+    MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssSignBits2[0]));
     if (fd != fs && fpr.IsMapped(fs)) {
-        MOVAPS(fpr.RX(fd), M(ssSignBits2));
+        MOVAPS(fpr.RX(fd), MatR(TEMPREG));
         XORPS(fpr.RX(fd), fpr.R(fs));
     } else {
         if (fd != fs) {
             MOVSS(fpr.RX(fd), fpr.R(fs));
         }
-        XORPS(fpr.RX(fd), M(ssSignBits2));
+        XORPS(fpr.RX(fd), MatR(TEMPREG));
     }
     break;
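Both hunks hoist the constant's address into TEMPREG before the branch, since either path may need it. The masks encode the usual single-precision bit tricks: ANDPS with ssNoSignMask clears the sign bit (abs), XORPS with ssSignBits2 flips it (neg). A scalar sketch of the semantics, assuming per-lane mask values of 0x7FFFFFFF and 0x80000000:

```cpp
#include <cstdint>
#include <cstring>

static float AbsViaMask(float x) {   // what ANDPS with ssNoSignMask computes
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits &= 0x7FFFFFFFu;             // clear the IEEE-754 sign bit
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}

static float NegViaMask(float x) {   // what XORPS with ssSignBits2 computes
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits ^= 0x80000000u;             // flip the sign bit
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}
```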

View File

@@ -575,7 +575,12 @@ void Jit::Comp_VIdt(MIPSOpcode op) {
     GetVectorRegsPrefixD(dregs, sz, _VD);
     if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
         int row = vd & (n - 1);
-        MOVAPS(fpr.VSX(dregs), M(identityMatrix[row]));
+        if (RipAccessible(identityMatrix)) {
+            MOVAPS(fpr.VSX(dregs), M(identityMatrix[row]));  // rip accessible
+        } else {
+            MOV(PTRBITS, R(TEMPREG), ImmPtr(&identityMatrix[row]));
+            MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));
+        }
         ApplyPrefixD(dregs, sz);
         fpr.ReleaseSpillLocks();
         return;
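Comp_VIdt stores one row of a 4x4 identity matrix, selected by vd & (n - 1). Presumably the table looks something like this (layout assumed for illustration; only the aligned-row shape matters for the MOVAPS):

```cpp
// Assumed layout of the constant rows the JIT loads with MOVAPS; each row
// must be 16-byte aligned for the aligned load to be legal.
alignas(16) static const float identityMatrix[4][4] = {
    { 1.0f, 0.0f, 0.0f, 0.0f },
    { 0.0f, 1.0f, 0.0f, 0.0f },
    { 0.0f, 0.0f, 1.0f, 0.0f },
    { 0.0f, 0.0f, 0.0f, 1.0f },
};
```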
@@ -1603,7 +1608,12 @@ void Jit::Comp_Vh2f(MIPSOpcode op) {
     SSE_CONST4(magic, (254 - 15) << 23);
     SSE_CONST4(was_infnan, 0x7bff);
     SSE_CONST4(exp_infnan, 255 << 23);
+    // TODO: Fix properly
+    if (!RipAccessible(mask_nosign)) {
+        DISABLE;
+    }
 #undef SSE_CONST4
     VectorSize sz = GetVecSize(op);
     VectorSize outsize;
@@ -1639,14 +1649,14 @@ void Jit::Comp_Vh2f(MIPSOpcode op) {
     // OK, 16 bits in each word.
     // Let's go. Deep magic here.
     MOVAPS(XMM1, R(XMM0));
-    ANDPS(XMM0, M(mask_nosign)); // xmm0 = expmant
+    ANDPS(XMM0, M(&mask_nosign[0])); // xmm0 = expmant
     XORPS(XMM1, R(XMM0)); // xmm1 = justsign = expmant ^ xmm0
     MOVAPS(tempR, R(XMM0));
-    PCMPGTD(tempR, M(was_infnan)); // xmm2 = b_wasinfnan
+    PCMPGTD(tempR, M(&was_infnan[0])); // xmm2 = b_wasinfnan
     PSLLD(XMM0, 13);
     MULPS(XMM0, M(magic)); /// xmm0 = scaled
     PSLLD(XMM1, 16); // xmm1 = sign
-    ANDPS(tempR, M(exp_infnan));
+    ANDPS(tempR, M(&exp_infnan[0]));
     ORPS(XMM1, R(tempR));
     ORPS(XMM0, R(XMM1));
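The constants defined in the previous hunk feed the classic half-to-float rebias trick this sequence implements. A scalar sketch of what it computes per lane (semantics only, assuming the SSE_CONST4 values shown above; not the JIT's actual code path):

```cpp
#include <cstdint>
#include <cstring>

static float HalfToFloat(uint16_t h) {
    uint32_t expmant = (uint32_t)(h & 0x7FFFu) << 13;  // mask_nosign, then PSLLD 13
    uint32_t sign = (uint32_t)(h & 0x8000u) << 16;     // justsign, then PSLLD 16
    float f;
    std::memcpy(&f, &expmant, sizeof(f));
    const uint32_t magicBits = (254 - 15) << 23;       // SSE_CONST4(magic, ...)
    float magic;
    std::memcpy(&magic, &magicBits, sizeof(magic));
    f *= magic;                                        // MULPS: rebias exponent by 2^112,
                                                       // hardware handles denormal halves
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    if ((h & 0x7FFFu) > 0x7BFFu)                       // PCMPGTD against was_infnan
        bits |= 255u << 23;                            // exp_infnan: force inf/nan exponent
    bits |= sign;                                      // ORPS the sign back in
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}
```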
@@ -1732,7 +1742,7 @@ void Jit::Comp_Vx2i(MIPSOpcode op) {
         MOVSS(XMM0, fpr.V(sregs[0]));
         if (cpu_info.bSSSE3) {
             // Not really different speed. Generates a bit less code.
-            PSHUFB(XMM0, M(vuc2i_shuffle));
+            PSHUFB(XMM0, M(&vuc2i_shuffle[0]));
         } else {
             // First, we change 0xDDCCBBAA to 0xDDDDCCCCBBBBAAAA.
             PUNPCKLBW(XMM0, R(XMM0));
@@ -1742,7 +1752,7 @@ void Jit::Comp_Vx2i(MIPSOpcode op) {
     } else {
         if (cpu_info.bSSSE3) {
             MOVSS(XMM0, fpr.V(sregs[0]));
-            PSHUFB(XMM0, M(vc2i_shuffle));
+            PSHUFB(XMM0, M(&vc2i_shuffle[0]));
         } else {
             PXOR(XMM1, R(XMM1));
             MOVSS(XMM0, fpr.V(sregs[0]));
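Both paths above use PSHUFB with a constant control mask. For reference, a scalar model of PSHUFB's semantics (the specific vuc2i_shuffle/vc2i_shuffle mask contents are not shown in this diff, so only the instruction's behavior is sketched):

```cpp
#include <cstdint>

// Each output byte i is src[ctl[i] & 0x0F], or zero when the control byte's
// top bit is set. This is what lets one constant mask do the whole expansion.
static void Pshufb(uint8_t dst[16], const uint8_t src[16], const uint8_t ctl[16]) {
    for (int i = 0; i < 16; i++)
        dst[i] = (ctl[i] & 0x80) ? 0 : src[ctl[i] & 0x0F];
}
```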
@@ -1861,8 +1871,14 @@ void Jit::Comp_Vf2i(MIPSOpcode op) {
         }
     }
-    if (*mult != 1.0f)
-        MOVSD(XMM1, M(mult));
+    if (*mult != 1.0f) {
+        if (RipAccessible(mult)) {
+            MOVSD(XMM1, M(mult));  // rip accessible
+        } else {
+            MOV(PTRBITS, R(TEMPREG), ImmPtr(mult));
+            MOVSD(XMM1, MatR(TEMPREG));
+        }
+    }
     fpr.MapRegsV(tempregs, sz, MAP_DIRTY | MAP_NOINIT);
     for (int i = 0; i < n; i++) {
@@ -3453,7 +3469,12 @@ void Jit::CompVrotShuffle(u8 *dregs, int imm, int n, bool negSin) {
     case 'S':
         MOVSS(fpr.V(dregs[i]), XMM0);
         if (negSin) {
-            XORPS(fpr.VX(dregs[i]), M(&signBitLower));
+            if (RipAccessible(&signBitLower)) {
+                XORPS(fpr.VX(dregs[i]), M(&signBitLower));  // rip accessible
+            } else {
+                MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
+                XORPS(fpr.VX(dregs[i]), MatR(TEMPREG));
+            }
         }
         break;
     case '0':

View File

@@ -882,18 +882,18 @@ void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
     MOV(32, R(tempReg2), R(tempReg1));
     SHR(32, R(tempReg2), Imm8(16));
-    auto updateSide = [&](X64Reg r, CCFlags skipCC, u16 *value) {
-        CMP(16, R(r), M(value));
+    MOV(PTRBITS, R(tempReg3), ImmPtr(&gstate_c.vertBounds));
+    auto updateSide = [&](X64Reg r, CCFlags skipCC, int offset) {
+        CMP(16, R(r), MDisp(tempReg3, offset));
         FixupBranch skip = J_CC(skipCC);
-        MOV(16, M(value), R(r));
+        MOV(16, MDisp(tempReg3, offset), R(r));
         SetJumpTarget(skip);
     };
     // TODO: Can this actually be fast? Hmm, floats aren't better.
-    updateSide(tempReg1, CC_GE, &gstate_c.vertBounds.minU);
-    updateSide(tempReg1, CC_LE, &gstate_c.vertBounds.maxU);
-    updateSide(tempReg2, CC_GE, &gstate_c.vertBounds.minV);
-    updateSide(tempReg2, CC_LE, &gstate_c.vertBounds.maxV);
+    updateSide(tempReg1, CC_GE, offsetof(KnownVertexBounds, minU));
+    updateSide(tempReg1, CC_LE, offsetof(KnownVertexBounds, maxU));
+    updateSide(tempReg2, CC_GE, offsetof(KnownVertexBounds, minV));
+    updateSide(tempReg2, CC_LE, offsetof(KnownVertexBounds, maxV));
 }
 
 void VertexDecoderJitCache::Jit_TcFloatThrough() {
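Here the fix takes a different shape: rather than gating on RipAccessible, the code loads the address of gstate_c.vertBounds into tempReg3 once and then addresses the four u16 fields by displacement, which encodes compactly no matter where gstate_c lands in memory. A sketch of the layout the offsetof() calls assume (the real KnownVertexBounds definition may differ):

```cpp
#include <cstddef>
#include <cstdint>

struct KnownVertexBounds {   // assumed field order, one u16 each
    uint16_t minU, maxU;
    uint16_t minV, maxV;
};

// With tempReg3 = &gstate_c.vertBounds, each access becomes a [base + disp]
// operand; under this layout every displacement fits in a single byte.
static_assert(offsetof(KnownVertexBounds, maxV) == 6, "expected packed u16 fields");
```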
@@ -923,7 +923,6 @@ void VertexDecoderJitCache::Jit_Color8888() {
     SetJumpTarget(skip);
 }
 
-static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, };
 static const u32 MEMORY_ALIGNED16(color4444mask[4]) = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };
 
 void VertexDecoderJitCache::Jit_Color4444() {
@@ -931,7 +930,12 @@ void VertexDecoderJitCache::Jit_Color4444() {
     MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->coloff));
     // Spread to RGBA -> R00GB00A.
     PUNPCKLBW(fpScratchReg, R(fpScratchReg));
-    PAND(fpScratchReg, M(color4444mask));
+    if (RipAccessible(&color4444mask[0])) {
+        PAND(fpScratchReg, M(&color4444mask[0]));
+    } else {
+        MOV(PTRBITS, R(tempReg1), ImmPtr(&color4444mask));
+        PAND(fpScratchReg, MatR(tempReg1));
+    }
     MOVSS(fpScratchReg2, R(fpScratchReg));
     MOVSS(fpScratchReg3, R(fpScratchReg));
     // Create 0R000B00 and 00G000A0.
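What the unpack-and-mask dance ultimately computes is a 4444-to-8888 expansion where each 4-bit channel is replicated into a full byte. A scalar sketch of that end result (semantics only, not the emitted instruction sequence):

```cpp
#include <cstdint>

// Each 4-bit channel N of a 4444 color becomes the byte 0xNN in the output.
static uint32_t Color4444To8888(uint16_t c) {
    uint32_t out = 0;
    for (int i = 0; i < 4; i++) {
        uint32_t nibble = (c >> (i * 4)) & 0xF;
        out |= (nibble | (nibble << 4)) << (i * 8);  // 0xN -> 0xNN
    }
    return out;
}
```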