x86jit: Speed up float to int conversions.

This commit is contained in:
Unknown W. Brackets 2023-09-19 18:28:47 -07:00
parent 7dc18a94af
commit 1c81d47dd4
2 changed files with 149 additions and 77 deletions

View File

@ -47,6 +47,7 @@ void X64JitBackend::EmitFPUConstants() {
EmitConst4x32(&constants.qNAN, 0x7FC00000); EmitConst4x32(&constants.qNAN, 0x7FC00000);
EmitConst4x32(&constants.positiveOnes, 0x3F800000); EmitConst4x32(&constants.positiveOnes, 0x3F800000);
EmitConst4x32(&constants.negativeOnes, 0xBF800000); EmitConst4x32(&constants.negativeOnes, 0xBF800000);
EmitConst4x32(&constants.maxIntBelowAsFloat, 0x4EFFFFFF);
constants.mulTableVi2f = (const float *)GetCodePointer(); constants.mulTableVi2f = (const float *)GetCodePointer();
for (uint8_t i = 0; i < 32; ++i) { for (uint8_t i = 0; i < 32; ++i) {
@ -57,20 +58,14 @@ void X64JitBackend::EmitFPUConstants() {
Write32(val); Write32(val);
} }
constants.mulTableVf2i = (const double *)GetCodePointer(); constants.mulTableVf2i = (const float *)GetCodePointer();
for (uint8_t i = 0; i < 32; ++i) { for (uint8_t i = 0; i < 32; ++i) {
double fval = (1UL << i); float fval = (float)(1ULL << i);
uint64_t val; uint32_t val;
memcpy(&val, &fval, sizeof(val)); memcpy(&val, &fval, sizeof(val));
Write64(val); Write32(val);
} }
// Note: this first one is (double)(int)0x80000000, sign extended.
constants.minIntAsDouble = (const double *)GetCodePointer();
Write64(0xC1E0000000000000ULL);
constants.maxIntAsDouble = (const double *)GetCodePointer();
Write64(0x41DFFFFFFFC00000ULL);
} }
void X64JitBackend::CopyVec4ToFPRLane0(Gen::X64Reg dest, Gen::X64Reg src, int lane) { void X64JitBackend::CopyVec4ToFPRLane0(Gen::X64Reg dest, Gen::X64Reg src, int lane) {
@ -579,11 +574,14 @@ void X64JitBackend::CompIR_FCvt(IRInst inst) {
case IROp::FCvtWS: case IROp::FCvtWS:
{ {
regs_.Map(inst); regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), M(constants.positiveInfinity)); // rip accessible UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1)); CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
// UCOMISS set ZF if EQUAL (to infinity) or UNORDERED. // UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
FixupBranch skip = J_CC(CC_NZ); // We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip); SetJumpTarget(skip);
@ -599,54 +597,65 @@ void X64JitBackend::CompIR_FCvt(IRInst inst) {
regs_.Map(inst); regs_.Map(inst);
if (cpu_info.bSSE4_1) { if (cpu_info.bSSE4_1) {
int scale = inst.src2 & 0x1F; int scale = inst.src2 & 0x1F;
int rmode = inst.src2 >> 6; IRRoundMode rmode = (IRRoundMode)(inst.src2 >> 6);
CVTSS2SD(regs_.FX(inst.dest), regs_.F(inst.src1)); if (scale != 0 && cpu_info.bAVX) {
VMULSS(regs_.FX(inst.dest), regs_.FX(inst.src1), M(&constants.mulTableVf2i[scale])); // rip accessible
} else {
if (inst.dest != inst.src1)
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
if (scale != 0) if (scale != 0)
MULSD(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible MULSS(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible
}
// On NAN, we want maxInt anyway, so let's let it be the second param. UCOMISS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible
MAXSD(regs_.FX(inst.dest), M(constants.minIntAsDouble)); // rip accessible
MINSD(regs_.FX(inst.dest), M(constants.maxIntAsDouble)); // rip accessible
switch (rmode) { switch (rmode) {
case 0: case IRRoundMode::RINT_0:
ROUNDNEARPD(regs_.FX(inst.dest), regs_.F(inst.dest)); ROUNDNEARPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest)); CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break; break;
case 1: case IRRoundMode::CAST_1:
CVTTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest)); CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break; break;
case 2: case IRRoundMode::CEIL_2:
ROUNDCEILPD(regs_.FX(inst.dest), regs_.F(inst.dest)); ROUNDCEILPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest)); CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break; break;
case 3: case IRRoundMode::FLOOR_3:
ROUNDFLOORPD(regs_.FX(inst.dest), regs_.F(inst.dest)); ROUNDFLOORPS(regs_.FX(inst.dest), regs_.F(inst.dest));
CVTPD2DQ(regs_.FX(inst.dest), regs_.F(inst.dest)); CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
break; break;
} }
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
} else { } else {
int scale = inst.src2 & 0x1F; int scale = inst.src2 & 0x1F;
int rmode = inst.src2 >> 6; IRRoundMode rmode = (IRRoundMode)(inst.src2 >> 6);
int setMXCSR = -1; int setMXCSR = -1;
bool useTrunc = false; bool useTrunc = false;
switch (rmode) { switch (rmode) {
case 0: case IRRoundMode::RINT_0:
// TODO: Could skip if hasSetRounding, but we don't have the flag. // TODO: Could skip if hasSetRounding, but we don't have the flag.
setMXCSR = 0; setMXCSR = 0;
break; break;
case 1: case IRRoundMode::CAST_1:
useTrunc = true; useTrunc = true;
break; break;
case 2: case IRRoundMode::CEIL_2:
setMXCSR = 2; setMXCSR = 2;
break; break;
case 3: case IRRoundMode::FLOOR_3:
setMXCSR = 1; setMXCSR = 1;
break; break;
} }
@ -665,21 +674,26 @@ void X64JitBackend::CompIR_FCvt(IRInst inst) {
LDMXCSR(MDisp(CTXREG, tempOffset)); LDMXCSR(MDisp(CTXREG, tempOffset));
} }
CVTSS2SD(regs_.FX(inst.dest), regs_.F(inst.src1)); if (inst.dest != inst.src1)
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
if (scale != 0) if (scale != 0)
MULSD(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); MULSS(regs_.FX(inst.dest), M(&constants.mulTableVf2i[scale])); // rip accessible
// On NAN, we want maxInt anyway, so let's let it be the second param. UCOMISS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible
MAXSD(regs_.FX(inst.dest), M(constants.minIntAsDouble));
MINSD(regs_.FX(inst.dest), M(constants.maxIntAsDouble));
if (useTrunc) { if (useTrunc) {
CVTTSD2SI(SCRATCH1, regs_.F(inst.dest)); CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
} else { } else {
CVTSD2SI(SCRATCH1, regs_.F(inst.dest)); CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
} }
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1)); // UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
// Return MXCSR to its previous value. // Return MXCSR to its previous value.
if (setMXCSR != -1) { if (setMXCSR != -1) {
@ -704,47 +718,106 @@ void X64JitBackend::CompIR_FRound(IRInst inst) {
CONDITIONAL_DISABLE; CONDITIONAL_DISABLE;
switch (inst.op) { switch (inst.op) {
case IROp::FCeil:
case IROp::FFloor:
case IROp::FRound: case IROp::FRound:
CompIR_Generic(inst); if (cpu_info.bSSE4_1) {
regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
switch (inst.op) {
case IROp::FCeil:
ROUNDCEILPS(regs_.FX(inst.dest), regs_.F(inst.src1));
break;
case IROp::FFloor:
ROUNDFLOORPS(regs_.FX(inst.dest), regs_.F(inst.src1));
break;
case IROp::FRound:
ROUNDNEARPS(regs_.FX(inst.dest), regs_.F(inst.src1));
break;
default:
INVALIDOP;
}
CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.dest));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
} else {
regs_.Map(inst);
int setMXCSR = -1;
switch (inst.op) {
case IROp::FRound:
// TODO: Could skip if hasSetRounding, but we don't have the flag.
setMXCSR = 0;
break;
case IROp::FCeil:
setMXCSR = 2;
break;
case IROp::FFloor:
setMXCSR = 1;
break;
default:
INVALIDOP;
}
// TODO: Might be possible to cache this and update between instructions?
// Probably kinda expensive to switch each time...
if (setMXCSR != -1) {
STMXCSR(MDisp(CTXREG, mxcsrTempOffset));
MOV(32, R(SCRATCH1), MDisp(CTXREG, mxcsrTempOffset));
AND(32, R(SCRATCH1), Imm32(~(3 << 13)));
if (setMXCSR != 0) {
OR(32, R(SCRATCH1), Imm32(setMXCSR << 13));
}
MOV(32, MDisp(CTXREG, tempOffset), R(SCRATCH1));
LDMXCSR(MDisp(CTXREG, tempOffset));
}
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
CVTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
SetJumpTarget(skip);
// Return MXCSR to its previous value.
if (setMXCSR != -1) {
LDMXCSR(MDisp(CTXREG, mxcsrTempOffset));
}
}
break; break;
case IROp::FTrunc: case IROp::FTrunc:
{ {
regs_.SpillLockFPR(inst.dest, inst.src1);
X64Reg tempZero = regs_.GetAndLockTempFPR();
regs_.Map(inst); regs_.Map(inst);
UCOMISS(regs_.FX(inst.src1), M(constants.maxIntBelowAsFloat)); // rip accessible
CVTTSS2SI(SCRATCH1, regs_.F(inst.src1)); CVTTPS2DQ(regs_.FX(inst.dest), regs_.F(inst.src1));
// UCOMISS set CF if LESS and ZF if EQUAL to maxIntBelowAsFloat.
// We want noSignMask otherwise, GREATER or UNORDERED.
FixupBranch isNAN = J_CC(CC_P);
FixupBranch skip = J_CC(CC_BE);
SetJumpTarget(isNAN);
MOVAPS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible
// Did we get an indefinite integer value? SetJumpTarget(skip);
CMP(32, R(SCRATCH1), Imm32(0x80000000));
FixupBranch wasExact = J_CC(CC_NE);
XORPS(tempZero, R(tempZero));
if (inst.dest == inst.src1) {
CMPSS(regs_.FX(inst.dest), R(tempZero), CMP_LT);
} else if (cpu_info.bAVX) {
VCMPSS(regs_.FX(inst.dest), regs_.FX(inst.src1), R(tempZero), CMP_LT);
} else {
MOVAPS(regs_.FX(inst.dest), regs_.F(inst.src1));
CMPSS(regs_.FX(inst.dest), R(tempZero), CMP_LT);
}
// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
// We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
MOVD_xmm(R(SCRATCH1), regs_.FX(inst.dest));
XOR(32, R(SCRATCH1), Imm32(0x7fffffff));
SetJumpTarget(wasExact);
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
break; break;
} }
case IROp::FCeil:
case IROp::FFloor:
CompIR_Generic(inst);
break;
default: default:
INVALIDOP; INVALIDOP;
break; break;

View File

@ -148,10 +148,9 @@ private:
const void *positiveOnes; const void *positiveOnes;
const void *negativeOnes; const void *negativeOnes;
const void *qNAN; const void *qNAN;
const void *maxIntBelowAsFloat;
const float *mulTableVi2f; const float *mulTableVi2f;
const double *mulTableVf2i; const float *mulTableVf2i;
const double *minIntAsDouble;
const double *maxIntAsDouble;
const Float4Constant *vec4InitValues; const Float4Constant *vec4InitValues;
}; };
Constants constants; Constants constants;