x64jit: Initial reg transfer.

This commit is contained in:
Unknown W. Brackets 2023-09-23 11:14:42 -07:00
parent 88b6442527
commit d9f6bae1ff
4 changed files with 195 additions and 7 deletions

View File

@@ -1697,7 +1697,6 @@ void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, ar
void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
// THESE TWO ARE UNTESTED.
// UNPCKLPS (0F 14): interleave the two low floats of dest and arg into dest.
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
// UNPCKHPS (0F 15): interleave the two high floats of dest and arg into dest.
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
@@ -1892,6 +1891,9 @@ void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}
// INSERTPS (66 0F 3A 21): imm8 = (srcsubreg << 6) | (dstsubreg << 4) | zmask.
// srcsubreg selects the source lane (XMM source only), dstsubreg the destination
// lane, and zmask bits zero destination lanes.
void XEmitter::INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteSSE41Op(0x66, 0x3A21, dest, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
// EXTRACTPS (66 0F 3A 17): extract float lane `subreg` to memory or a GPR.
void XEmitter::EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg) { WriteSSE41Op(0x66, 0x3A17, arg, dest, 1); Write8(subreg); }
void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
@@ -2084,7 +2086,7 @@ void XEmitter::VCVTTPD2DQ(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits,
void XEmitter::VCVTTSS2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF3, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
void XEmitter::VCVTTSD2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF2, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
void XEmitter::VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A17, regOp1, arg, 1); Write8(subreg); }
// NOTE(review): two VINSERTPS definitions follow; this looks like a diff's
// removed/added pair with the +/- markers stripped.  Confirm only the
// dstsubreg/srcsubreg/zmask version remains in the final file - with the header's
// defaulted parameters, a 4-arg call would otherwise be ambiguous.
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8(subreg); }
// imm8 = (srcsubreg << 6) | (dstsubreg << 4) | zmask, matching the INSERTPS encoding.
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
void XEmitter::VLDDQU(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0xF2, sseLDDQU, regOp1, arg); }
void XEmitter::VMOVAPS(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x00, sseMOVAPfromRM, regOp1, arg); }
void XEmitter::VMOVAPD(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x66, sseMOVAPfromRM, regOp1, arg); }

View File

@@ -684,12 +684,14 @@ public:
// SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
void DPPD(X64Reg dest, OpArg src, u8 arg);
// These are probably useful for VFPU emulation.
// NOTE(review): the next two declarations overlap with the INSERTPS/EXTRACTPS
// declarations further down; this looks like a diff's removed/added pair with the
// +/- markers stripped.  With the defaults on the newer INSERTPS, a 3-arg call
// would be ambiguous - confirm only the newer declarations remain.
void INSERTPS(X64Reg dest, OpArg src, u8 arg);
void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
#endif
// SSE4: Insert and extract for floats.
// Note: insert from memory or an XMM.
// imm8 is built as (srcsubreg << 6) | (dstsubreg << 4) | zmask (see the .cpp).
void INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
// Extract to memory or GPR.
void EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg);
// SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
void HADDPS(X64Reg dest, OpArg src);
@@ -1040,7 +1042,7 @@ public:
// Can only extract from the low 128 bits.
void VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg);
// Can only insert into the low 128 bits, zeros upper bits. Inserts from XMM.
// NOTE(review): two VINSERTPS overloads appear here; this looks like a diff's
// removed/added pair with +/- markers stripped.  The defaults on the second make
// a 4-arg call ambiguous - confirm only the dstsubreg/srcsubreg/zmask version
// remains in the final header.
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg);
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
void VLDDQU(int bits, X64Reg regOp1, OpArg arg);
void VMOVAPS(int bits, X64Reg regOp1, OpArg arg);
void VMOVAPD(int bits, X64Reg regOp1, OpArg arg);

View File

@@ -453,6 +453,189 @@ void X64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
}
}
// Transfers a MIPS float reg mapping between native XMM registers when the
// requested lane count differs from the current mapping: either extracts a
// single lane out of a vec mapping (lanes == 1), or packs 2/4 individually
// mapped FPRs into one vec register (lanes == 2 or 4).  On success, emits the
// moves, updates the mr[]/nr[] accounting itself, and returns true; anything
// it can't handle falls through to the base implementation.
bool X64IRRegCache::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
// Statically mapped regs must stay where they are.
bool allowed = !mr[nr[nreg].mipsReg].isStatic;
// There's currently no support for non-XMMs here.
allowed = allowed && type == MIPSLoc::FREG;
// dest == -1 means transfer in place (same native reg.)
if (dest == -1)
dest = nreg;
// Only plain INIT or DIRTY mappings take the fast path.  Note: exact compares,
// so combined flag values (e.g. anything with NOINIT) fall through to the base.
if (allowed && (flags == MIPSMap::INIT || flags == MIPSMap::DIRTY)) {
// Alright, changing lane count (possibly including lane position.)
IRReg oldfirst = nr[nreg].mipsReg;
int oldlanes = 0;
// Count how many consecutive MIPS regs are currently packed into nreg.
while (mr[oldfirst + oldlanes].nReg == nreg)
oldlanes++;
_assert_msg_(oldlanes != 0, "TransferNativeReg encountered nreg mismatch");
_assert_msg_(oldlanes != lanes, "TransferNativeReg transfer to same lanecount, misaligned?");
// Case 1: extract a single lane out of the current vec mapping.
if (lanes == 1) {
// Okay, start by storing if dirty.
if (nr[nreg].isDirty) {
StoreNativeReg(nreg, oldfirst, oldlanes);
nr[nreg].isDirty = false;
}
// Next, shuffle the desired element into first place.
// Broadcast the wanted lane to every position (only lane 0 matters afterward.)
u8 shuf = VFPU_SWIZZLE(mr[first].lane, mr[first].lane, mr[first].lane, mr[first].lane);
if (mr[first].lane > 0 && cpu_info.bAVX && dest != nreg) {
// AVX: shuffle straight into dest, non-destructively.
emit_->VSHUFPS(128, FromNativeReg(dest), FromNativeReg(nreg), ::R(FromNativeReg(nreg)), shuf);
} else if (mr[first].lane <= 0 && dest != nreg) {
// Wanted lane is already lane 0 - a plain register copy suffices.
emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
} else if (mr[first].lane > 0) {
// SSE path: copy over first if needed, then shuffle in place.
if (dest != nreg)
emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
emit_->SHUFPS(FromNativeReg(dest), ::R(FromNativeReg(dest)), shuf);
}
// TODO: Consider moving the others to free regs if available? Likely will be wanted later.
// Now update accounting.
for (int i = 0; i < oldlanes; ++i) {
auto &mreg = mr[oldfirst + i];
if (oldfirst + i == first) {
// The surviving lane now lives alone in lane 0 of dest.
mreg.lane = 0;
mreg.nReg = dest;
} else {
// No longer in a register.
mreg.nReg = -1;
mreg.lane = -1;
mreg.loc = MIPSLoc::MEM;
}
}
if (dest != nreg) {
// Hand ownership over to dest.  isDirty is necessarily false at this point:
// any dirty data was stored and the flag cleared above.
nr[dest].isDirty = nr[nreg].isDirty;
nr[nreg].mipsReg = -1;
nr[nreg].isDirty = false;
}
nr[dest].mipsReg = first;
return true;
}
// Case 2: pack 2 or 4 individually mapped lanes into one vec register.
if ((lanes == 4 || lanes == 2) && oldlanes == 1) {
// cur[i] = X64 reg currently holding lane i, or INVALID_REG if it's in memory.
X64Reg cur[4]{};
int numInRegs = 0;
int numDirty = 0;
bool unavail = false;
for (int i = 0; i < lanes; ++i) {
// Bail if a lane is already part of some vec, or is spill-locked by the
// current instruction (lane 0 is exempt - that's nreg itself.)
if (mr[first + i].lane != -1 || (i != 0 && mr[first + i].spillLockIRIndex >= irIndex_)) {
unavail = true;
break;
}
if (mr[first + i].nReg == -1) {
cur[i] = INVALID_REG;
} else {
cur[i] = FromNativeReg(mr[first + i].nReg);
numInRegs++;
// NOTE(review): nr[] is indexed with an X64Reg here, but with an
// IRNativeReg everywhere else (nr[nreg], nr[dest]).  Confirm that
// FromNativeReg() is an identity mapping for these regs, or this
// reads the wrong entry - should probably be nr[mr[first + i].nReg].
if (nr[cur[i]].isDirty)
numDirty++;
}
}
// Nothing currently in a register at all - not worth handling here.
if (numInRegs == 0)
unavail = true;
bool handled = false;
if (!unavail) {
// If everything's currently in a reg, move it into this reg.
if (lanes == 4) {
// Make sure x (lane 0) is in a register, loading it from memory if not.
if (cur[0] == INVALID_REG) {
cur[0] = FromNativeReg(dest);
// CTXREG appears to be biased by 128 into the context struct, hence
// the -128 here and below - TODO confirm against FromNativeReg/ctx setup.
emit_->MOVSS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 0)));
numInRegs++;
}
// A lot of other methods are possible, but seem to make things slower in practice.
if (numInRegs == 4) {
// y = yw##, x = xz##, x = xyzw.
emit_->UNPCKLPS(cur[1], ::R(cur[3]));
emit_->UNPCKLPS(cur[0], ::R(cur[2]));
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
handled = true;
} else if (numInRegs == 2 && cur[1] != INVALID_REG) {
// x = xy##, then load zw.
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
emit_->MOVHPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)));
handled = true;
} else if (cpu_info.bSSE4_1 && cur[1] != INVALID_REG && cur[2] != INVALID_REG) {
// x = xz##, z=w###, y=yw##, x=xyzw.
// NOTE(review): only SSE1 ops are used in this branch (UNPCKLPS/MOVSS),
// so the bSSE4_1 gate looks unnecessary - confirm.
emit_->UNPCKLPS(cur[0], ::R(cur[2]));
emit_->MOVSS(cur[2], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 3)));
emit_->UNPCKLPS(cur[1], ::R(cur[2]));
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
handled = true;
} else if (cpu_info.bSSE4_1 && numDirty != 0 && cur[1] != INVALID_REG && cur[3] != INVALID_REG) {
// y = yw##, load z into x[1], x = xyzw.
emit_->UNPCKLPS(cur[1], ::R(cur[3]));
emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)), 1);
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
handled = true;
} else if (cpu_info.bSSE4_1 && numDirty != 0 && cur[2] != INVALID_REG && cur[3] != INVALID_REG) {
// load y to x[1], z = zw##, x = xyzw.
emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
emit_->UNPCKLPS(cur[2], ::R(cur[3]));
emit_->MOVLHPS(cur[0], cur[2]);
handled = true;
} else if (cpu_info.bSSE4_1) {
// TODO: This might be worse than flushing depending?
// Generic: insert each remaining lane one by one, from reg or memory.
for (int i = 1; i < 4; ++i) {
if (cur[i] == INVALID_REG)
emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + i)), i);
else
emit_->INSERTPS(cur[0], ::R(cur[i]), i, 0);
}
handled = true;
}
} else if (lanes == 2) {
if (cur[0] != INVALID_REG && cur[1] != INVALID_REG) {
// x = xy##.
emit_->UNPCKLPS(cur[0], ::R(cur[1]));
handled = true;
} else if (cur[0] != INVALID_REG && cpu_info.bSSE4_1) {
// Load y from memory straight into lane 1 of x.
emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
handled = true;
}
}
}
if (handled) {
// The vec is now packed in cur[0]; rewrite the accounting to match.
mr[first].lane = 0;
for (int i = 0; i < lanes; ++i) {
if (mr[first + i].nReg != -1) {
// If this was dirty, the combined reg is now dirty.
if (nr[mr[first + i].nReg].isDirty)
nr[dest].isDirty = true;
// Throw away the other register we're no longer using.
if (i != 0)
DiscardNativeReg(mr[first + i].nReg);
}
// And set it as using the new one.
mr[first + i].lane = i;
mr[first + i].loc = type;
mr[first + i].nReg = dest;
}
// If the packing happened in some other register, move the result into dest.
if (cur[0] != FromNativeReg(dest))
emit_->MOVAPS(FromNativeReg(dest), ::R(cur[0]));
if (dest != nreg) {
nr[dest].mipsReg = first;
nr[nreg].mipsReg = -1;
nr[nreg].isDirty = false;
}
return true;
}
}
}
// Anything else: let the generic implementation handle it (store + reload.)
return IRNativeRegCacheBase::TransferNativeReg(nreg, dest, type, first, lanes, flags);
}
void X64IRRegCache::SetNativeRegValue(IRNativeReg nreg, uint32_t imm) {
X64Reg r = FromNativeReg(nreg);
_dbg_assert_(nreg >= 0 && nreg < NUM_X_REGS);

View File

@@ -117,6 +117,7 @@ protected:
// Writes a native reg's value(s) back to the MIPS context; multi-lane aware.
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
// Sets a native register to a 32-bit immediate value.
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
// Presumably writes an immediate directly to the MIPS reg's memory slot - TODO confirm.
void StoreRegValue(IRReg mreg, uint32_t imm) override;
// Re-maps MIPS regs between native regs when the requested lane count changes
// (single-lane extract or 2/4-lane vec pack); falls back to the base class otherwise.
bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) override;
private:
IRNativeReg GPRToNativeReg(Gen::X64Reg r) {