mirror of
https://github.com/hrydgard/ppsspp.git
synced 2024-11-22 21:09:52 +00:00
x64jit: Initial reg transfer.
This commit is contained in:
parent
88b6442527
commit
d9f6bae1ff
@ -1697,7 +1697,6 @@ void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, ar
|
||||
|
||||
// LDDQU (F2 0F F0, SSE3): unaligned 128-bit load intended for integer data.
void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only

// THESE TWO ARE UNTESTED.
// UNPCKLPS (0F 14): interleave the low two packed singles of dest and arg.
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
// UNPCKHPS (0F 15): interleave the high two packed singles of dest and arg.
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
|
||||
|
||||
@ -1892,6 +1891,9 @@ void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest
|
||||
// PACKUSDW (66 0F 38 2B, SSE4.1): pack signed dwords to unsigned words with saturation.
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
// DPPS (66 0F 3A 40, SSE4.1): dot product of packed singles; imm8 mask selects source lanes (high nibble) and destination lanes (low nibble).
void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}

// INSERTPS (66 0F 3A 21, SSE4.1). imm8 layout per the SDM: count_s (source lane)
// in bits 7:6, count_d (destination lane) in bits 5:4, zmask in bits 3:0.
void XEmitter::INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteSSE41Op(0x66, 0x3A21, dest, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
// EXTRACTPS (66 0F 3A 17, SSE4.1): extract one single to memory or a GPR.
// Note the reversed operand order passed to WriteSSE41Op: the XMM is the source here.
void XEmitter::EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg) { WriteSSE41Op(0x66, 0x3A17, arg, dest, 1); Write8(subreg); }

// Packed integer minimums (SSE4.1): signed bytes, signed dwords, unsigned words.
void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}
void XEmitter::PMINUW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x383a, dest, arg);}
|
||||
@ -2084,7 +2086,7 @@ void XEmitter::VCVTTPD2DQ(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits,
|
||||
// Truncating float->int conversions; the last WriteAVXOp argument selects REX.W
// (64-bit destination GPR) when bits == 64.
void XEmitter::VCVTTSS2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF3, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
void XEmitter::VCVTTSD2SI(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(0, 0xF2, 0x2C, regOp1, arg, 0, bits == 64 ? 1 : 0); }
// VEX-encoded EXTRACTPS: extract lane `subreg` to memory or a GPR.
void XEmitter::VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A17, regOp1, arg, 1); Write8(subreg); }
// VEX-encoded INSERTPS, raw imm8 form.
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8(subreg); }
// VEX-encoded INSERTPS, structured form. imm8: count_s in bits 7:6, count_d in bits 5:4, zmask in bits 3:0.
void XEmitter::VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg, u8 zmask) { WriteAVXOp(0, 0x66, 0x3A21, regOp1, regOp2, arg, 1); Write8((srcsubreg << 6) | (dstsubreg << 4) | zmask); }
// VEX loads/moves; `bits` chooses 128/256-bit operation width.
void XEmitter::VLDDQU(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0xF2, sseLDDQU, regOp1, arg); }
void XEmitter::VMOVAPS(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x00, sseMOVAPfromRM, regOp1, arg); }
void XEmitter::VMOVAPD(int bits, X64Reg regOp1, OpArg arg) { WriteAVXOp(bits, 0x66, sseMOVAPfromRM, regOp1, arg); }
|
||||
|
@ -684,12 +684,14 @@ public:
|
||||
|
||||
// SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
void DPPD(X64Reg dest, OpArg src, u8 arg);

// These are probably useful for VFPU emulation.
// NOTE(review): raw-imm8 overloads; the matching #if for the #endif below is
// above this chunk — confirm whether this region is conditionally compiled.
void INSERTPS(X64Reg dest, OpArg src, u8 arg);
void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
#endif

// SSE4: Insert and extract for floats.
// Note: insert from memory or an XMM.
void INSERTPS(X64Reg dest, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
// Extract to memory or GPR.
void EXTRACTPS(OpArg dest, X64Reg arg, u8 subreg);

// SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
void HADDPS(X64Reg dest, OpArg src);
|
||||
|
||||
@ -1040,7 +1042,7 @@ public:
|
||||
// Can only extract from the low 128 bits.
void VEXTRACTPS(OpArg arg, X64Reg regOp1, u8 subreg);
// Can only insert into the low 128 bits, zeros upper bits. Inserts from XMM.
// First overload takes the raw imm8; second builds it from lane fields.
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 subreg);
void VINSERTPS(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 dstsubreg, u8 srcsubreg = 0, u8 zmask = 0);
// `bits` selects 128- or 256-bit operation width.
void VLDDQU(int bits, X64Reg regOp1, OpArg arg);
void VMOVAPS(int bits, X64Reg regOp1, OpArg arg);
void VMOVAPD(int bits, X64Reg regOp1, OpArg arg);
|
||||
|
@ -453,6 +453,189 @@ void X64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) {
|
||||
}
|
||||
}
|
||||
|
||||
// Transfers a guest value between native XMM registers while changing its lane
// layout: either extracting a single lane from a vector (lanes == 1), or
// gathering 2/4 single-lane values into one vector register (lanes == 2/4).
// Returns true if handled here; otherwise defers to the base implementation.
//
// Parameters:
//   nreg  - native register currently holding (part of) the value.
//   dest  - desired destination native register, or -1 for "same as nreg".
//   type  - target location type; only MIPSLoc::FREG (XMM) is handled here.
//   first - first guest IR register of the target value.
//   lanes - lane count of the target layout.
//   flags - mapping intent; only INIT/DIRTY are handled (not NOINIT).
bool X64IRRegCache::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
	bool allowed = !mr[nr[nreg].mipsReg].isStatic;
	// There's currently no support for non-XMMs here.
	allowed = allowed && type == MIPSLoc::FREG;

	if (dest == -1)
		dest = nreg;

	if (allowed && (flags == MIPSMap::INIT || flags == MIPSMap::DIRTY)) {
		// Alright, changing lane count (possibly including lane position.)
		// Count how many consecutive guest regs currently live in nreg.
		IRReg oldfirst = nr[nreg].mipsReg;
		int oldlanes = 0;
		while (mr[oldfirst + oldlanes].nReg == nreg)
			oldlanes++;
		_assert_msg_(oldlanes != 0, "TransferNativeReg encountered nreg mismatch");
		_assert_msg_(oldlanes != lanes, "TransferNativeReg transfer to same lanecount, misaligned?");

		if (lanes == 1) {
			// Extracting one lane from a vector.  Okay, start by storing if dirty,
			// since all lanes other than `first` fall back to memory below.
			if (nr[nreg].isDirty) {
				StoreNativeReg(nreg, oldfirst, oldlanes);
				nr[nreg].isDirty = false;
			}
			// Next, shuffle the desired element into first place.
			// Broadcast-style shuffle: every output lane gets mr[first].lane.
			u8 shuf = VFPU_SWIZZLE(mr[first].lane, mr[first].lane, mr[first].lane, mr[first].lane);
			if (mr[first].lane > 0 && cpu_info.bAVX && dest != nreg) {
				// AVX three-operand form: shuffle and move in one instruction.
				emit_->VSHUFPS(128, FromNativeReg(dest), FromNativeReg(nreg), ::R(FromNativeReg(nreg)), shuf);
			} else if (mr[first].lane <= 0 && dest != nreg) {
				// Already in lane 0 — a plain register move suffices.
				emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
			} else if (mr[first].lane > 0) {
				if (dest != nreg)
					emit_->MOVAPS(FromNativeReg(dest), ::R(FromNativeReg(nreg)));
				emit_->SHUFPS(FromNativeReg(dest), ::R(FromNativeReg(dest)), shuf);
			}

			// TODO: Consider moving the others to free regs if available? Likely will be wanted later.

			// Now update accounting.
			for (int i = 0; i < oldlanes; ++i) {
				auto &mreg = mr[oldfirst + i];
				if (oldfirst + i == first) {
					// The extracted lane now lives in lane 0 of dest.
					mreg.lane = 0;
					mreg.nReg = dest;
				} else {
					// No longer in a register.
					// Safe because we stored above if the reg was dirty.
					mreg.nReg = -1;
					mreg.lane = -1;
					mreg.loc = MIPSLoc::MEM;
				}
			}

			if (dest != nreg) {
				// nr[nreg].isDirty is false here (cleared above), so this
				// effectively marks dest clean and frees nreg.
				nr[dest].isDirty = nr[nreg].isDirty;
				nr[nreg].mipsReg = -1;
				nr[nreg].isDirty = false;
			}
			nr[dest].mipsReg = first;

			return true;
		}

		if ((lanes == 4 || lanes == 2) && oldlanes == 1) {
			// Gathering single-lane values into one vector register.
			// cur[i] is the XMM currently holding lane i, or INVALID_REG if in memory.
			X64Reg cur[4]{};
			int numInRegs = 0;
			int numDirty = 0;
			bool unavail = false;
			for (int i = 0; i < lanes; ++i) {
				// Bail if any source is already multi-lane, or (for i != 0) is
				// spill-locked at the current IR index.
				if (mr[first + i].lane != -1 || (i != 0 && mr[first + i].spillLockIRIndex >= irIndex_)) {
					unavail = true;
					break;
				}

				if (mr[first + i].nReg == -1) {
					cur[i] = INVALID_REG;
				} else {
					cur[i] = FromNativeReg(mr[first + i].nReg);
					numInRegs++;
					if (nr[cur[i]].isDirty)
						numDirty++;
				}
			}

			if (numInRegs == 0)
				unavail = true;

			bool handled = false;
			if (!unavail) {
				// If everything's currently in a reg, move it into this reg.
				// In the comments below, # marks a lane whose content is don't-care.
				if (lanes == 4) {
					// Ensure lane 0 is in a register; load it into dest if not.
					// NOTE(review): -128 bias suggests CTXREG points 128 bytes into
					// the context to maximize disp8 encodings — confirm against
					// GetMipsRegOffset's other callers.
					if (cur[0] == INVALID_REG) {
						cur[0] = FromNativeReg(dest);
						emit_->MOVSS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 0)));
						numInRegs++;
					}

					// A lot of other methods are possible, but seem to make things slower in practice.
					if (numInRegs == 4) {
						// y = yw##, x = xz##, x = xyzw.
						emit_->UNPCKLPS(cur[1], ::R(cur[3]));
						emit_->UNPCKLPS(cur[0], ::R(cur[2]));
						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
						handled = true;
					} else if (numInRegs == 2 && cur[1] != INVALID_REG) {
						// x = xy##, then load zw.
						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
						emit_->MOVHPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)));
						handled = true;
					} else if (cpu_info.bSSE4_1 && cur[1] != INVALID_REG && cur[2] != INVALID_REG) {
						// x = xz##, z=w###, y=yw##, x=xyzw.
						emit_->UNPCKLPS(cur[0], ::R(cur[2]));
						emit_->MOVSS(cur[2], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 3)));
						emit_->UNPCKLPS(cur[1], ::R(cur[2]));
						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
						handled = true;
					} else if (cpu_info.bSSE4_1 && numDirty != 0 && cur[1] != INVALID_REG && cur[3] != INVALID_REG) {
						// y = yw##, load z into x[1], x = xyzw.
						emit_->UNPCKLPS(cur[1], ::R(cur[3]));
						emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 2)), 1);
						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
						handled = true;
					} else if (cpu_info.bSSE4_1 && numDirty != 0 && cur[2] != INVALID_REG && cur[3] != INVALID_REG) {
						// load y to x[1], z = zw##, x = xyzw.
						emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
						emit_->UNPCKLPS(cur[2], ::R(cur[3]));
						emit_->MOVLHPS(cur[0], cur[2]);
						handled = true;
					} else if (cpu_info.bSSE4_1) {
						// Generic fallback: INSERTPS each remaining lane from reg or memory.
						// TODO: This might be worse than flushing depending?
						for (int i = 1; i < 4; ++i) {
							if (cur[i] == INVALID_REG)
								emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + i)), i);
							else
								emit_->INSERTPS(cur[0], ::R(cur[i]), i, 0);
						}
						handled = true;
					}
				} else if (lanes == 2) {
					if (cur[0] != INVALID_REG && cur[1] != INVALID_REG) {
						// Both lanes in registers: x = xy##.
						emit_->UNPCKLPS(cur[0], ::R(cur[1]));
						handled = true;
					} else if (cur[0] != INVALID_REG && cpu_info.bSSE4_1) {
						// Lane 1 in memory: insert it into x[1].
						emit_->INSERTPS(cur[0], MDisp(CTXREG, -128 + GetMipsRegOffset(first + 1)), 1);
						handled = true;
					}
				}
			}

			if (handled) {
				mr[first].lane = 0;
				for (int i = 0; i < lanes; ++i) {
					if (mr[first + i].nReg != -1) {
						// If this was dirty, the combined reg is now dirty.
						if (nr[mr[first + i].nReg].isDirty)
							nr[dest].isDirty = true;

						// Throw away the other register we're no longer using.
						if (i != 0)
							DiscardNativeReg(mr[first + i].nReg);
					}

					// And set it as using the new one.
					mr[first + i].lane = i;
					mr[first + i].loc = type;
					mr[first + i].nReg = dest;
				}

				// The combine sequences above build the result in cur[0];
				// move it into dest if it landed elsewhere.
				if (cur[0] != FromNativeReg(dest))
					emit_->MOVAPS(FromNativeReg(dest), ::R(cur[0]));

				if (dest != nreg) {
					nr[dest].mipsReg = first;
					nr[nreg].mipsReg = -1;
					nr[nreg].isDirty = false;
				}

				return true;
			}
		}
	}

	// Anything we couldn't handle in-register goes through the generic path.
	return IRNativeRegCacheBase::TransferNativeReg(nreg, dest, type, first, lanes, flags);
}
|
||||
|
||||
void X64IRRegCache::SetNativeRegValue(IRNativeReg nreg, uint32_t imm) {
|
||||
X64Reg r = FromNativeReg(nreg);
|
||||
_dbg_assert_(nreg >= 0 && nreg < NUM_X_REGS);
|
||||
|
@ -117,6 +117,7 @@ protected:
|
||||
void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
void StoreRegValue(IRReg mreg, uint32_t imm) override;
// Moves a value between XMMs while changing its lane layout (split/gather);
// falls back to the base implementation when it can't handle the case.
bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) override;
|
||||
|
||||
private:
|
||||
IRNativeReg GPRToNativeReg(Gen::X64Reg r) {
|
||||
|
Loading…
Reference in New Issue
Block a user