Merge pull request #707 from unknownbrackets/jit-vfpu

Copy over temp fpu regs, enable VecDo3/VDot
This commit is contained in:
Henrik Rydgård 2013-02-16 15:35:34 -08:00
commit 1ffde31328
4 changed files with 129 additions and 24 deletions

View File

@ -113,7 +113,10 @@ void ApplyPrefixST(float *v, u32 data, VectorSize size)
// Prefix may say "z, z, z, z" but if this is a pair, we force to x.
// TODO: But some ops seem to use const 0 instead?
if (regnum >= n)
{
ERROR_LOG(CPU, "Invalid VFPU swizzle: %08x / %d", data, size);
regnum = 0;
}
v[i] = origV[regnum];
if (abs)
@ -1185,7 +1188,11 @@ namespace MIPSInt
ReadVector(s, sz, vs);
ApplySwizzleS(s, sz);
float scale = V(vt);
ApplySwizzleT(&scale, V_Single);
if (currentMIPS->vfpuCtrl[VFPU_CTRL_TPREFIX] != 0xE4)
{
WARN_LOG(CPU, "Broken T prefix used with VScl: %08x / %08x", currentMIPS->vfpuCtrl[VFPU_CTRL_TPREFIX], op);
ApplySwizzleT(&scale, V_Single);
}
int n = GetNumVectorElements(sz);
for (int i = 0; i < n; i++)
{

View File

@ -49,7 +49,7 @@ namespace MIPSComp
static const float one = 1.0f;
static const float minus_one = -1.0f;
static const float zero = -1.0f;
static const float zero = 0.0f;
const u32 GC_ALIGNED16( noSignMask[4] ) = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
const u32 GC_ALIGNED16( signBitLower[4] ) = {0x80000000, 0, 0, 0};
@ -144,6 +144,25 @@ void Jit::ApplyPrefixD(const u8 *vregs, u32 prefix, VectorSize sz, bool onlyWrit
}
}
// Source and destination vector regs may alias each other in arbitrary
// swizzled patterns, so check before writing a result straight into dreg.
//
// Returns true when dreg also appears in sregs at any index other than di,
// or anywhere in tregs. A match at sregs[di] itself is tolerated.
//   dreg         - destination register being considered
//   di           - index in sregs that is allowed to equal dreg
//   sn / sregs   - count and list of S source registers
//   tn / tregs   - count and list of T source registers
bool DestRegOverlaps(int dreg, int di, int sn, u8 sregs[], int tn, u8 tregs[])
{
	int idx = 0;
	while (idx < sn)
	{
		const bool sameReg = (sregs[idx] == dreg);
		if (sameReg && idx != di)
			return true;
		++idx;
	}

	idx = 0;
	while (idx < tn)
	{
		if (tregs[idx] == dreg)
			return true;
		++idx;
	}

	// Nothing else refers to dreg, so it can be written directly.
	return false;
}
static u32 GC_ALIGNED16(ssLoadStoreTemp[1]);
void Jit::Comp_SV(u32 op) {
@ -292,7 +311,7 @@ void Jit::Comp_SVQ(u32 op)
}
void Jit::Comp_VDot(u32 op) {
DISABLE;
CONDITIONAL_DISABLE;
// WARNING: No prefix support!
if (js.MayHavePrefix()) {
@ -314,32 +333,42 @@ void Jit::Comp_VDot(u32 op) {
// TODO: applyprefixST here somehow (shuffle, etc...)
MOVSS(XMM0, fpr.V(sregs[0]));
MULSS(XMM0, fpr.V(tregs[0]));
int n = GetNumVectorElements(sz);
X64Reg tempxreg = XMM0;
if (!DestRegOverlaps(dregs[0], 0, n, sregs, n, tregs))
{
fpr.MapRegsV(dregs, V_Single, MAP_NOINIT);
tempxreg = fpr.VX(dregs[0]);
}
MOVSS(tempxreg, M((void *) &zero));
MOVSS(XMM1, fpr.V(sregs[0]));
MULSS(XMM1, fpr.V(tregs[0]));
ADDSS(tempxreg, R(XMM1));
for (int i = 1; i < n; i++)
{
// sum += s[i]*t[i];
MOVSS(XMM1, fpr.V(sregs[i]));
MULSS(XMM1, fpr.V(tregs[i]));
ADDSS(XMM0, R(XMM1));
ADDSS(tempxreg, R(XMM1));
}
fpr.ReleaseSpillLocks();
fpr.MapRegsV(dregs, V_Single, MAP_NOINIT);
if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg))
{
fpr.MapRegsV(dregs, V_Single, MAP_NOINIT);
MOVSS(fpr.V(dregs[0]), XMM0);
}
// TODO: applyprefixD here somehow (write mask etc..)
MOVSS(fpr.V(vd), XMM0);
fpr.ReleaseSpillLocks();
js.EatPrefix();
}
void Jit::Comp_VecDo3(u32 op) {
DISABLE;
CONDITIONAL_DISABLE;
// WARNING: No prefix support!
if (js.MayHavePrefix())
@ -394,16 +423,42 @@ void Jit::Comp_VecDo3(u32 op) {
}
int n = GetNumVectorElements(sz);
// We need at least n temporaries...
if (n > 2)
fpr.Flush();
X64Reg tempxregs[4];
for (int i = 0; i < n; ++i)
{
if (DestRegOverlaps(dregs[i], i, n, sregs, n, tregs))
{
// On 32-bit we only have 6 xregs for mips regs, use XMM0/XMM1 if possible.
if (i < 2)
tempxregs[i] = (X64Reg) (XMM0 + i);
else
{
fpr.BindToRegister(TEMP0 + i, false, true);
fpr.SpillLock(TEMP0 + i);
tempxregs[i] = fpr.RX(TEMP0 + i);
}
}
else
{
fpr.MapRegV(dregs[i], (dregs[i] == sregs[i] ? 0 : MAP_NOINIT) | MAP_DIRTY);
fpr.SpillLockV(dregs[i]);
tempxregs[i] = fpr.VX(dregs[i]);
}
}
for (int i = 0; i < n; ++i)
MOVSS((X64Reg) (XMM0 + i), fpr.V(sregs[i]));
{
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(tempxregs[i], fpr.V(sregs[i]));
}
for (int i = 0; i < n; ++i)
(this->*xmmop)((X64Reg) (XMM0 + i), fpr.V(tregs[i]));
(this->*xmmop)(tempxregs[i], fpr.V(tregs[i]));
for (int i = 0; i < n; ++i)
MOVSS(fpr.V(dregs[i]), (X64Reg) (XMM0 + i));
{
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
}
fpr.ReleaseSpillLocks();

View File

@ -82,6 +82,8 @@ void FPURegCache::MapRegsV(const u8 *v, VectorSize sz, int flags) {
void FPURegCache::ReleaseSpillLocks() {
for (int i = 0; i < NUM_MIPS_FPRS; i++)
regs[i].locked = false;
for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i)
DiscardR(i);
}
void FPURegCache::BindToRegister(const int i, bool doLoad, bool makeDirty) {
@ -97,7 +99,9 @@ void FPURegCache::BindToRegister(const int i, bool doLoad, bool makeDirty) {
if (!regs[i].location.IsImm() && (regs[i].location.offset & 0x3)) {
PanicAlert("WARNING - misaligned fp register location %i", i);
}
emit->MOVSS(xr, regs[i].location);
if (i < TEMP0) {
emit->MOVSS(xr, regs[i].location);
}
}
regs[i].location = newloc;
regs[i].away = true;
@ -124,8 +128,27 @@ void FPURegCache::StoreFromRegister(int i) {
}
}
void FPURegCache::DiscardR(int i) {
_assert_msg_(DYNA_REC, !regs[i].location.IsImm(), "FPU can't handle imm yet.");
if (regs[i].away) {
X64Reg xr = regs[i].location.GetSimpleReg();
_assert_msg_(DYNA_REC, xr < NUM_X_FPREGS, "DiscardR: MipsReg had bad X64Reg");
// Note that we DO NOT write it back here. That's the whole point of Discard.
xregs[xr].dirty = false;
xregs[xr].mipsReg = -1;
regs[i].location = GetDefaultLocation(i);
regs[i].away = false;
} else {
// _assert_msg_(DYNA_REC,0,"already stored");
}
}
// True when the reg mapped to x64 register xr has an index of TEMP0 or higher.
bool FPURegCache::IsTemp(X64Reg xr) {
	const bool belowTemps = xregs[xr].mipsReg < TEMP0;
	return !belowTemps;
}
void FPURegCache::Flush() {
for (int i = 0; i < NUM_MIPS_FPRS; i++) {
for (int i = 0; i < TEMP0; i++) {
if (regs[i].locked) {
PanicAlert("Somebody forgot to unlock MIPS reg %i.", i);
}
@ -141,6 +164,9 @@ void FPURegCache::Flush() {
}
}
}
for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) {
DiscardR(i);
}
}
OpArg FPURegCache::GetDefaultLocation(int reg) const {
@ -203,7 +229,7 @@ X64Reg FPURegCache::GetFreeXReg() {
return (X64Reg) -1;
}
void FPURegCache::FlushR(X64Reg reg) {
void FPURegCache::FlushX(X64Reg reg) {
if (reg >= NUM_X_FPREGS)
PanicAlert("Flushing non existent reg");
if (xregs[reg].mipsReg != -1) {

View File

@ -26,9 +26,18 @@ using namespace Gen;
// GPRs are numbered 0 to 31
// VFPU regs are numbered 32 to 160.
// VFPU regs are numbered 32 to 159.
// Then we have some temp regs for VFPU handling from 160 to 163 (NUM_TEMPS of them).
#define NUM_MIPS_FPRS (32 + 128)
// Register index layout for the FPU reg cache. The temp regs start at
// 32 + 128, and NUM_MIPS_FPRS is the total count including the temps.
enum {
	NUM_TEMPS = 4,
	// First temp index; the following enumerators count up from here.
	TEMP0 = 32 + 128,
	TEMP1,
	TEMP2,
	TEMP3,
	// NOTE(review): TEMP4 == TEMP0 + NUM_TEMPS == NUM_MIPS_FPRS, i.e. one past
	// the last valid temp index while NUM_TEMPS is 4 - confirm nothing uses it.
	TEMP4,
	NUM_MIPS_FPRS = TEMP0 + NUM_TEMPS,
};
#ifdef _M_X64
#define NUM_X_FPREGS 16
@ -68,6 +77,11 @@ public:
StoreFromRegister(preg + 32);
}
OpArg GetDefaultLocation(int reg) const;
void DiscardR(int freg);
// Discards a VFPU reg by forwarding to DiscardR with the index offset by 32.
void DiscardV(int vreg) {
	const int fregIndex = 32 + vreg;
	DiscardR(fregIndex);
}
bool IsTemp(X64Reg xreg);
void SetEmitter(XEmitter *emitter) {emit = emitter;}
@ -100,6 +114,9 @@ public:
void MapRegV(int vreg, int flags);
void MapRegsV(int vec, VectorSize vsz, int flags);
void MapRegsV(const u8 *v, VectorSize vsz, int flags);
// Spill-locks a single VFPU reg by forwarding to SpillLock with the index offset by 32.
void SpillLockV(int vreg) {
	const int fregIndex = 32 + vreg;
	SpillLock(fregIndex);
}
void SpillLockV(const u8 *v, VectorSize vsz);
void SpillLockV(int vec, VectorSize vsz);
@ -107,7 +124,7 @@ public:
private:
X64Reg GetFreeXReg();
void FlushR(X64Reg reg);
void FlushX(X64Reg reg);
const int *GetAllocationOrder(int &count);
MIPSCachedFPReg regs[NUM_MIPS_FPRS];