Optimize VDot and VecDo3 to avoid temporaries.

This commit is contained in:
Unknown W. Brackets 2013-02-16 10:19:05 -08:00
parent 0bd382c518
commit 1c4c5e718b

View File

@ -145,11 +145,12 @@ void Jit::ApplyPrefixD(const u8 *vregs, u32 prefix, VectorSize sz, bool onlyWrit
}
// Vector regs can overlap in all sorts of swizzled ways.
bool DestRegOverlaps(int dreg, int sn, u8 sregs[], int tn, u8 tregs[])
// This does allow a single overlap in sregs[i].
bool DestRegOverlaps(int dreg, int di, int sn, u8 sregs[], int tn, u8 tregs[])
{
for (int i = 0; i < sn; ++i)
{
if (sregs[i] == dreg)
if (sregs[i] == dreg && i != di)
return true;
}
for (int i = 0; i < tn; ++i)
@ -332,25 +333,34 @@ void Jit::Comp_VDot(u32 op) {
// TODO: applyprefixST here somehow (shuffle, etc...)
MOVSS(XMM0, fpr.V(sregs[0]));
MULSS(XMM0, fpr.V(tregs[0]));
int n = GetNumVectorElements(sz);
X64Reg tempxreg = XMM0;
if (!DestRegOverlaps(dregs[0], 0, n, sregs, n, tregs))
{
fpr.MapRegsV(dregs, V_Single, MAP_NOINIT);
tempxreg = fpr.VX(dregs[0]);
}
if (!fpr.V(sregs[0]).IsSimpleReg(tempxreg))
MOVSS(tempxreg, fpr.V(sregs[0]));
MULSS(tempxreg, fpr.V(tregs[0]));
for (int i = 1; i < n; i++)
{
// sum += s[i]*t[i];
MOVSS(XMM1, fpr.V(sregs[i]));
MULSS(XMM1, fpr.V(tregs[i]));
ADDSS(XMM0, R(XMM1));
ADDSS(tempxreg, R(XMM1));
}
fpr.ReleaseSpillLocks();
fpr.MapRegsV(dregs, V_Single, MAP_NOINIT);
if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg))
{
fpr.MapRegsV(dregs, V_Single, MAP_NOINIT);
MOVSS(fpr.V(dregs[0]), XMM0);
}
// TODO: applyprefixD here somehow (write mask etc..)
MOVSS(fpr.V(vd), XMM0);
fpr.ReleaseSpillLocks();
js.EatPrefix();
@ -416,7 +426,7 @@ void Jit::Comp_VecDo3(u32 op) {
X64Reg tempxregs[4];
for (int i = 0; i < n; ++i)
{
if (DestRegOverlaps(dregs[i], n, sregs, n, tregs))
if (DestRegOverlaps(dregs[i], i, n, sregs, n, tregs))
{
// On 32-bit we only have 6 xregs for mips regs, use XMM0/XMM1 if possible.
if (i < 2)
@ -430,19 +440,22 @@ void Jit::Comp_VecDo3(u32 op) {
}
else
{
fpr.MapRegV(dregs[i], MAP_NOINIT | MAP_DIRTY);
fpr.MapRegV(dregs[i], (dregs[i] == sregs[i] ? 0 : MAP_NOINIT) | MAP_DIRTY);
fpr.SpillLockV(dregs[i]);
tempxregs[i] = fpr.VX(dregs[i]);
}
}
for (int i = 0; i < n; ++i)
MOVSS(tempxregs[i], fpr.V(sregs[i]));
{
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(tempxregs[i], fpr.V(sregs[i]));
}
for (int i = 0; i < n; ++i)
(this->*xmmop)(tempxregs[i], fpr.V(tregs[i]));
for (int i = 0; i < n; ++i)
{
if (!fpr.R(dregs[i]).IsSimpleReg(tempxregs[i]))
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
}