Merge pull request #2093 from xsacha/armjit-vfpu

Armjit: Improve ApplyPrefixD. Add VABD to emitter.
This commit is contained in:
Henrik Rydgård 2013-06-05 10:30:02 -07:00
commit 7574ebbe58
4 changed files with 35 additions and 35 deletions

View File

@ -850,6 +850,21 @@ ARMReg ARMXEmitter::SubBase(ARMReg Reg)
}
// NEON Specific
// Emits a NEON VABD (absolute difference) instruction using the
// floating-point opcode (0xD in bits 11:8).
//
// Size - written verbatim into bits 21:20 of the instruction word.
//        NOTE(review): the floating-point form needs bit 21 set and bit 20
//        clear for F32 - confirm callers pass an IntegerSize value that
//        produces this.
// Vd   - destination; must be a D or Q register (asserted).
// Vn   - first operand.
// Vm   - second operand.
//        NOTE(review): Vn and Vm are not validated; they are presumably
//        expected to be of the same register class as Vd.
void ARMXEmitter::VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VABD(float)");
	_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VABD(float) when CPU doesn't support it");

	// Must be evaluated before SubBase() replaces Vd with its encoded index.
	bool register_quad = Vd >= Q0;

	// Gets encoded as a double register
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// Each 5-bit register index is split into a low nibble plus one high bit:
	//   D:Vd -> bit 22 + bits 15:12
	//   N:Vn -> bit 7  + bits 19:16
	//   M:Vm -> bit 5  + bits 3:0
	// Bit 6 is Q (quad-register operation). The M bit therefore has to be
	// shifted left by 1 (bit 4 -> bit 5); shifting by 2 would land on Q.
	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16)
		| ((Vd & 0xF) << 12) | (0xD << 8) | ((Vn & 0x10) << 3) | (register_quad << 6)
		| ((Vm & 0x10) << 1) | (Vm & 0xF));
}
void ARMXEmitter::VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VADD(integer)");
@ -864,7 +879,7 @@ void ARMXEmitter::VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
Write32((0xF2 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
| ((Vm & 0x10) << 2) | (Vm & 0xF));
| ((Vm & 0x10) << 2) | (Vm & 0xF));
}
void ARMXEmitter::VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
@ -879,7 +894,7 @@ void ARMXEmitter::VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (1 << 6) \
| ((Vm & 0x10) << 2) | (Vm & 0xF));
| ((Vm & 0x10) << 2) | (Vm & 0xF));
}
// VFP Specific

View File

@ -530,6 +530,7 @@ public:
// Subtracts the base from the register to give us the real one
ARMReg SubBase(ARMReg Reg);
// NEON Only
// Absolute difference. The implementation asserts NEON support and that Vd is
// a D or Q register, and its assert messages describe it as the float form.
void VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
// Integer add (per the implementation's assert message).
void VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
// Subtract. NOTE(review): the implementation appears to always set bit 6 of
// the encoding rather than deriving it from the register class - confirm.
void VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

View File

@ -17,6 +17,7 @@
#include "../../MemMap.h"
#include "../MIPSAnalyst.h"
#include "Common/CPUDetect.h"
#include "Core/Config.h"
#include "Core/Reporting.h"
@ -166,39 +167,22 @@ namespace MIPSComp
int sat = (js.prefixD >> (i * 2)) & 3;
if (sat == 1) {
// clamped = fabs(x) - fabs(x-0.5f) + 0.5f; // [ 0, 1]
fpr.MapRegV(vregs[i], MAP_DIRTY);
// ARGH this is a pain - no MIN/MAX in non-NEON VFP!
// NEON does have min/max though so this should only be a fallback.
MOVI2F(S0, 0.0, R0);
MOVI2F(S1, 1.0, R0);
VCMP(fpr.V(vregs[i]), S1);
VMRS_APSR();
SetCC(CC_GE);
VMOV(fpr.V(vregs[i]), S1);
FixupBranch skip = B();
SetCC(CC_AL);
VCMP(fpr.V(vregs[i]), S0);
VMRS_APSR();
SetCC(CC_LE);
VMOV(fpr.V(vregs[i]), S0);
SetCC(CC_AL);
SetJumpTarget(skip);
MOVI2F(S0, 0.5, R0);
VABS(S1, fpr.V(vregs[i])); // S1 = fabs(x)
VSUB(S2, fpr.V(vregs[i]), S0); // S2 = fabs(x-0.5f) {VABD}
VABS(S2, S2);
VSUB(fpr.V(vregs[i]), S1, S2); // v[i] = S1 - S2 + 0.5f
VADD(fpr.V(vregs[i]), fpr.V(vregs[i]), S0);
} else if (sat == 3) {
// clamped = fabs(x) - fabs(x-1.0f); // [-1, 1]
fpr.MapRegV(vregs[i], MAP_DIRTY);
MOVI2F(S0, -1.0, R0);
MOVI2F(S1, 1.0, R0);
VCMP(fpr.V(vregs[i]), S1);
VMRS_APSR();
SetCC(CC_GE);
VMOV(fpr.V(vregs[i]), S1);
FixupBranch skip = B();
SetCC(CC_AL);
VCMP(fpr.V(vregs[i]), S0);
VMRS_APSR();
SetCC(CC_LE);
VMOV(fpr.V(vregs[i]), S0);
SetCC(CC_AL);
SetJumpTarget(skip);
MOVI2F(S0, 1.0, R0);
VABS(S1, fpr.V(vregs[i])); // S1 = fabs(x)
VSUB(S2, fpr.V(vregs[i]), S0); // S2 = fabs(x-1.0f) {VABD}
VABS(S2, S2);
VSUB(fpr.V(vregs[i]), S1, S2); // v[i] = S1 - S2
}
}
}

View File

@ -115,9 +115,9 @@ return floor(x+.5);
void ApplyPrefixST(float *v, u32 data, VectorSize size)
{
// Possible optimization shortcut:
if (data == 0xe4)
return;
// Possible optimization shortcut:
if (data == 0xe4)
return;
int n = GetNumVectorElements(size);
float origV[4];