x86 Jit: SIMD-ify vdot

This commit is contained in:
Henrik Rydgard 2014-11-26 23:47:18 +01:00
parent bbd0afd148
commit 5033babb10
3 changed files with 64 additions and 2 deletions

View File

@@ -1740,6 +1740,7 @@ void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int ex
// SSSE3: shuffle bytes of dest by the per-byte indices in arg (high bit set zeroes the byte).
void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);}
// SSE4.1: logical compare — sets ZF/CF from dest AND / ANDN arg; writes no register.
void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);}
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
// SSE4.1 DPPS (66 0F 3A 40 /r ib): packed single-precision dot product.
// imm8 high nibble selects which lanes are multiplied and summed; low nibble
// selects which dest lanes receive the sum (unselected lanes are zeroed).
// NOTE(review): unlike the 0x38xx ops around it this encodes the 0x3A escape in
// the high byte of op — assumes WriteSSE41Op emits that byte generically; confirm.
void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg); Write8(mask);}
void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}

View File

@@ -650,6 +650,9 @@ public:
void EXTRACTPS(OpArg dest, X64Reg src, u8 arg);
#endif
// SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
// (imm8 high nibble = lanes multiplied and summed; low nibble = dest lanes that
// receive the sum, others zeroed. See DPPS in the Intel SDM.)
void DPPS(X64Reg dest, OpArg src, u8 arg);
void UNPCKLPS(X64Reg dest, OpArg src);
void UNPCKHPS(X64Reg dest, OpArg src);
void UNPCKLPD(X64Reg dest, OpArg src);

View File

@@ -486,7 +486,6 @@ void Jit::Comp_VVectorInit(MIPSOpcode op) {
u8 dregs[4];
GetVectorRegsPrefixD(dregs, sz, _VD);
// vzero only for now
// Try to map the whole destination vector into one SIMD register (no init
// needed, we overwrite it entirely).
if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
if (type == 6) {
// XORPS reg,reg = all zeroes — presumably type 6 is vzero; confirm against the opcode table.
XORPS(fpr.VSX(dregs[0]), fpr.VS(dregs[0]));
@@ -572,13 +571,72 @@ void Jit::Comp_VDot(MIPSOpcode op) {
VectorSize sz = GetVecSize(op);
// NOTE(review): n is not used in the SIMD path below — presumably used further
// down in the non-SIMD fallback (outside this view).
int n = GetNumVectorElements(sz);
// TODO: Force read one of them into regs? probably not.
u8 sregs[4], tregs[4], dregs[1];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixT(tregs, sz, _VT);
// vdot produces a single scalar result regardless of input size.
GetVectorRegsPrefixD(dregs, V_Single, _VD);
// With SSE2, these won't really give any performance benefit on their own, but may reduce
// conversion costs from/to SIMD form. However, the SSE4.1 DPPS may be worth it.
// Benchmarking will have to decide whether to enable this on < SSE4.1. Also a HADDPS version
// for SSE3 could be written.
if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz, tregs, sz)) {
switch (sz) {
case V_Pair:
if (cpu_info.bSSE4_1) {
// DPPS imm 0x31: multiply/sum lanes 0-1, write the sum to lane 0 only.
MOVAPD(XMM0, fpr.VS(sregs[0]));
DPPS(XMM0, fpr.VS(tregs[0]), 0x31);
MOVAPD(fpr.VSX(dregs[0]), R(XMM0));
} else {
// SSE2 fallback. After MULPS, XMM0 = {p0, p1, ...} (pi = s[i]*t[i]).
MOVAPD(XMM0, fpr.VS(sregs[0]));
MULPS(XMM0, fpr.VS(tregs[0]));
MOVAPD(R(XMM1), XMM0);
// Broadcast p1: XMM1 = {p1, p1, p1, p1}.
SHUFPS(XMM1, R(XMM0), _MM_SHUFFLE(1, 1, 1, 1));
// Lane 0 of XMM1 = p0 + p1 (upper lanes hold garbage sums).
ADDPS(XMM1, R(XMM0));
// NOTE(review): stores via the fpr.VS() OpArg overload while the SSE4.1
// paths use fpr.VSX()/R() — presumably equivalent; confirm, and note the
// full 128-bit store assumes only lane 0 of a V_Single mapping matters.
MOVAPD(fpr.VS(dregs[0]), XMM1);
}
break;
case V_Triple:
if (cpu_info.bSSE4_1) {
// DPPS imm 0x71: multiply/sum lanes 0-2, write the sum to lane 0 only.
MOVAPD(XMM0, fpr.VS(sregs[0]));
DPPS(XMM0, fpr.VS(tregs[0]), 0x71);
MOVAPD(fpr.VSX(dregs[0]), R(XMM0));
} else {
// SSE2 fallback. After MULPS, XMM0 = {p0, p1, p2, p3}.
MOVAPD(XMM0, fpr.VS(sregs[0]));
MULPS(XMM0, fpr.VS(tregs[0]));
MOVAPD(R(XMM1), XMM0);
// XMM1 = {p1, p1, p2, p3}.
SHUFPS(XMM1, R(XMM0), _MM_SHUFFLE(3, 2, 1, 1));
// XMM1 lane 0 = p0 + p1.
ADDSS(XMM1, R(XMM0));
// XMM0 = {p2, p2, p2, p3}.
SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(3, 2, 2, 2));
// XMM1 lane 0 = p0 + p1 + p2 — the triple dot product.
ADDSS(XMM1, R(XMM0));
MOVAPD(fpr.VS(dregs[0]), XMM1);
}
break;
case V_Quad:
if (cpu_info.bSSE4_1) {
// DPPS imm 0xF1: multiply/sum all four lanes, write the sum to lane 0 only.
MOVAPD(XMM0, fpr.VS(sregs[0]));
DPPS(XMM0, fpr.VS(tregs[0]), 0xF1);
MOVAPD(fpr.VSX(dregs[0]), R(XMM0));
} else {
// SSE2 fallback. After MULPS, XMM0 = {p0, p1, p2, p3}.
MOVAPD(XMM0, fpr.VS(sregs[0]));
MOVAPD(XMM1, fpr.VS(tregs[0]));
MULPS(XMM0, R(XMM1));
MOVAPD(XMM1, R(XMM0));
// Swap within pairs: XMM1 = {p1, p0, p3, p2}; ADDPS gives
// XMM0 = {p0+p1, p0+p1, p2+p3, p2+p3}.
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(2, 3, 0, 1));
ADDPS(XMM0, R(XMM1));
MOVAPD(XMM1, R(XMM0));
// Reverse lanes so XMM1 lane 0 = p2+p3; ADDSS then puts the full
// four-element sum in XMM0 lane 0.
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 1, 2, 3));
ADDSS(XMM0, R(XMM1));
MOVAPD(fpr.VSX(dregs[0]), R(XMM0));
}
// NOTE(review): no break after the last case (harmless) and no default: —
// if sz could be V_Single here, nothing is emitted yet the function still
// returns as if the dest were written. Confirm V_Single can't reach this
// path, or add a default that bails to the interpreter.
}
ApplyPrefixD(dregs, V_Single);
fpr.ReleaseSpillLocks();
return;
}
// Flush SIMD.
fpr.SimpleRegsV(sregs, sz, 0);
fpr.SimpleRegsV(tregs, sz, 0);