Minor cleanup in vtfm. Re-enable combining of paired vrot instructions. Optimize vfad/vavg with DPPS when SSE4.1 is available.

Also fixes a bug in the DPPS emitter: it now passes 1 as the final argument to WriteSSE41Op before writing the mask byte.
This commit is contained in:
Henrik Rydgard 2014-12-03 22:42:33 +01:00
parent 9d97eb5b12
commit 5290ffd929
2 changed files with 51 additions and 35 deletions

View File

@ -1743,7 +1743,7 @@ void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int ex
// One-line emitter wrappers: each forwards a 0x66 prefix, an opcode, the destination register
// and the source operand to WriteSSSE3Op / WriteSSE41Op.
void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);}
void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);}
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
// NOTE(review): the two DPPS definitions below appear to be the before/after pair of this
// commit's emitter fix (this view shows no +/- markers); only one can exist in the real file.
// Before: WriteSSE41Op is called with four arguments, then the mask byte is written.
void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg); Write8(mask);}
// After: 1 is passed as a fifth argument, then the mask byte is written.
// Presumably the fifth argument tells WriteSSE41Op that one more byte (the mask) follows the
// operand; its parameter name is cut off in the hunk header, so confirm against the definition.
void XEmitter::DPPS(X64Reg dest, OpArg arg, u8 mask) {WriteSSE41Op(0x66, 0x3A40, dest, arg, 1); Write8(mask);}
void XEmitter::PMINSB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3838, dest, arg);}
void XEmitter::PMINSD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3839, dest, arg);}

View File

@ -2754,11 +2754,9 @@ void Jit::Comp_Vtfm(MIPSOpcode op) {
// The T matrix we will address individually.
GetVectorRegs(dcol, sz, vd);
GetMatrixRows(vs, msz, scols);
memset(tregs, 255, sizeof(tregs));
GetVectorRegs(tregs, sz, vt);
for (int i = 0; i < ARRAY_SIZE(tregs); i++) {
if (tregs[i] != 255)
fpr.StoreFromRegisterV(tregs[i]);
for (int i = 0; i < n; i++) {
fpr.StoreFromRegisterV(tregs[i]);
}
u8 scol[4][4];
@ -2767,7 +2765,6 @@ void Jit::Comp_Vtfm(MIPSOpcode op) {
for (int i = 0; i < n; i++) {
GetVectorRegs(scol[i], sz, scols[i]);
fpr.MapRegsVS(scol[i], sz, 0);
fpr.SpillLockV(scols[i], sz);
}
// Now, work our way through the matrix, loading things as we go.
@ -2792,7 +2789,6 @@ void Jit::Comp_Vtfm(MIPSOpcode op) {
return;
}
u8 sregs[16], dregs[4], tregs[4];
GetMatrixRegs(sregs, msz, _VS);
GetVectorRegs(tregs, sz, _VT);
@ -2985,32 +2981,52 @@ void Jit::Comp_Vhoriz(MIPSOpcode op) {
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixD(dregs, V_Single, _VD);
if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) {
switch (sz) {
case V_Pair:
MOVAPS(XMM0, fpr.VS(sregs));
MOVAPS(XMM1, R(XMM0));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3,2,1,1));
ADDPS(XMM0, R(XMM1));
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
case V_Triple:
MOVAPS(XMM0, fpr.VS(sregs));
MOVAPS(XMM1, R(XMM0));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3,2,1,1));
ADDPS(XMM0, R(XMM1));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3,2,1,2));
ADDPS(XMM0, R(XMM1));
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
case V_Quad:
MOVAPS(XMM0, fpr.VS(sregs));
MOVHLPS(XMM1, XMM0);
ADDPS(XMM0, R(XMM1));
MOVAPS(XMM1, R(XMM0));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(1,1,1,1));
ADDPS(XMM0, R(XMM1));
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
if (cpu_info.bSSE4_1) {
switch (sz) {
case V_Pair:
MOVAPS(XMM0, fpr.VS(sregs));
DPPS(XMM0, M(&oneOneOneOne), 0x31);
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
case V_Triple:
MOVAPS(XMM0, fpr.VS(sregs));
DPPS(XMM0, M(&oneOneOneOne), 0x71);
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
case V_Quad:
MOVAPS(XMM0, fpr.VS(sregs));
DPPS(XMM0, M(&oneOneOneOne), 0xF1);
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
}
} else {
switch (sz) {
case V_Pair:
MOVAPS(XMM0, fpr.VS(sregs));
MOVAPS(XMM1, R(XMM0));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1));
ADDPS(XMM0, R(XMM1));
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
case V_Triple:
MOVAPS(XMM0, fpr.VS(sregs));
MOVAPS(XMM1, R(XMM0));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1));
ADDPS(XMM0, R(XMM1));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 2));
ADDPS(XMM0, R(XMM1));
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
case V_Quad:
MOVAPS(XMM0, fpr.VS(sregs));
MOVHLPS(XMM1, XMM0);
ADDPS(XMM0, R(XMM1));
MOVAPS(XMM1, R(XMM0));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(1, 1, 1, 1));
ADDPS(XMM0, R(XMM1));
MOVAPS(fpr.VSX(dregs), R(XMM0));
break;
}
}
if (((op >> 16) & 31) == 7) { // vavg
MULSS(fpr.VSX(dregs), M(&vavg_table[n]));
@ -3150,11 +3166,11 @@ void Jit::Comp_VRot(MIPSOpcode op) {
u32 nextOp = Memory::Read_Opcode_JIT(js.compilerPC + 4).encoding;
int vd2 = -1;
int imm2 = -1;
if (false && (nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {
if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {
// Pair of vrot. Let's join them.
vd2 = MIPS_GET_VD(nextOp);
imm2 = (nextOp >> 16) & 0x1f;
NOTICE_LOG(JIT, "Joint VFPU at %08x", js.blockStart);
// NOTICE_LOG(JIT, "Joint VFPU at %08x", js.blockStart);
}
u8 sreg;