mirror of
https://github.com/libretro/ppsspp.git
synced 2025-02-24 10:53:11 +00:00
x86 Jit SIMD: Generate somewhat shorter code for handling transposed matrices in vmmul.
TODO: Build into regalloc instead, with a MapMatrix function?
This commit is contained in:
parent
aa31bcc6ae
commit
6a7e5d1cc2
@ -70,7 +70,6 @@ const u32 MEMORY_ALIGNED16( lowZeroes[4] ) = {0x00000000, 0xFFFFFFFF, 0xFFFFFFFF
|
||||
const u32 MEMORY_ALIGNED16( fourinfnan[4] ) = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
|
||||
const float MEMORY_ALIGNED16( identityMatrix[4][4]) = { { 1.0f, 0, 0, 0 }, { 0, 1.0f, 0, 0 }, { 0, 0, 1.0f, 0 }, { 0, 0, 0, 1.0f} };
|
||||
|
||||
|
||||
void Jit::Comp_VPFX(MIPSOpcode op)
|
||||
{
|
||||
CONDITIONAL_DISABLE;
|
||||
@ -569,7 +568,6 @@ void Jit::Comp_VIdt(MIPSOpcode op) {
|
||||
fpr.ReleaseSpillLocks();
|
||||
}
|
||||
|
||||
|
||||
void Jit::Comp_VDot(MIPSOpcode op) {
|
||||
CONDITIONAL_DISABLE;
|
||||
|
||||
@ -2637,15 +2635,18 @@ void Jit::Comp_Vmmul(MIPSOpcode op) {
|
||||
int vd = _VD;
|
||||
int vt = _VT;
|
||||
|
||||
// Some games transpose too much..
|
||||
// TODO: This doesn't work! It should!
|
||||
if (false && (vd & 0x20) && !(vs & 0x20) && (vt & 0x20) && sz == M_4x4) {
|
||||
// Transpose both operands and swap the order, according to (AB)' = (B'A')
|
||||
// We want our contiguous columns as much as possible!
|
||||
bool transposeDest = false;
|
||||
bool transposeS = false;
|
||||
|
||||
if ((vd & 0x20) && sz == M_4x4) {
|
||||
vd ^= 0x20;
|
||||
transposeDest = true;
|
||||
}
|
||||
|
||||
// Our algorithm needs a transposed S (which is the usual).
|
||||
if (!(vs & 0x20) && sz == M_4x4) {
|
||||
vs ^= 0x20;
|
||||
vt ^= 0x20;
|
||||
//vd ^= 0x20;
|
||||
std::swap(vs, vt);
|
||||
transposeS = true;
|
||||
}
|
||||
|
||||
// The T matrix we will address individually.
|
||||
@ -2667,6 +2668,35 @@ void Jit::Comp_Vmmul(MIPSOpcode op) {
|
||||
fpr.SpillLockV(scols[i], vsz);
|
||||
}
|
||||
|
||||
// Shorter than manually stuffing the registers. But it feels like there's room for optimization here...
|
||||
// Emits code that transposes a 4x4 float matrix in place, where each col[i]
// names one four-lane SIMD register already mapped by fpr. XMM0 is used as
// the only scratch register and is clobbered.
// Notation below: A, B, C, D are the incoming contents of col[0..3], with
// lanes written low-to-high, e.g. A = [a0 a1 a2 a3].
// NOTE(review): assumes MOVAPS/UNPCKLPS/UNPCKHPS emit the x86 instructions of
// the same name, and that fpr.VSX()/fpr.VS() give the destination register and
// operand form of the same mapped register - confirm against the regcache.
auto transposeInPlace = [=](u8 col[4][4]) {
|
||||
// Stage 1: interleave A with C.
//   col[0] = [a0 c0 a1 c1], XMM0 = [a2 c2 a3 c3]
MOVAPS(XMM0, fpr.VS(col[0]));
|
||||
UNPCKLPS(fpr.VSX(col[0]), fpr.VS(col[2]));
|
||||
UNPCKHPS(XMM0, fpr.VS(col[2]));
|
||||
|
||||
// Stage 2: interleave B with D. C has been fully consumed above, so col[2]
// is free to hold the copy of B.
//   col[1] = [b0 d0 b1 d1], col[2] = [b2 d2 b3 d3]
MOVAPS(fpr.VSX(col[2]), fpr.VS(col[1]));
|
||||
UNPCKLPS(fpr.VSX(col[1]), fpr.VS(col[3]));
|
||||
UNPCKHPS(fpr.VSX(col[2]), fpr.VS(col[3]));
|
||||
|
||||
// Stage 3: interleave the two low halves. D has been consumed, so col[3]
// takes the copy of col[0].
//   col[0] = [a0 b0 c0 d0] (final row 0), col[3] = [a1 b1 c1 d1] (row 1)
MOVAPS(fpr.VSX(col[3]), fpr.VS(col[0]));
|
||||
UNPCKLPS(fpr.VSX(col[0]), fpr.VS(col[1]));
|
||||
UNPCKHPS(fpr.VSX(col[3]), fpr.VS(col[1]));
|
||||
|
||||
// Stage 4: interleave the two high halves.
//   col[1] = [a2 b2 c2 d2] (row 2), XMM0 = [a3 b3 c3 d3] (row 3)
MOVAPS(fpr.VSX(col[1]), R(XMM0));
|
||||
UNPCKLPS(fpr.VSX(col[1]), fpr.VS(col[2]));
|
||||
UNPCKHPS(XMM0, fpr.VS(col[2]));
|
||||
|
||||
// Stage 5: rows 1..3 sit in the wrong registers; move each to its final slot.
// The order matters: col[2] is overwritten first, then col[1], then col[3].
//   col[2] <- row 2, col[1] <- row 1, col[3] <- row 3
MOVAPS(fpr.VSX(col[2]), fpr.VS(col[1]));
|
||||
MOVAPS(fpr.VSX(col[1]), fpr.VS(col[3]));
|
||||
MOVAPS(fpr.VSX(col[3]), R(XMM0));
|
||||
};
|
||||
|
||||
// Some games pass in S as an E matrix (transposed). Let's just transpose the data before we do the multiplication instead.
|
||||
// This is shorter than trying to combine a discontinuous matrix with lots of shufps.
|
||||
if (transposeS) {
|
||||
transposeInPlace(scol);
|
||||
}
|
||||
|
||||
// Now, work our way through the matrix, loading things as we go.
|
||||
// TODO: With more temp registers, can generate much more efficient code.
|
||||
for (int i = 0; i < n; i++) {
|
||||
@ -2693,6 +2723,19 @@ void Jit::Comp_Vmmul(MIPSOpcode op) {
|
||||
#endif
|
||||
MOVAPS(fpr.VS(dcol), XMM1);
|
||||
}
|
||||
|
||||
#ifndef _M_X64
|
||||
fpr.ReleaseSpillLocks();
|
||||
#endif
|
||||
if (transposeDest) {
|
||||
u8 dcol[4][4];
|
||||
for (int i = 0; i < n; i++) {
|
||||
GetVectorRegs(dcol[i], vsz, dcols[i]);
|
||||
fpr.MapRegsVS(dcol[i], vsz, MAP_DIRTY);
|
||||
fpr.SpillLockV(dcols[i], vsz);
|
||||
}
|
||||
transposeInPlace(dcol);
|
||||
}
|
||||
fpr.ReleaseSpillLocks();
|
||||
return;
|
||||
}
|
||||
@ -3257,7 +3300,7 @@ void Jit::Comp_VRot(MIPSOpcode op) {
|
||||
int vd2 = -1;
|
||||
int imm2 = -1;
|
||||
if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {
|
||||
// Pair of vrot. Let's join them.
|
||||
// Pair of vrot with the same angle argument. Let's join them (can share sin/cos results).
|
||||
vd2 = MIPS_GET_VD(nextOp);
|
||||
imm2 = (nextOp >> 16) & 0x1f;
|
||||
// NOTICE_LOG(JIT, "Joint VFPU at %08x", js.blockStart);
|
||||
|
Loading…
x
Reference in New Issue
Block a user