Merge pull request #4656 from hrydgard/vfpu-reg-order

Reorder VFPU registers in memory so that we can flush and reload them with vector moves.
This commit is contained in:
Henrik Rydgård 2013-11-28 04:28:13 -08:00
commit d4b0df8d30
12 changed files with 158 additions and 61 deletions

View File

@ -520,7 +520,7 @@ public:
virtual void DoState(PointerWrap &p)
{
auto s = p.Section("Thread", 1, 3);
auto s = p.Section("Thread", 1, 4);
if (!s)
return;
@ -530,7 +530,21 @@ public:
p.Do(isProcessingCallbacks);
p.Do(currentMipscallId);
p.Do(currentCallbackId);
// TODO: How do I "version" adding a DoState method to ThreadContext?
p.Do(context);
if (s <= 3)
{
// We must have been loading an old state if we're here.
// Reorder VFPU data to new order.
float temp[128];
memcpy(temp, context.v, 128 * sizeof(float));
for (int i = 0; i < 128; i++) {
context.v[voffset[i]] = temp[i];
}
}
if (s <= 2)
{
context.other[4] = context.other[5];

View File

@ -202,13 +202,6 @@ namespace MIPSComp
SetCC(CC_GT);
VMOV(fpr.V(vregs[i]), S1);
SetCC(CC_AL);
/*
VABS(S1, fpr.V(vregs[i])); // S1 = fabs(x)
VSUB(fpr.V(vregs[i]), fpr.V(vregs[i]), S0); // S2 = fabs(x-0.5f) {VABD}
VABS(fpr.V(vregs[i]), fpr.V(vregs[i]));
VSUB(fpr.V(vregs[i]), S1, fpr.V(vregs[i])); // v[i] = S1 - S2 + 0.5f
VADD(fpr.V(vregs[i]), fpr.V(vregs[i]), S0);*/
} else if (sat == 3) {
fpr.MapRegV(vregs[i], MAP_DIRTY);
@ -224,16 +217,6 @@ namespace MIPSComp
SetCC(CC_GT);
VMOV(fpr.V(vregs[i]), S1);
SetCC(CC_AL);
// clamped = fabs(x) - fabs(x-1.0f); // [-1, 1]
/*
fpr.MapRegV(vregs[i], MAP_DIRTY);
MOVI2F(S0, 1.0f, R0);
VABS(S1, fpr.V(vregs[i])); // S1 = fabs(x)
VSUB(fpr.V(vregs[i]), fpr.V(vregs[i]), S0); // S2 = fabs(x-1.0f) {VABD}
VABS(fpr.V(vregs[i]), fpr.V(vregs[i]));
VSUB(fpr.V(vregs[i]), S1, fpr.V(vregs[i])); // v[i] = S1 - S2
*/
}
}
}

View File

@ -371,10 +371,17 @@ void ArmRegCacheFPU::FlushAll() {
// Returns the byte offset, relative to the start of the MIPSState structure, of the
// storage that backs FPU-cache register r. Returns 0 (after logging) if r is out of range.
//
// Register numbering used by this cache:
//   [0, 32)                        -> FPRs
//   [32, 32 + 128)                 -> VFPU registers
//   [32 + 128, 32 + 128 + NUM_TEMPS) -> temps
int ArmRegCacheFPU::GetMipsRegOffset(MIPSReg r) {
	// These are offsets within the MIPSState structure. First there are the GPRS, then FPRS, then the "VFPURs", then the VFPU ctrls.
	// Valid indices are the half-open range [0, 32 + 128 + NUM_TEMPS); the upper bound itself is invalid.
	if (r < 0 || r >= 32 + 128 + NUM_TEMPS) {
		ERROR_LOG(JIT, "bad mips register %i, out of range", r);
		return 0;  // or what?
	}

	if (r < 32 || r >= 32 + 128) {
		// Not a VFPU register: stored linearly, skipping the 32 GPRs (4 bytes each).
		return (32 + r) << 2;
	} else {
		// r is in [32, 32 + 128): a VFPU register. These are stored in voffset[] order,
		// so translate the register number into its memory slot first.
		// Using ">=" above keeps r == 32 + 128 out of here; voffset[] only has 128 entries.
		return (32 + 32 + voffset[r - 32]) << 2;
	}
}
void ArmRegCacheFPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) {

View File

@ -44,6 +44,9 @@ MIPSState *currentMIPS = &mipsr4k;
MIPSDebugInterface debugr4k(&mipsr4k);
MIPSDebugInterface *currentDebugMIPS = &debugr4k;
u8 voffset[128];
u8 fromvoffset[128];
#ifndef M_LOG2E
#define M_E 2.71828182845904523536f
@ -89,6 +92,70 @@ const float cst_constants[32] = {
// Constructs the CPU state. Besides clearing the JIT pointer, this (re)builds the global
// VFPU permutation tables:
//   voffset[reg]      -> index into MIPSState::v[] where VFPU register `reg` is stored
//   fromvoffset[slot] -> VFPU register number stored at v[slot] (inverse of voffset)
MIPSState::MIPSState()
{
	MIPSComp::jit = 0;

	// Initialize voffset.
	// A VFPU register index has the bit layout 0XXMMMYY, where MMM is the matrix number
	// (see the index expression m * 4 + x * 32 + y below). Stored linearly, the registers
	// that form one "column" (the most common access direction) are 0x20 apart:
	//   0x00 0x20 0x40 0x60   -> one column
	//   0x01 0x21 0x41 0x61
	//   0x02 0x22 0x42 0x62
	//   0x03 0x23 0x43 0x63
	//   0x04 0x24 0x44 0x64
	//   0x05 0x25 0x45 0x65
	//   ....
	// With the reordering, those same registers land in consecutive memory slots:
	//   0x00 0x01 0x02 0x03
	//   0x04 0x05 0x06 0x07
	//   0x08 0x09 0x0a 0x0b
	//   ....
	// In other words, register index 0XXMMMYY maps to memory slot 0MMMYYXX.
	// Advantages:
	//  * Columns can be flushed and reloaded faster "at once"
	//  * 4x4 Matrices are contiguous in RAM, making them, too, fast-loadable in NEON
	// Disadvantages:
	//  * Extra indirection, can be confusing and slower (interpreter only)
	//  * Flushing and reloading row registers is now slower
	int slot = 0;  // Next memory slot to hand out; ends at 128.
	for (int m = 0; m < 8; m++) {
		for (int y = 0; y < 4; y++) {
			for (int x = 0; x < 4; x++) {
				voffset[m * 4 + x * 32 + y] = slot++;
			}
		}
	}

	// And the inverse.
	for (int i = 0; i < 128; i++) {
		fromvoffset[voffset[i]] = i;
	}

	// Sanity check that things that should be ordered are ordered:
	// entry i of this table is the register expected to live in memory slot i.
	static const u8 firstThirtyTwo[] = {
		0x0, 0x20, 0x40, 0x60,
		0x1, 0x21, 0x41, 0x61,
		0x2, 0x22, 0x42, 0x62,
		0x3, 0x23, 0x43, 0x63,

		0x4, 0x24, 0x44, 0x64,
		0x5, 0x25, 0x45, 0x65,
		0x6, 0x26, 0x46, 0x66,
		0x7, 0x27, 0x47, 0x67,
	};

	// Cast keeps the comparison signed; i stays int because it is printed with %i.
	for (int i = 0; i < (int)ARRAY_SIZE(firstThirtyTwo); i++) {
		if (voffset[firstThirtyTwo[i]] != i) {
			ERROR_LOG(CPU, "Wrong voffset order! %i: %i should have been %i", firstThirtyTwo[i], voffset[firstThirtyTwo[i]], i);
		}
	}
}
MIPSState::~MIPSState()
@ -146,7 +213,7 @@ void MIPSState::Reset()
}
void MIPSState::DoState(PointerWrap &p) {
auto s = p.Section("MIPSState", 1, 2);
auto s = p.Section("MIPSState", 1, 3);
if (!s)
return;
@ -160,7 +227,15 @@ void MIPSState::DoState(PointerWrap &p) {
p.DoArray(r, sizeof(r) / sizeof(r[0]));
p.DoArray(f, sizeof(f) / sizeof(f[0]));
p.DoArray(v, sizeof(v) / sizeof(v[0]));
if (s <= 2) {
float vtemp[128];
p.DoArray(vtemp, sizeof(v) / sizeof(v[0]));
for (int i = 0; i < 128; i++) {
v[voffset[i]] = vtemp[i];
}
} else {
p.DoArray(v, sizeof(v) / sizeof(v[0]));
}
p.DoArray(vfpuCtrl, sizeof(vfpuCtrl) / sizeof(vfpuCtrl[0]));
p.Do(pc);
p.Do(nextPC);

View File

@ -115,6 +115,12 @@ enum VCondition
VC_NS
};
// In memory, we order the VFPU registers differently.
// Games use columns a whole lot more than rows, and it would thus be good if columns
// were contiguous in memory. Also, matrices aren't contiguous either, but should be.
extern u8 voffset[128];
extern u8 fromvoffset[128];
class MIPSState
{
public:

View File

@ -107,7 +107,7 @@ public:
return temp;
case 2:
memcpy(&temp, &cpu->v[index], 4);
memcpy(&temp, &cpu->v[voffset[index]], 4);
return temp;
default:
@ -129,7 +129,7 @@ public:
break;
case 2:
memcpy(&cpu->v[index], &value, 4);
memcpy(&cpu->v[voffset[index]], &value, 4);
break;
default:

View File

@ -52,8 +52,8 @@
#include <algorithm>
#define R(i) (currentMIPS->r[i])
#define V(i) (currentMIPS->v[i])
#define VI(i) (currentMIPS->vi[i])
#define V(i) (currentMIPS->v[voffset[i]])
#define VI(i) (currentMIPS->vi[voffset[i]])
#define FI(i) (currentMIPS->fi[i])
#define FsI(i) (currentMIPS->fs[i])
#define PC (currentMIPS->pc)
@ -107,7 +107,7 @@ inline float nanclamp(float f, float lower, float upper)
}
void ApplyPrefixST(float *v, u32 data, VectorSize size)
void ApplyPrefixST(float *r, u32 data, VectorSize size)
{
// Possible optimization shortcut:
if (data == 0xe4)
@ -119,7 +119,7 @@ void ApplyPrefixST(float *v, u32 data, VectorSize size)
for (int i = 0; i < n; i++)
{
origV[i] = v[i];
origV[i] = r[i];
}
for (int i = 0; i < n; i++)
@ -141,17 +141,17 @@ void ApplyPrefixST(float *v, u32 data, VectorSize size)
regnum = 0;
}
v[i] = origV[regnum];
r[i] = origV[regnum];
if (abs)
v[i] = fabs(v[i]);
r[i] = fabs(r[i]);
}
else
{
v[i] = constantArray[regnum + (abs<<2)];
r[i] = constantArray[regnum + (abs<<2)];
}
if (negate)
v[i] = -v[i];
r[i] = -r[i];
}
}

View File

@ -22,8 +22,8 @@
#include <limits>
#define V(i) (currentMIPS->v[i])
#define VI(i) (currentMIPS->vi[i])
#define V(i) (currentMIPS->v[voffset[i]])
#define VI(i) (currentMIPS->vi[voffset[i]])
void GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) {
int mtx = (vectorReg >> 2) & 7;

View File

@ -344,10 +344,17 @@ void PpcRegCacheFPU::FlushAll() {
// Returns the byte offset, relative to the start of the MIPSState structure, of the
// storage that backs FPU-cache register r. Returns 0 (after logging) if r is out of range.
//
// Register numbering used by this cache:
//   [0, 32)                        -> FPRs
//   [32, 32 + 128)                 -> VFPU registers
//   [32 + 128, 32 + 128 + NUM_TEMPS) -> temps
int PpcRegCacheFPU::GetMipsRegOffset(MIPSReg r) {
	// These are offsets within the MIPSState structure. First there are the GPRS, then FPRS, then the "VFPURs", then the VFPU ctrls.
	// Valid indices are the half-open range [0, 32 + 128 + NUM_TEMPS); the upper bound itself is invalid.
	if (r < 0 || r >= 32 + 128 + NUM_TEMPS) {
		ERROR_LOG(JIT, "bad mips register %i, out of range", r);
		return 0;  // or what?
	}

	if (r < 32 || r >= 32 + 128) {
		// Not a VFPU register: stored linearly, skipping the 32 GPRs (4 bytes each).
		return (32 + r) << 2;
	} else {
		// r is in [32, 32 + 128): a VFPU register. These are stored in voffset[] order,
		// so translate the register number into its memory slot first.
		// Using ">=" above keeps r == 32 + 128 out of here; voffset[] only has 128 entries.
		return (32 + 32 + voffset[r - 32]) << 2;
	}
}
void PpcRegCacheFPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) {

View File

@ -32,8 +32,10 @@ FPURegCache::FPURegCache() : mips(0), initialReady(false), emit(0) {
void FPURegCache::Start(MIPSState *mips, MIPSAnalyst::AnalysisResults &stats) {
this->mips = mips;
if (!initialReady)
if (!initialReady) {
SetupInitialRegs();
initialReady = true;
}
memcpy(xregs, xregsInitial, sizeof(xregs));
memcpy(regs, regsInitial, sizeof(regs));
@ -50,12 +52,14 @@ void FPURegCache::SetupInitialRegs() {
regsInitial[i].location = base;
base.IncreaseOffset(sizeof(float));
}
base = GetDefaultLocation(32);
for (int i = 32; i < NUM_MIPS_FPRS; i++) {
for (int i = 32; i < 32 + 128; i++) {
regsInitial[i].location = GetDefaultLocation(i);
}
base = GetDefaultLocation(32 + 128);
for (int i = 32 + 128; i < NUM_MIPS_FPRS; i++) {
regsInitial[i].location = base;
base.IncreaseOffset(sizeof(float));
}
initialReady = true;
}
void FPURegCache::SpillLock(int p1, int p2, int p3, int p4) {
@ -65,16 +69,16 @@ void FPURegCache::SpillLock(int p1, int p2, int p3, int p4) {
if (p4 != 0xFF) regs[p4].locked = true;
}
void FPURegCache::SpillLockV(const u8 *v, VectorSize sz) {
void FPURegCache::SpillLockV(const u8 *vec, VectorSize sz) {
for (int i = 0; i < GetNumVectorElements(sz); i++) {
vregs[v[i]].locked = true;
vregs[vec[i]].locked = true;
}
}
void FPURegCache::SpillLockV(int vec, VectorSize sz) {
u8 v[4];
GetVectorRegs(v, sz, vec);
SpillLockV(v, sz);
u8 r[4];
GetVectorRegs(r, sz, vec);
SpillLockV(r, sz);
}
void FPURegCache::MapRegV(int vreg, int flags) {
@ -82,18 +86,18 @@ void FPURegCache::MapRegV(int vreg, int flags) {
}
void FPURegCache::MapRegsV(int vec, VectorSize sz, int flags) {
u8 v[4];
GetVectorRegs(v, sz, vec);
SpillLockV(v, sz);
u8 r[4];
GetVectorRegs(r, sz, vec);
SpillLockV(r, sz);
for (int i = 0; i < GetNumVectorElements(sz); i++) {
MapReg(v[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
MapReg(r[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
}
}
void FPURegCache::MapRegsV(const u8 *v, VectorSize sz, int flags) {
SpillLockV(v, sz);
void FPURegCache::MapRegsV(const u8 *r, VectorSize sz, int flags) {
SpillLockV(r, sz);
for (int i = 0; i < GetNumVectorElements(sz); i++) {
MapReg(v[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
MapReg(r[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
}
}
@ -205,7 +209,7 @@ OpArg FPURegCache::GetDefaultLocation(int reg) const {
if (reg < 32) {
return M(&mips->f[reg]);
} else if (reg < 32 + 128) {
return M(&mips->v[reg - 32]);
return M(&mips->v[voffset[reg - 32]]);
} else {
return M(&tempValues[reg - 32 - 128]);
}
@ -264,9 +268,9 @@ X64Reg FPURegCache::GetFreeXReg() {
}
void FPURegCache::FlushX(X64Reg reg) {
if (reg >= NUM_X_FPREGS)
if (reg >= NUM_X_FPREGS) {
PanicAlert("Flushing non existent reg");
else if (xregs[reg].mipsReg != -1) {
} else if (xregs[reg].mipsReg != -1) {
StoreFromRegister(xregs[reg].mipsReg);
}
}

View File

@ -154,6 +154,7 @@ public:
void FlushX(X64Reg reg);
X64Reg GetFreeXReg();
private:
const int *GetAllocationOrder(int &count);
void SetupInitialRegs();

View File

@ -168,8 +168,8 @@ BOOL CVFPUDlg::DlgProc(UINT message, WPARAM wParam, LPARAM lParam)
for (int row = 0; row<4; row++)
{
float val = mipsr4k.v[column*32+row+matrix*4];
u32 hex = mipsr4k.vi[column*32+row+matrix*4];
float val = mipsr4k.v[voffset[column*32+row+matrix*4]];
u32 hex = mipsr4k.vi[voffset[column*32+row+matrix*4]];
switch (mode)
{
case 0: temp_len = sprintf_s(temp,"%f",val); break;