mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-02-26 08:55:58 +00:00
Merge pull request #4656 from hrydgard/vfpu-reg-order
Reorder VFPU registers in memory so that we can flush and reload them with vector moves.
This commit is contained in:
commit
d4b0df8d30
@ -520,7 +520,7 @@ public:
|
||||
|
||||
virtual void DoState(PointerWrap &p)
|
||||
{
|
||||
auto s = p.Section("Thread", 1, 3);
|
||||
auto s = p.Section("Thread", 1, 4);
|
||||
if (!s)
|
||||
return;
|
||||
|
||||
@ -530,7 +530,21 @@ public:
|
||||
p.Do(isProcessingCallbacks);
|
||||
p.Do(currentMipscallId);
|
||||
p.Do(currentCallbackId);
|
||||
|
||||
// TODO: How do I "version" adding a DoState method to ThreadContext?
|
||||
p.Do(context);
|
||||
|
||||
if (s <= 3)
|
||||
{
|
||||
// We must have been loading an old state if we're here.
|
||||
// Reorder VFPU data to new order.
|
||||
float temp[128];
|
||||
memcpy(temp, context.v, 128 * sizeof(float));
|
||||
for (int i = 0; i < 128; i++) {
|
||||
context.v[voffset[i]] = temp[i];
|
||||
}
|
||||
}
|
||||
|
||||
if (s <= 2)
|
||||
{
|
||||
context.other[4] = context.other[5];
|
||||
|
@ -202,13 +202,6 @@ namespace MIPSComp
|
||||
SetCC(CC_GT);
|
||||
VMOV(fpr.V(vregs[i]), S1);
|
||||
SetCC(CC_AL);
|
||||
|
||||
/*
|
||||
VABS(S1, fpr.V(vregs[i])); // S1 = fabs(x)
|
||||
VSUB(fpr.V(vregs[i]), fpr.V(vregs[i]), S0); // S2 = fabs(x-0.5f) {VABD}
|
||||
VABS(fpr.V(vregs[i]), fpr.V(vregs[i]));
|
||||
VSUB(fpr.V(vregs[i]), S1, fpr.V(vregs[i])); // v[i] = S1 - S2 + 0.5f
|
||||
VADD(fpr.V(vregs[i]), fpr.V(vregs[i]), S0);*/
|
||||
} else if (sat == 3) {
|
||||
fpr.MapRegV(vregs[i], MAP_DIRTY);
|
||||
|
||||
@ -224,16 +217,6 @@ namespace MIPSComp
|
||||
SetCC(CC_GT);
|
||||
VMOV(fpr.V(vregs[i]), S1);
|
||||
SetCC(CC_AL);
|
||||
|
||||
// clamped = fabs(x) - fabs(x-1.0f); // [-1, 1]
|
||||
/*
|
||||
fpr.MapRegV(vregs[i], MAP_DIRTY);
|
||||
MOVI2F(S0, 1.0f, R0);
|
||||
VABS(S1, fpr.V(vregs[i])); // S1 = fabs(x)
|
||||
VSUB(fpr.V(vregs[i]), fpr.V(vregs[i]), S0); // S2 = fabs(x-1.0f) {VABD}
|
||||
VABS(fpr.V(vregs[i]), fpr.V(vregs[i]));
|
||||
VSUB(fpr.V(vregs[i]), S1, fpr.V(vregs[i])); // v[i] = S1 - S2
|
||||
*/
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -371,10 +371,17 @@ void ArmRegCacheFPU::FlushAll() {
|
||||
|
||||
// Returns the byte offset of register r from the start of the MIPSState structure,
// or 0 (after logging an error) if r is not a valid register number.
int ArmRegCacheFPU::GetMipsRegOffset(MIPSReg r) {
	// These are offsets within the MIPSState structure. First there are the GPRS, then FPRS, then the "VFPURs", then the VFPU ctrls.
	// Valid numbers are 0 .. 32 + 128 + NUM_TEMPS - 1, so the upper bound is exclusive.
	if (r < 0 || r >= 32 + 128 + NUM_TEMPS) {
		ERROR_LOG(JIT, "bad mips register %i, out of range", r);
		return 0;  // or what?
	}

	if (r < 32 || r >= 32 + 128) {
		// Not a VFPU register: no reordering, just skip the 32 words ahead of the FPRs.
		return (32 + r) << 2;
	}

	// r is in [32, 32 + 128): a VFPU register. voffset has exactly 128 entries, so
	// r - 32 must stay within 0..127. The table gives the register's slot in v[].
	return (32 + 32 + voffset[r - 32]) << 2;
}
|
||||
|
||||
void ArmRegCacheFPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) {
|
||||
|
@ -44,6 +44,9 @@ MIPSState *currentMIPS = &mipsr4k;
|
||||
MIPSDebugInterface debugr4k(&mipsr4k);
|
||||
MIPSDebugInterface *currentDebugMIPS = &debugr4k;
|
||||
|
||||
// Maps a VFPU register number (0-127) to the index of its slot in MIPSState::v.
// Filled in by the MIPSState constructor.
u8 voffset[128];
|
||||
// Inverse of voffset: maps a slot index in MIPSState::v back to the VFPU register number.
// Filled in by the MIPSState constructor.
u8 fromvoffset[128];
|
||||
|
||||
|
||||
#ifndef M_LOG2E
|
||||
#define M_E 2.71828182845904523536f
|
||||
@ -89,6 +92,70 @@ const float cst_constants[32] = {
|
||||
MIPSState::MIPSState()
{
	MIPSComp::jit = 0;

	// Initialize voffset, the table that maps a VFPU register number to its slot in v[].
	//
	// A register number has the bit layout 0XXMMMYY, where MMM is the matrix number.
	// Stored in plain register-number order, the registers on each row below would sit
	// 0x20 floats apart in RAM:
	//
	// 0x00 0x20 0x40 0x60    -> "columns", the most common direction
	// 0x01 0x21 0x41 0x61
	// 0x02 0x22 0x42 0x62
	// 0x03 0x23 0x43 0x63
	//
	// 0x04 0x24 0x44 0x64
	// 0x05 0x25 0x45 0x65
	// ....
	//
	// With the reordering, those same registers occupy consecutive slots, so the
	// VFPU registers are effectively organized like this:
	//
	// 0x00 0x01 0x02 0x03
	// 0x04 0x05 0x06 0x07
	// 0x08 0x09 0x0a 0x0b
	// ....
	//
	// In other words, register 0XXMMMYY is stored in slot 0MMMYYXX.
	//
	// Advantages:
	// * Columns can be flushed and reloaded faster "at once"
	// * 4x4 Matrices are contiguous in RAM, making them, too, fast-loadable in NEON
	//
	// Disadvantages:
	// * Extra indirection, can be confusing and slower (interpreter only)
	// * Flushing and reloading row registers is now slower
	for (int mtx = 0; mtx < 8; mtx++) {
		for (int y = 0; y < 4; y++) {
			for (int x = 0; x < 4; x++) {
				const int reg = (x << 5) | (mtx << 2) | y;   // 0XXMMMYY
				const int slot = (mtx << 4) | (y << 2) | x;  // 0MMMYYXX
				voffset[reg] = slot;
			}
		}
	}

	// And the inverse: slot index back to register number.
	for (int reg = 0; reg < 128; reg++) {
		fromvoffset[voffset[reg]] = reg;
	}

	// Sanity check that things that should be ordered are ordered: the register
	// listed at position N is expected to land in slot N.
	static const u8 expectedOrder[] = {
		0x0, 0x20, 0x40, 0x60,
		0x1, 0x21, 0x41, 0x61,
		0x2, 0x22, 0x42, 0x62,
		0x3, 0x23, 0x43, 0x63,

		0x4, 0x24, 0x44, 0x64,
		0x5, 0x25, 0x45, 0x65,
		0x6, 0x26, 0x46, 0x66,
		0x7, 0x27, 0x47, 0x67,
	};
	const int expectedCount = (int)ARRAY_SIZE(expectedOrder);
	for (int slot = 0; slot < expectedCount; slot++) {
		const u8 reg = expectedOrder[slot];
		if (voffset[reg] == slot)
			continue;
		ERROR_LOG(CPU, "Wrong voffset order! %i: %i should have been %i", reg, voffset[reg], slot);
	}
}
|
||||
|
||||
MIPSState::~MIPSState()
|
||||
@ -146,7 +213,7 @@ void MIPSState::Reset()
|
||||
}
|
||||
|
||||
void MIPSState::DoState(PointerWrap &p) {
|
||||
auto s = p.Section("MIPSState", 1, 2);
|
||||
auto s = p.Section("MIPSState", 1, 3);
|
||||
if (!s)
|
||||
return;
|
||||
|
||||
@ -160,7 +227,15 @@ void MIPSState::DoState(PointerWrap &p) {
|
||||
|
||||
p.DoArray(r, sizeof(r) / sizeof(r[0]));
|
||||
p.DoArray(f, sizeof(f) / sizeof(f[0]));
|
||||
p.DoArray(v, sizeof(v) / sizeof(v[0]));
|
||||
if (s <= 2) {
|
||||
float vtemp[128];
|
||||
p.DoArray(vtemp, sizeof(v) / sizeof(v[0]));
|
||||
for (int i = 0; i < 128; i++) {
|
||||
v[voffset[i]] = vtemp[i];
|
||||
}
|
||||
} else {
|
||||
p.DoArray(v, sizeof(v) / sizeof(v[0]));
|
||||
}
|
||||
p.DoArray(vfpuCtrl, sizeof(vfpuCtrl) / sizeof(vfpuCtrl[0]));
|
||||
p.Do(pc);
|
||||
p.Do(nextPC);
|
||||
|
@ -115,6 +115,12 @@ enum VCondition
|
||||
VC_NS
|
||||
};
|
||||
|
||||
// In memory, we order the VFPU registers differently.
|
||||
// Games use columns a whole lot more than rows, and it would thus be good if columns
|
||||
// were contiguous in memory. Likewise, each 4x4 matrix should be contiguous, which it is not in register-number order.
|
||||
extern u8 voffset[128];
|
||||
extern u8 fromvoffset[128];
|
||||
|
||||
class MIPSState
|
||||
{
|
||||
public:
|
||||
|
@ -107,7 +107,7 @@ public:
|
||||
return temp;
|
||||
|
||||
case 2:
|
||||
memcpy(&temp, &cpu->v[index], 4);
|
||||
memcpy(&temp, &cpu->v[voffset[index]], 4);
|
||||
return temp;
|
||||
|
||||
default:
|
||||
@ -129,7 +129,7 @@ public:
|
||||
break;
|
||||
|
||||
case 2:
|
||||
memcpy(&cpu->v[index], &value, 4);
|
||||
memcpy(&cpu->v[voffset[index]], &value, 4);
|
||||
break;
|
||||
|
||||
default:
|
||||
|
@ -52,8 +52,8 @@
|
||||
#include <algorithm>
|
||||
|
||||
#define R(i) (currentMIPS->r[i])
|
||||
#define V(i) (currentMIPS->v[i])
|
||||
#define VI(i) (currentMIPS->vi[i])
|
||||
#define V(i) (currentMIPS->v[voffset[i]])
|
||||
#define VI(i) (currentMIPS->vi[voffset[i]])
|
||||
#define FI(i) (currentMIPS->fi[i])
|
||||
#define FsI(i) (currentMIPS->fs[i])
|
||||
#define PC (currentMIPS->pc)
|
||||
@ -107,7 +107,7 @@ inline float nanclamp(float f, float lower, float upper)
|
||||
}
|
||||
|
||||
|
||||
void ApplyPrefixST(float *v, u32 data, VectorSize size)
|
||||
void ApplyPrefixST(float *r, u32 data, VectorSize size)
|
||||
{
|
||||
// Possible optimization shortcut:
|
||||
if (data == 0xe4)
|
||||
@ -119,7 +119,7 @@ void ApplyPrefixST(float *v, u32 data, VectorSize size)
|
||||
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
origV[i] = v[i];
|
||||
origV[i] = r[i];
|
||||
}
|
||||
|
||||
for (int i = 0; i < n; i++)
|
||||
@ -141,17 +141,17 @@ void ApplyPrefixST(float *v, u32 data, VectorSize size)
|
||||
regnum = 0;
|
||||
}
|
||||
|
||||
v[i] = origV[regnum];
|
||||
r[i] = origV[regnum];
|
||||
if (abs)
|
||||
v[i] = fabs(v[i]);
|
||||
r[i] = fabs(r[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
v[i] = constantArray[regnum + (abs<<2)];
|
||||
r[i] = constantArray[regnum + (abs<<2)];
|
||||
}
|
||||
|
||||
if (negate)
|
||||
v[i] = -v[i];
|
||||
r[i] = -r[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -22,8 +22,8 @@
|
||||
|
||||
#include <limits>
|
||||
|
||||
#define V(i) (currentMIPS->v[i])
|
||||
#define VI(i) (currentMIPS->vi[i])
|
||||
#define V(i) (currentMIPS->v[voffset[i]])
|
||||
#define VI(i) (currentMIPS->vi[voffset[i]])
|
||||
|
||||
void GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) {
|
||||
int mtx = (vectorReg >> 2) & 7;
|
||||
|
@ -344,10 +344,17 @@ void PpcRegCacheFPU::FlushAll() {
|
||||
|
||||
// Returns the byte offset of register r from the start of the MIPSState structure,
// or 0 (after logging an error) if r is not a valid register number.
int PpcRegCacheFPU::GetMipsRegOffset(MIPSReg r) {
	// These are offsets within the MIPSState structure. First there are the GPRS, then FPRS, then the "VFPURs", then the VFPU ctrls.
	// Valid numbers are 0 .. 32 + 128 + NUM_TEMPS - 1, so the upper bound is exclusive.
	if (r < 0 || r >= 32 + 128 + NUM_TEMPS) {
		ERROR_LOG(JIT, "bad mips register %i, out of range", r);
		return 0;  // or what?
	}

	if (r < 32 || r >= 32 + 128) {
		// Not a VFPU register: no reordering, just skip the 32 words ahead of the FPRs.
		return (32 + r) << 2;
	}

	// r is in [32, 32 + 128): a VFPU register. voffset has exactly 128 entries, so
	// r - 32 must stay within 0..127. The table gives the register's slot in v[].
	return (32 + 32 + voffset[r - 32]) << 2;
}
|
||||
|
||||
void PpcRegCacheFPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) {
|
||||
|
@ -32,8 +32,10 @@ FPURegCache::FPURegCache() : mips(0), initialReady(false), emit(0) {
|
||||
void FPURegCache::Start(MIPSState *mips, MIPSAnalyst::AnalysisResults &stats) {
|
||||
this->mips = mips;
|
||||
|
||||
if (!initialReady)
|
||||
if (!initialReady) {
|
||||
SetupInitialRegs();
|
||||
initialReady = true;
|
||||
}
|
||||
|
||||
memcpy(xregs, xregsInitial, sizeof(xregs));
|
||||
memcpy(regs, regsInitial, sizeof(regs));
|
||||
@ -50,12 +52,14 @@ void FPURegCache::SetupInitialRegs() {
|
||||
regsInitial[i].location = base;
|
||||
base.IncreaseOffset(sizeof(float));
|
||||
}
|
||||
base = GetDefaultLocation(32);
|
||||
for (int i = 32; i < NUM_MIPS_FPRS; i++) {
|
||||
for (int i = 32; i < 32 + 128; i++) {
|
||||
regsInitial[i].location = GetDefaultLocation(i);
|
||||
}
|
||||
base = GetDefaultLocation(32 + 128);
|
||||
for (int i = 32 + 128; i < NUM_MIPS_FPRS; i++) {
|
||||
regsInitial[i].location = base;
|
||||
base.IncreaseOffset(sizeof(float));
|
||||
}
|
||||
initialReady = true;
|
||||
}
|
||||
|
||||
void FPURegCache::SpillLock(int p1, int p2, int p3, int p4) {
|
||||
@ -65,16 +69,16 @@ void FPURegCache::SpillLock(int p1, int p2, int p3, int p4) {
|
||||
if (p4 != 0xFF) regs[p4].locked = true;
|
||||
}
|
||||
|
||||
void FPURegCache::SpillLockV(const u8 *v, VectorSize sz) {
|
||||
void FPURegCache::SpillLockV(const u8 *vec, VectorSize sz) {
|
||||
for (int i = 0; i < GetNumVectorElements(sz); i++) {
|
||||
vregs[v[i]].locked = true;
|
||||
vregs[vec[i]].locked = true;
|
||||
}
|
||||
}
|
||||
|
||||
void FPURegCache::SpillLockV(int vec, VectorSize sz) {
|
||||
u8 v[4];
|
||||
GetVectorRegs(v, sz, vec);
|
||||
SpillLockV(v, sz);
|
||||
u8 r[4];
|
||||
GetVectorRegs(r, sz, vec);
|
||||
SpillLockV(r, sz);
|
||||
}
|
||||
|
||||
void FPURegCache::MapRegV(int vreg, int flags) {
|
||||
@ -82,18 +86,18 @@ void FPURegCache::MapRegV(int vreg, int flags) {
|
||||
}
|
||||
|
||||
void FPURegCache::MapRegsV(int vec, VectorSize sz, int flags) {
|
||||
u8 v[4];
|
||||
GetVectorRegs(v, sz, vec);
|
||||
SpillLockV(v, sz);
|
||||
u8 r[4];
|
||||
GetVectorRegs(r, sz, vec);
|
||||
SpillLockV(r, sz);
|
||||
for (int i = 0; i < GetNumVectorElements(sz); i++) {
|
||||
MapReg(v[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
|
||||
MapReg(r[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
|
||||
}
|
||||
}
|
||||
|
||||
void FPURegCache::MapRegsV(const u8 *v, VectorSize sz, int flags) {
|
||||
SpillLockV(v, sz);
|
||||
void FPURegCache::MapRegsV(const u8 *r, VectorSize sz, int flags) {
|
||||
SpillLockV(r, sz);
|
||||
for (int i = 0; i < GetNumVectorElements(sz); i++) {
|
||||
MapReg(v[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
|
||||
MapReg(r[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -205,7 +209,7 @@ OpArg FPURegCache::GetDefaultLocation(int reg) const {
|
||||
if (reg < 32) {
|
||||
return M(&mips->f[reg]);
|
||||
} else if (reg < 32 + 128) {
|
||||
return M(&mips->v[reg - 32]);
|
||||
return M(&mips->v[voffset[reg - 32]]);
|
||||
} else {
|
||||
return M(&tempValues[reg - 32 - 128]);
|
||||
}
|
||||
@ -264,9 +268,9 @@ X64Reg FPURegCache::GetFreeXReg() {
|
||||
}
|
||||
|
||||
// Stores the MIPS register currently held in host register reg, if there is one.
// A reg outside the xregs range raises a panic alert and nothing is stored.
void FPURegCache::FlushX(X64Reg reg) {
	if (reg >= NUM_X_FPREGS) {
		PanicAlert("Flushing non existent reg");
		return;
	}

	// -1 marks a host register that holds no MIPS register.
	if (xregs[reg].mipsReg == -1) {
		return;
	}
	StoreFromRegister(xregs[reg].mipsReg);
}
|
||||
|
@ -154,6 +154,7 @@ public:
|
||||
|
||||
void FlushX(X64Reg reg);
|
||||
X64Reg GetFreeXReg();
|
||||
|
||||
private:
|
||||
const int *GetAllocationOrder(int &count);
|
||||
void SetupInitialRegs();
|
||||
|
@ -168,8 +168,8 @@ BOOL CVFPUDlg::DlgProc(UINT message, WPARAM wParam, LPARAM lParam)
|
||||
|
||||
for (int row = 0; row<4; row++)
|
||||
{
|
||||
float val = mipsr4k.v[column*32+row+matrix*4];
|
||||
u32 hex = mipsr4k.vi[column*32+row+matrix*4];
|
||||
float val = mipsr4k.v[voffset[column*32+row+matrix*4]];
|
||||
u32 hex = mipsr4k.vi[voffset[column*32+row+matrix*4]];
|
||||
switch (mode)
|
||||
{
|
||||
case 0: temp_len = sprintf_s(temp,"%f",val); break;
|
||||
|
Loading…
x
Reference in New Issue
Block a user