From aa753c88b28c8aeece8a856601ea189ae9c569d5 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 28 Sep 2013 12:30:28 +0200 Subject: [PATCH 1/5] ARM: implement vhdp --- Core/MIPS/ARM/ArmCompVFPU.cpp | 52 +++++++++++++++++++++++++++++++---- Core/MIPS/ARM/ArmJit.h | 1 + Core/MIPS/MIPSTables.cpp | 2 +- Core/MIPS/PPC/PpcCompVFPU.cpp | 4 +++ Core/MIPS/PPC/PpcJit.h | 1 + Core/MIPS/x86/CompVFPU.cpp | 4 +++ Core/MIPS/x86/Jit.h | 1 + android/ab.cmd | 2 ++ 8 files changed, 60 insertions(+), 7 deletions(-) diff --git a/Core/MIPS/ARM/ArmCompVFPU.cpp b/Core/MIPS/ARM/ArmCompVFPU.cpp index cb5d9d1f3..75a4e516b 100644 --- a/Core/MIPS/ARM/ArmCompVFPU.cpp +++ b/Core/MIPS/ARM/ArmCompVFPU.cpp @@ -559,6 +559,48 @@ namespace MIPSComp fpr.ReleaseSpillLocksAndDiscardTemps(); } + void Jit::Comp_VHdp(MIPSOpcode op) { + // DISABLE; + + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + int vd = _VD; + int vs = _VS; + int vt = _VT; + VectorSize sz = GetVecSize(op); + + // TODO: Force read one of them into regs? probably not. + u8 sregs[4], tregs[4], dregs[1]; + GetVectorRegsPrefixS(sregs, sz, vs); + GetVectorRegsPrefixT(tregs, sz, vt); + GetVectorRegsPrefixD(dregs, V_Single, vd); + + // TODO: applyprefixST here somehow (shuffle, etc...) 
+ fpr.MapRegsAndSpillLockV(sregs, sz, 0); + fpr.MapRegsAndSpillLockV(tregs, sz, 0); + VMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0])); + + int n = GetNumVectorElements(sz); + for (int i = 1; i < n; i++) { + // sum += s[i]*t[i]; + if (i == n - 1) { + VADD(S0, S0, fpr.V(tregs[i])); + } else { + VMLA(S0, fpr.V(sregs[i]), fpr.V(tregs[i])); + } + } + fpr.ReleaseSpillLocksAndDiscardTemps(); + + fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY); + + VMOV(fpr.V(dregs[0]), S0); + ApplyPrefixD(dregs, V_Single); + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + void Jit::Comp_VDot(MIPSOpcode op) { CONDITIONAL_DISABLE; if (js.HasUnknownPrefix() || disablePrefixes) { @@ -590,7 +632,6 @@ namespace MIPSComp fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY); - // TODO: applyprefixD here somehow (write mask etc..) VMOV(fpr.V(dregs[0]), S0); ApplyPrefixD(dregs, V_Single); fpr.ReleaseSpillLocksAndDiscardTemps(); @@ -607,11 +648,6 @@ namespace MIPSComp } } - void Jit::Comp_VHdp(MIPSOpcode op) { - // Similar to vdot - DISABLE; - } - void Jit::Comp_VecDo3(MIPSOpcode op) { CONDITIONAL_DISABLE; @@ -899,6 +935,10 @@ namespace MIPSComp fpr.ReleaseSpillLocksAndDiscardTemps(); } + void Jit::Comp_Vh2f(MIPSOpcode op) { + DISABLE; + } + void Jit::Comp_Vf2i(MIPSOpcode op) { CONDITIONAL_DISABLE; DISABLE; diff --git a/Core/MIPS/ARM/ArmJit.h b/Core/MIPS/ARM/ArmJit.h index 2379dd925..8da416e9d 100644 --- a/Core/MIPS/ARM/ArmJit.h +++ b/Core/MIPS/ARM/ArmJit.h @@ -215,6 +215,7 @@ public: void Comp_Vx2i(MIPSOpcode op); void Comp_Vf2i(MIPSOpcode op); void Comp_Vi2f(MIPSOpcode op); + void Comp_Vh2f(MIPSOpcode op); void Comp_Vcst(MIPSOpcode op); void Comp_Vhoriz(MIPSOpcode op); void Comp_VRot(MIPSOpcode op); diff --git a/Core/MIPS/MIPSTables.cpp b/Core/MIPS/MIPSTables.cpp index ab9f76095..e26f2b05d 100644 --- a/Core/MIPS/MIPSTables.cpp +++ b/Core/MIPS/MIPSTables.cpp @@ -605,7 +605,7 @@ const MIPSInstruction tableVFPU7[32] = // 110100 00001 xxxxx . ....... . ....... 
INVALID, INVALID, INSTR("vf2h", &Jit::Comp_Generic, Dis_Generic, Int_Vf2h, IN_OTHER|OUT_OTHER|IS_VFPU|OUT_EAT_PREFIX), - INSTR("vh2f", &Jit::Comp_Generic, Dis_Generic, Int_Vh2f, IN_OTHER|OUT_OTHER|IS_VFPU|OUT_EAT_PREFIX), + INSTR("vh2f", &Jit::Comp_Vh2f, Dis_Generic, Int_Vh2f, IN_OTHER|OUT_OTHER|IS_VFPU|OUT_EAT_PREFIX), INVALID, INVALID, diff --git a/Core/MIPS/PPC/PpcCompVFPU.cpp b/Core/MIPS/PPC/PpcCompVFPU.cpp index 9b1337ef0..47880db66 100644 --- a/Core/MIPS/PPC/PpcCompVFPU.cpp +++ b/Core/MIPS/PPC/PpcCompVFPU.cpp @@ -949,6 +949,10 @@ namespace MIPSComp DISABLE; } + void Jit::Comp_Vh2f(MIPSOpcode op) { + DISABLE; + } + void Jit::Comp_Vcst(MIPSOpcode op) { CONDITIONAL_DISABLE; diff --git a/Core/MIPS/PPC/PpcJit.h b/Core/MIPS/PPC/PpcJit.h index 7a1227e2d..731d8661e 100644 --- a/Core/MIPS/PPC/PpcJit.h +++ b/Core/MIPS/PPC/PpcJit.h @@ -220,6 +220,7 @@ namespace MIPSComp void Comp_Vx2i(MIPSOpcode op); void Comp_Vf2i(MIPSOpcode op); void Comp_Vi2f(MIPSOpcode op); + void Comp_Vh2f(MIPSOpcode op); void Comp_Vcst(MIPSOpcode op); void Comp_Vhoriz(MIPSOpcode op); void Comp_VRot(MIPSOpcode op); diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp index 9529c4812..25e5dd4ab 100644 --- a/Core/MIPS/x86/CompVFPU.cpp +++ b/Core/MIPS/x86/CompVFPU.cpp @@ -1083,6 +1083,10 @@ void Jit::Comp_Vi2f(MIPSOpcode op) { fpr.ReleaseSpillLocks(); } +void Jit::Comp_Vh2f(MIPSOpcode op) { + DISABLE; +} + extern const double mulTableVf2i[32] = { (1ULL<<0),(1ULL<<1),(1ULL<<2),(1ULL<<3), (1ULL<<4),(1ULL<<5),(1ULL<<6),(1ULL<<7), diff --git a/Core/MIPS/x86/Jit.h b/Core/MIPS/x86/Jit.h index 7ff4f115d..9c8cfdbe6 100644 --- a/Core/MIPS/x86/Jit.h +++ b/Core/MIPS/x86/Jit.h @@ -235,6 +235,7 @@ public: void Comp_Vx2i(MIPSOpcode op); void Comp_Vf2i(MIPSOpcode op); void Comp_Vi2f(MIPSOpcode op); + void Comp_Vh2f(MIPSOpcode op); void Comp_Vcst(MIPSOpcode op); void Comp_Vhoriz(MIPSOpcode op); void Comp_VRot(MIPSOpcode op); diff --git a/android/ab.cmd b/android/ab.cmd index 1a65efb5f..a6a2cec17 
100644 --- a/android/ab.cmd +++ b/android/ab.cmd @@ -1,5 +1,7 @@ xcopy ..\flash0 assets\flash0 /s /y xcopy ..\lang assets\lang /s /y +xcopy ..\assets\shaders assets\shaders /s /y +copy ..\assets\langregion.ini assets\langregion.ini SET NDK=C:\AndroidNDK SET NDK_MODULE_PATH=..;..\native\ext REM Need to force target-platform to android-9 to get access to OpenSL headers. From 532678b6f7d7e743673be5c8f0667addb7d28a9f Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 28 Sep 2013 12:31:46 +0200 Subject: [PATCH 2/5] Update lang --- Core/Dialog/PSPOskDialog.cpp | 8 ++++---- lang | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Core/Dialog/PSPOskDialog.cpp b/Core/Dialog/PSPOskDialog.cpp index ff7616931..be0189bd4 100755 --- a/Core/Dialog/PSPOskDialog.cpp +++ b/Core/Dialog/PSPOskDialog.cpp @@ -1050,12 +1050,12 @@ int PSPOskDialog::Update() int PSPOskDialog::Shutdown(bool force) { - if (status != SCE_UTILITY_STATUS_FINISHED && !force) - return SCE_ERROR_UTILITY_INVALID_STATUS; + if (status != SCE_UTILITY_STATUS_FINISHED && !force) + return SCE_ERROR_UTILITY_INVALID_STATUS; - PSPDialog::Shutdown(); + PSPDialog::Shutdown(); - return 0; + return 0; } void PSPOskDialog::DoState(PointerWrap &p) diff --git a/lang b/lang index 2c000fdc9..a52257c30 160000 --- a/lang +++ b/lang @@ -1 +1 @@ -Subproject commit 2c000fdc92450c49b1ad446740c31987419e67d6 +Subproject commit a52257c30fb0739b828666359ed81763d1ec8fc9 From cfdfa77476d59a2d1ace810a6caae03106288213 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 28 Sep 2013 12:33:16 +0200 Subject: [PATCH 3/5] Change a bunch of "MayHavePrefix" to "HasUnknownPrefix". Should be safe, I think none of these have any sane use of prefixes anyway. 
--- Core/MIPS/ARM/ArmCompVFPU.cpp | 4 ++-- Core/MIPS/x86/CompVFPU.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Core/MIPS/ARM/ArmCompVFPU.cpp b/Core/MIPS/ARM/ArmCompVFPU.cpp index 75a4e516b..b1851f82c 100644 --- a/Core/MIPS/ARM/ArmCompVFPU.cpp +++ b/Core/MIPS/ARM/ArmCompVFPU.cpp @@ -1183,7 +1183,7 @@ namespace MIPSComp CONDITIONAL_DISABLE; // TODO: This probably ignores prefixes? - if (js.MayHavePrefix() || disablePrefixes) { + if (js.HasUnknownPrefix() || disablePrefixes) { DISABLE; } @@ -1229,7 +1229,7 @@ namespace MIPSComp CONDITIONAL_DISABLE; // TODO: This probably ignores prefixes? Or maybe uses D? - if (js.MayHavePrefix() || disablePrefixes) { + if (js.HasUnknownPrefix() || disablePrefixes) { DISABLE; } diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp index 25e5dd4ab..6f29ac165 100644 --- a/Core/MIPS/x86/CompVFPU.cpp +++ b/Core/MIPS/x86/CompVFPU.cpp @@ -1441,7 +1441,7 @@ void Jit::Comp_Vmmov(MIPSOpcode op) { CONDITIONAL_DISABLE; // TODO: This probably ignores prefixes? - if (js.MayHavePrefix()) + if (js.HasUnknownPrefix()) DISABLE; MatrixSize sz = GetMtxSize(op); @@ -1534,7 +1534,7 @@ void Jit::Comp_Vmmul(MIPSOpcode op) { CONDITIONAL_DISABLE; // TODO: This probably ignores prefixes? - if (js.MayHavePrefix()) + if (js.HasUnknownPrefix()) DISABLE; MatrixSize sz = GetMtxSize(op); @@ -1598,7 +1598,7 @@ void Jit::Comp_Vmscl(MIPSOpcode op) { CONDITIONAL_DISABLE; // TODO: This probably ignores prefixes? - if (js.MayHavePrefix()) + if (js.HasUnknownPrefix()) DISABLE; MatrixSize sz = GetMtxSize(op); @@ -1643,7 +1643,7 @@ void Jit::Comp_Vtfm(MIPSOpcode op) { CONDITIONAL_DISABLE; // TODO: This probably ignores prefixes? Or maybe uses D? 
- if (js.MayHavePrefix()) + if (js.HasUnknownPrefix()) DISABLE; VectorSize sz = GetVecSize(op); From 7ca6d73857b13fdba47880d8714d2b222f82369f Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 28 Sep 2013 14:01:26 +0200 Subject: [PATCH 4/5] Two approaches to vh2f (half-float to float): lookuptable and fast SSE --- Core/MIPS/x86/CompVFPU.cpp | 196 ++++++++++++++++++++++++++++++++++-- Core/MIPS/x86/Jit.cpp | 11 +- Core/MIPS/x86/Jit.h | 9 +- Core/MIPS/x86/RegCacheFPU.h | 4 +- 4 files changed, 204 insertions(+), 16 deletions(-) diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp index 6f29ac165..cbbfd695e 100644 --- a/Core/MIPS/x86/CompVFPU.cpp +++ b/Core/MIPS/x86/CompVFPU.cpp @@ -1083,10 +1083,193 @@ void Jit::Comp_Vi2f(MIPSOpcode op) { fpr.ReleaseSpillLocks(); } + + +#if 0 + +// One possible approach + +// Uses lookup tables to decode half floats. Not really sure how bad the CPU cache impact will be... void Jit::Comp_Vh2f(MIPSOpcode op) { - DISABLE; + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) + DISABLE; + + VectorSize sz = GetVecSize(op); + VectorSize outsize; + switch (sz) { + case V_Single: + outsize = V_Pair; + break; + case V_Pair: + outsize = V_Quad; + break; + } + + u8 sregs[4], dregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixD(dregs, outsize, _VD); + + switch (sz) { + case V_Single: + // Flush so we can access it with integer instructions + // Grab ECX as a secondary working register + gpr.FlushLockX(ECX); + fpr.StoreFromRegisterV(sregs[0]); + MOV(32, R(EAX), fpr.V(sregs[0])); + fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY); + //XOR(32, R(EAX), R(EAX)); + MOV(32, R(ECX), R(EAX)); + AND(32, R(EAX), Imm32(0xFFFF)); + SHR(32, R(ECX), Imm8(16)); +#ifdef _M_IX86 + MOVSS(fpr.VX(dregs[0]), MScaled(EAX, 4, (u32)halfToFloat_)); + MOVSS(fpr.VX(dregs[1]), MScaled(ECX, 4, (u32)halfToFloat_)); +#endif + break; + case V_Pair: + // Flush so we can access it with integer instructions + // Grab ECX and EDX as a 
secondary/third working register + gpr.FlushLockX(ECX, EDX); + fpr.StoreFromRegisterV(sregs[0]); + fpr.StoreFromRegisterV(sregs[1]); + MOV(32, R(EAX), fpr.V(sregs[0])); + MOV(32, R(EDX), fpr.V(sregs[1])); + fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY); + //XOR(32, R(EAX), R(EAX)); + MOV(32, R(ECX), R(EAX)); + AND(32, R(EAX), Imm32(0xFFFF)); + SHR(32, R(ECX), Imm8(16)); +#ifdef _M_IX86 + MOVSS(fpr.VX(dregs[0]), MScaled(EAX, 4, (u32)halfToFloat_)); + MOVSS(fpr.VX(dregs[1]), MScaled(ECX, 4, (u32)halfToFloat_)); +#endif + //XOR(32, R(EAX), R(EAX)); + MOV(32, R(ECX), R(EDX)); + AND(32, R(EDX), Imm32(0xFFFF)); + SHR(32, R(ECX), Imm8(16)); +#ifdef _M_IX86 + MOVSS(fpr.VX(dregs[2]), MScaled(EDX, 4, (u32)halfToFloat_)); + MOVSS(fpr.VX(dregs[3]), MScaled(ECX, 4, (u32)halfToFloat_)); +#endif + break; + case V_Triple: + case V_Quad: + _dbg_assert_msg_(CPU, 0, "Trying to interpret Int_Vh2f instruction that can't be interpreted"); + break; + } + ApplyPrefixD(dregs, outsize); + gpr.UnlockAllX(); + fpr.ReleaseSpillLocks(); } +#else + +#undef CONST + +// Planning for true SIMD + +// Sequence for gathering sparse registers into one SIMD: +// MOVSS(XMM0, fpr.R(sregs[0])); +// MOVSS(XMM1, fpr.R(sregs[1])); +// MOVSS(XMM2, fpr.R(sregs[2])); +// MOVSS(XMM3, fpr.R(sregs[3])); +// SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); // XMM0 = S1 S1 S0 S0 +// SHUFPS(XMM2, R(XMM3), _MM_SHUFFLE(0, 0, 0, 0)); // XMM2 = S3 S3 S2 S2 +// SHUFPS(XMM0, R(XMM2), _MM_SHUFFLE(2, 0, 2, 0)); // XMM0 = S3 S2 S1 S0 +// Some punpckwd etc would also work. 
+ +// Sequence for scattering a SIMD register to sparse registers: +// (Very serial though, better methods may be possible) +// MOVSS(fpr.R(sregs[0]), XMM0); +// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); +// MOVSS(fpr.R(sregs[1]), XMM0); +// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); +// MOVSS(fpr.R(sregs[2]), XMM0); +// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); +// MOVSS(fpr.R(sregs[3]), XMM0); + + +// Translation of ryg's half_to_float5_SSE2 +void Jit::Comp_Vh2f(MIPSOpcode op) { +#define SSE_CONST4(name, val) static const __declspec(align(16)) u32 name[4] = { (val), (val), (val), (val) } + + SSE_CONST4(mask_nosign, 0x7fff); + SSE_CONST4(magic, (254 - 15) << 23); + SSE_CONST4(was_infnan, 0x7bff); + SSE_CONST4(exp_infnan, 255 << 23); + +#undef SSE_CONST4 + + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) + DISABLE; + + VectorSize sz = GetVecSize(op); + VectorSize outsize; + switch (sz) { + case V_Single: + outsize = V_Pair; + DISABLE; + break; + case V_Pair: + outsize = V_Quad; + break; + } + + u8 sregs[4], dregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixD(dregs, outsize, _VD); + + // Force ourselves an extra xreg as temp space. + X64Reg tempR = fpr.GetFreeXReg(); + + MOVSS(XMM0, fpr.V(sregs[0])); + if (sz != V_Single) { + MOVSS(XMM1, fpr.V(sregs[1])); + PUNPCKLDQ(XMM0, R(XMM1)); + } + XORPS(XMM1, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + + // OK, 16 bits in each word. + // Let's go. Deep magic here. 
+ MOVAPS(XMM1, R(XMM0)); + // MOVAPS(XMM2, R(XMM0)); // xmm2 = h + ANDPS(XMM0, M((void *)mask_nosign)); // xmm0 = expmant + XORPS(XMM1, R(XMM0)); // xmm1 = justsign = expmant ^ xmm0 + MOVAPS(tempR, R(XMM0)); + PCMPGTD(tempR, M((void *)was_infnan)); // xmm2 = b_wasinfnan + PSLLD(XMM0, 13); + MULPS(XMM0, M((void *)magic)); /// xmm0 = scaled + PSLLD(XMM1, 16); // xmm1 = sign + ANDPS(tempR, M((void *)exp_infnan)); + ORPS(XMM1, R(tempR)); + ORPS(XMM0, R(XMM1)); + + fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY); + + // TODO: Could apply D-prefix in parallel here... + + MOVSS(fpr.V(dregs[0]), XMM0); + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); + MOVSS(fpr.V(dregs[1]), XMM0); + + if (sz != V_Single) { + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); + MOVSS(fpr.V(dregs[2]), XMM0); + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); + MOVSS(fpr.V(dregs[3]), XMM0); + } + + ApplyPrefixD(dregs, outsize); + gpr.UnlockAllX(); + fpr.ReleaseSpillLocks(); +} + +#endif + + extern const double mulTableVf2i[32] = { (1ULL<<0),(1ULL<<1),(1ULL<<2),(1ULL<<3), (1ULL<<4),(1ULL<<5),(1ULL<<6),(1ULL<<7), @@ -1759,22 +1942,17 @@ void Jit::Comp_Vfim(MIPSOpcode op) { static float sincostemp[2]; void SinCos(float angle) { -#ifndef M_PI_2 -#define M_PI_2 1.57079632679489661923 -#endif - angle *= (float)M_PI_2; + angle *= (float)1.57079632679489661923; // pi / 2 sincostemp[0] = sinf(angle); sincostemp[1] = cosf(angle); } void SinCosNegSin(float angle) { -#ifndef M_PI_2 -#define M_PI_2 1.57079632679489661923 -#endif - angle *= (float)M_PI_2; + angle *= (float)1.57079632679489661923; // pi / 2 sincostemp[0] = -sinf(angle); sincostemp[1] = cosf(angle); } + // Very heavily used by FF:CC void Jit::Comp_VRot(MIPSOpcode op) { // DISABLE; diff --git a/Core/MIPS/x86/Jit.cpp b/Core/MIPS/x86/Jit.cpp index 3d0f6bc36..ac8f635e5 100644 --- a/Core/MIPS/x86/Jit.cpp +++ b/Core/MIPS/x86/Jit.cpp @@ -18,6 +18,8 @@ #include #include +#include "math/math_util.h" + #include "Common/ChunkFile.h" #include 
"Core/Core.h" #include "Core/System.h" @@ -116,11 +118,18 @@ Jit::Jit(MIPSState *mips) : blocks(mips, this), mips_(mips) fpr.SetEmitter(this); AllocCodeSpace(1024 * 1024 * 16); asm_.Init(mips, this); - + halfToFloat_ = new float[65536]; + for (int i = 0; i < 65536; i++) { + halfToFloat_[i] = ExpandHalf((u16)i); + } // TODO: If it becomes possible to switch from the interpreter, this should be set right. js.startDefaultPrefix = true; } +Jit::~Jit() { + delete [] halfToFloat_; +} + void Jit::DoState(PointerWrap &p) { auto s = p.Section("Jit", 1); diff --git a/Core/MIPS/x86/Jit.h b/Core/MIPS/x86/Jit.h index 9c8cfdbe6..3ad07d73e 100644 --- a/Core/MIPS/x86/Jit.h +++ b/Core/MIPS/x86/Jit.h @@ -170,6 +170,7 @@ class Jit : public Gen::XCodeBlock { public: Jit(MIPSState *mips); + ~Jit(); void DoState(PointerWrap &p); static void DoDummyState(PointerWrap &p); @@ -339,8 +340,9 @@ private: MIPSState *mips_; - class JitSafeMem - { + float *halfToFloat_; // lookup table. + + class JitSafeMem { public: JitSafeMem(Jit *jit, MIPSGPReg raddr, s32 offset, u32 alignMask = 0xFFFFFFFF); @@ -367,8 +369,7 @@ private: void NextSlowRead(void *safeFunc, int suboffset); private: - enum ReadType - { + enum ReadType { MEM_READ, MEM_WRITE, }; diff --git a/Core/MIPS/x86/RegCacheFPU.h b/Core/MIPS/x86/RegCacheFPU.h index cd8b669f2..e8f49568c 100644 --- a/Core/MIPS/x86/RegCacheFPU.h +++ b/Core/MIPS/x86/RegCacheFPU.h @@ -139,9 +139,9 @@ public: MIPSState *mips; -private: - X64Reg GetFreeXReg(); void FlushX(X64Reg reg); + X64Reg GetFreeXReg(); +private: const int *GetAllocationOrder(int &count); MIPSCachedFPReg regs[NUM_MIPS_FPRS]; From 20174d94107fbd32cca3ad1a075cdb6f69635c42 Mon Sep 17 00:00:00 2001 From: Henrik Rydgard Date: Sat, 28 Sep 2013 22:11:24 +0200 Subject: [PATCH 5/5] Delete the lookup table version of vh2f --- Core/MIPS/x86/CompVFPU.cpp | 90 +------------------------------------- Core/MIPS/x86/Jit.cpp | 5 --- Core/MIPS/x86/Jit.h | 2 - 3 files changed, 2 insertions(+), 95 deletions(-) diff 
--git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp index cbbfd695e..8c60df33d 100644 --- a/Core/MIPS/x86/CompVFPU.cpp +++ b/Core/MIPS/x86/CompVFPU.cpp @@ -17,6 +17,7 @@ #include #include +#include #include "base/logging.h" #include "math/math_util.h" @@ -1083,90 +1084,6 @@ void Jit::Comp_Vi2f(MIPSOpcode op) { fpr.ReleaseSpillLocks(); } - - -#if 0 - -// One possible approach - -// Uses lookup tables to decode half floats. Not really sure how bad the CPU cache impact will be... -void Jit::Comp_Vh2f(MIPSOpcode op) { - CONDITIONAL_DISABLE; - if (js.HasUnknownPrefix()) - DISABLE; - - VectorSize sz = GetVecSize(op); - VectorSize outsize; - switch (sz) { - case V_Single: - outsize = V_Pair; - break; - case V_Pair: - outsize = V_Quad; - break; - } - - u8 sregs[4], dregs[4]; - GetVectorRegsPrefixS(sregs, sz, _VS); - GetVectorRegsPrefixD(dregs, outsize, _VD); - - switch (sz) { - case V_Single: - // Flush so we can access it with integer instructions - // Grab ECX as a secondary working register - gpr.FlushLockX(ECX); - fpr.StoreFromRegisterV(sregs[0]); - MOV(32, R(EAX), fpr.V(sregs[0])); - fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY); - //XOR(32, R(EAX), R(EAX)); - MOV(32, R(ECX), R(EAX)); - AND(32, R(EAX), Imm32(0xFFFF)); - SHR(32, R(ECX), Imm8(16)); -#ifdef _M_IX86 - MOVSS(fpr.VX(dregs[0]), MScaled(EAX, 4, (u32)halfToFloat_)); - MOVSS(fpr.VX(dregs[1]), MScaled(ECX, 4, (u32)halfToFloat_)); -#endif - break; - case V_Pair: - // Flush so we can access it with integer instructions - // Grab ECX and EDX as a secondary/third working register - gpr.FlushLockX(ECX, EDX); - fpr.StoreFromRegisterV(sregs[0]); - fpr.StoreFromRegisterV(sregs[1]); - MOV(32, R(EAX), fpr.V(sregs[0])); - MOV(32, R(EDX), fpr.V(sregs[1])); - fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY); - //XOR(32, R(EAX), R(EAX)); - MOV(32, R(ECX), R(EAX)); - AND(32, R(EAX), Imm32(0xFFFF)); - SHR(32, R(ECX), Imm8(16)); -#ifdef _M_IX86 - MOVSS(fpr.VX(dregs[0]), MScaled(EAX, 4, (u32)halfToFloat_)); 
- MOVSS(fpr.VX(dregs[1]), MScaled(ECX, 4, (u32)halfToFloat_)); -#endif - //XOR(32, R(EAX), R(EAX)); - MOV(32, R(ECX), R(EDX)); - AND(32, R(EDX), Imm32(0xFFFF)); - SHR(32, R(ECX), Imm8(16)); -#ifdef _M_IX86 - MOVSS(fpr.VX(dregs[2]), MScaled(EDX, 4, (u32)halfToFloat_)); - MOVSS(fpr.VX(dregs[3]), MScaled(ECX, 4, (u32)halfToFloat_)); -#endif - break; - case V_Triple: - case V_Quad: - _dbg_assert_msg_(CPU, 0, "Trying to interpret Int_Vh2f instruction that can't be interpreted"); - break; - } - ApplyPrefixD(dregs, outsize); - gpr.UnlockAllX(); - fpr.ReleaseSpillLocks(); -} - -#else - -#undef CONST - // Planning for true SIMD // Sequence for gathering sparse registers into one SIMD: @@ -1192,7 +1109,7 @@ void Jit::Comp_Vh2f(MIPSOpcode op) { // Translation of ryg's half_to_float5_SSE2 void Jit::Comp_Vh2f(MIPSOpcode op) { -#define SSE_CONST4(name, val) static const __declspec(align(16)) u32 name[4] = { (val), (val), (val), (val) } +#define SSE_CONST4(name, val) static const u32 MEMORY_ALIGNED16(name[4]) = { (val), (val), (val), (val) } SSE_CONST4(mask_nosign, 0x7fff); SSE_CONST4(magic, (254 - 15) << 23); @@ -1267,9 +1184,6 @@ void Jit::Comp_Vh2f(MIPSOpcode op) { fpr.ReleaseSpillLocks(); } -#endif - - extern const double mulTableVf2i[32] = { (1ULL<<0),(1ULL<<1),(1ULL<<2),(1ULL<<3), (1ULL<<4),(1ULL<<5),(1ULL<<6),(1ULL<<7), diff --git a/Core/MIPS/x86/Jit.cpp b/Core/MIPS/x86/Jit.cpp index ac8f635e5..7322ccd4e 100644 --- a/Core/MIPS/x86/Jit.cpp +++ b/Core/MIPS/x86/Jit.cpp @@ -118,16 +118,11 @@ Jit::Jit(MIPSState *mips) : blocks(mips, this), mips_(mips) fpr.SetEmitter(this); AllocCodeSpace(1024 * 1024 * 16); asm_.Init(mips, this); - halfToFloat_ = new float[65536]; - for (int i = 0; i < 65536; i++) { - halfToFloat_[i] = ExpandHalf((u16)i); - } // TODO: If it becomes possible to switch from the interpreter, this should be set right. 
js.startDefaultPrefix = true; } Jit::~Jit() { - delete [] halfToFloat_; } void Jit::DoState(PointerWrap &p) diff --git a/Core/MIPS/x86/Jit.h b/Core/MIPS/x86/Jit.h index 3ad07d73e..b8b49ee48 100644 --- a/Core/MIPS/x86/Jit.h +++ b/Core/MIPS/x86/Jit.h @@ -340,8 +340,6 @@ private: MIPSState *mips_; - float *halfToFloat_; // lookup table. - class JitSafeMem { public: JitSafeMem(Jit *jit, MIPSGPReg raddr, s32 offset, u32 alignMask = 0xFFFFFFFF);