diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp index 6f29ac165..8c60df33d 100644 --- a/Core/MIPS/x86/CompVFPU.cpp +++ b/Core/MIPS/x86/CompVFPU.cpp @@ -17,6 +17,7 @@ #include #include +#include #include "base/logging.h" #include "math/math_util.h" @@ -1083,8 +1084,104 @@ void Jit::Comp_Vi2f(MIPSOpcode op) { fpr.ReleaseSpillLocks(); } +// Planning for true SIMD + +// Sequence for gathering sparse registers into one SIMD: +// MOVSS(XMM0, fpr.R(sregs[0])); +// MOVSS(XMM1, fpr.R(sregs[1])); +// MOVSS(XMM2, fpr.R(sregs[2])); +// MOVSS(XMM3, fpr.R(sregs[3])); +// SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); // XMM0 = S1 S1 S0 S0 +// SHUFPS(XMM2, R(XMM3), _MM_SHUFFLE(0, 0, 0, 0)); // XMM2 = S3 S3 S2 S2 +// SHUFPS(XMM0, R(XMM2), _MM_SHUFFLE(2, 0, 2, 0)); // XMM0 = S3 S2 S1 S0 +// Some punpckwd etc would also work. + +// Sequence for scattering a SIMD register to sparse registers: +// (Very serial though, better methods may be possible) +// MOVSS(fpr.R(sregs[0]), XMM0); +// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); +// MOVSS(fpr.R(sregs[1]), XMM0); +// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); +// MOVSS(fpr.R(sregs[2]), XMM0); +// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); +// MOVSS(fpr.R(sregs[3]), XMM0); + + +// Translation of ryg's half_to_float5_SSE2 void Jit::Comp_Vh2f(MIPSOpcode op) { - DISABLE; +#define SSE_CONST4(name, val) static const u32 MEMORY_ALIGNED16(name[4]) = { (val), (val), (val), (val) } + + SSE_CONST4(mask_nosign, 0x7fff); + SSE_CONST4(magic, (254 - 15) << 23); + SSE_CONST4(was_infnan, 0x7bff); + SSE_CONST4(exp_infnan, 255 << 23); + +#undef SSE_CONST4 + + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix()) + DISABLE; + + VectorSize sz = GetVecSize(op); + VectorSize outsize; + switch (sz) { + case V_Single: + outsize = V_Pair; + DISABLE; + break; + case V_Pair: + outsize = V_Quad; + break; + } + + u8 sregs[4], dregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixD(dregs, outsize, _VD); + + // Force ourselves an extra xreg as temp space. + X64Reg tempR = fpr.GetFreeXReg(); + + MOVSS(XMM0, fpr.V(sregs[0])); + if (sz != V_Single) { + MOVSS(XMM1, fpr.V(sregs[1])); + PUNPCKLDQ(XMM0, R(XMM1)); + } + XORPS(XMM1, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + + // OK, 16 bits in each word. + // Let's go. Deep magic here. + MOVAPS(XMM1, R(XMM0)); + // MOVAPS(XMM2, R(XMM0)); // xmm2 = h + ANDPS(XMM0, M((void *)mask_nosign)); // xmm0 = expmant + XORPS(XMM1, R(XMM0)); // xmm1 = justsign = expmant ^ xmm0 + MOVAPS(tempR, R(XMM0)); + PCMPGTD(tempR, M((void *)was_infnan)); // xmm2 = b_wasinfnan + PSLLD(XMM0, 13); + MULPS(XMM0, M((void *)magic)); /// xmm0 = scaled + PSLLD(XMM1, 16); // xmm1 = sign + ANDPS(tempR, M((void *)exp_infnan)); + ORPS(XMM1, R(tempR)); + ORPS(XMM0, R(XMM1)); + + fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY); + + // TODO: Could apply D-prefix in parallel here... + + MOVSS(fpr.V(dregs[0]), XMM0); + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); + MOVSS(fpr.V(dregs[1]), XMM0); + + if (sz != V_Single) { + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); + MOVSS(fpr.V(dregs[2]), XMM0); + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); + MOVSS(fpr.V(dregs[3]), XMM0); + } + + ApplyPrefixD(dregs, outsize); + gpr.UnlockAllX(); + fpr.ReleaseSpillLocks(); } extern const double mulTableVf2i[32] = { @@ -1759,22 +1856,17 @@ void Jit::Comp_Vfim(MIPSOpcode op) { static float sincostemp[2]; void SinCos(float angle) { -#ifndef M_PI_2 -#define M_PI_2 1.57079632679489661923 -#endif - angle *= (float)M_PI_2; + angle *= (float)1.57079632679489661923; // pi / 2 sincostemp[0] = sinf(angle); sincostemp[1] = cosf(angle); } void SinCosNegSin(float angle) { -#ifndef M_PI_2 -#define M_PI_2 1.57079632679489661923 -#endif - angle *= (float)M_PI_2; + angle *= (float)1.57079632679489661923; // pi / 2 sincostemp[0] = -sinf(angle); sincostemp[1] = cosf(angle); } + // Very heavily used by FF:CC void Jit::Comp_VRot(MIPSOpcode op) { // DISABLE; diff --git a/Core/MIPS/x86/Jit.cpp b/Core/MIPS/x86/Jit.cpp index 3d0f6bc36..7322ccd4e 100644 --- a/Core/MIPS/x86/Jit.cpp +++ b/Core/MIPS/x86/Jit.cpp @@ -18,6 +18,8 @@ #include #include +#include "math/math_util.h" + #include "Common/ChunkFile.h" #include "Core/Core.h" #include "Core/System.h" @@ -116,11 +118,13 @@ Jit::Jit(MIPSState *mips) : blocks(mips, this), mips_(mips) fpr.SetEmitter(this); AllocCodeSpace(1024 * 1024 * 16); asm_.Init(mips, this); - // TODO: If it becomes possible to switch from the interpreter, this should be set right. js.startDefaultPrefix = true; } +Jit::~Jit() { +} + void Jit::DoState(PointerWrap &p) { auto s = p.Section("Jit", 1); diff --git a/Core/MIPS/x86/Jit.h b/Core/MIPS/x86/Jit.h index 9c8cfdbe6..b8b49ee48 100644 --- a/Core/MIPS/x86/Jit.h +++ b/Core/MIPS/x86/Jit.h @@ -170,6 +170,7 @@ class Jit : public Gen::XCodeBlock { public: Jit(MIPSState *mips); + ~Jit(); void DoState(PointerWrap &p); static void DoDummyState(PointerWrap &p); @@ -339,8 +340,7 @@ private: MIPSState *mips_; - class JitSafeMem - { + class JitSafeMem { public: JitSafeMem(Jit *jit, MIPSGPReg raddr, s32 offset, u32 alignMask = 0xFFFFFFFF); @@ -367,8 +367,7 @@ private: void NextSlowRead(void *safeFunc, int suboffset); private: - enum ReadType - { + enum ReadType { MEM_READ, MEM_WRITE, }; diff --git a/Core/MIPS/x86/RegCacheFPU.h b/Core/MIPS/x86/RegCacheFPU.h index cd8b669f2..e8f49568c 100644 --- a/Core/MIPS/x86/RegCacheFPU.h +++ b/Core/MIPS/x86/RegCacheFPU.h @@ -139,9 +139,9 @@ public: MIPSState *mips; -private: - X64Reg GetFreeXReg(); void FlushX(X64Reg reg); + X64Reg GetFreeXReg(); +private: const int *GetAllocationOrder(int &count); MIPSCachedFPReg regs[NUM_MIPS_FPRS];