From 29109d25af75d50f2a1cad2ef43eb31187f914f9 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 20 Apr 2013 01:11:40 -0700
Subject: [PATCH] Non-optimal vmmul for x86 jit.

It's faster than interpreter anyway, but it could be much better.
---
Reviewer notes (free-form text placed between the "---" separator and the
diffstat; everything below is derived only from the hunks in this patch):

  * Core/MIPS/MIPSTables.cpp: the four "vmmul" entries at the start of
    tableVFPU6 change their second INSTR argument from &Jit::Comp_Generic
    to &Jit::Comp_Vmmul. The remaining arguments (Dis_MatrixMult,
    Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX) are unchanged.

  * Core/MIPS/ARM/ArmCompVFPU.cpp, ArmJit.h: Jit::Comp_Vmmul is declared
    and defined for ARM, but its body consists solely of DISABLE.

  * Core/MIPS/x86/CompVFPU.cpp, Jit.h: Jit::Comp_Vmmul is declared and
    implemented for x86. Reading the added function top to bottom:
      1. CONDITIONAL_DISABLE is invoked, then DISABLE is invoked when
         js.MayHavePrefix() is true (the in-code TODO says prefix
         handling is unverified).
      2. n = GetMatrixSide(GetMtxSize(op)); GetMatrixRegs fills sregs,
         tregs and dregs from _VS, _VT and _VD.
      3. For every (a, b) with 0 <= a, b < n: XMM0 is cleared with XORPS,
         then for each c < n the product of sregs[b * 4 + c] and
         tregs[a * 4 + c] is formed in XMM1 (MOVSS + MULSS) and added to
         XMM0 (ADDSS). The sum is moved into a register obtained from
         fpr.GetTempV(), passed to fpr.StoreFromRegisterV(), and its
         index is remembered in tempregs[a * 4 + b].
      4. A second pass maps each remembered temp again and emits a MOVSS
         from it into dregs[a * 4 + b]; finally fpr.ReleaseSpillLocks()
         is called.
    NOTE(review): the two-pass temp scheme presumably exists so that a
    destination matrix overlapping a source is not overwritten while sums
    are still being emitted; the in-code TODO states overlap is untested,
    so confirm before relying on it.
    NOTE(review): up to n * n temps are requested from fpr.GetTempV() in
    one instruction; whether the register cache can supply that many is
    not visible in this patch. TODO confirm.

  * Formatting: this copy of the patch had lost its line breaks and
    indentation. They have been restored here (tabs assumed for
    indentation; blank context lines placed so that each hunk matches the
    line counts in its "@@" header). Verify against the repository before
    applying with git am / patch.

 Core/MIPS/ARM/ArmCompVFPU.cpp |  4 +++
 Core/MIPS/ARM/ArmJit.h        |  1 +
 Core/MIPS/MIPSTables.cpp      |  8 +++---
 Core/MIPS/x86/CompVFPU.cpp    | 48 +++++++++++++++++++++++++++++++++++
 Core/MIPS/x86/Jit.h           |  1 +
 5 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/Core/MIPS/ARM/ArmCompVFPU.cpp b/Core/MIPS/ARM/ArmCompVFPU.cpp
index 21d294cdb..bf59bb6a3 100644
--- a/Core/MIPS/ARM/ArmCompVFPU.cpp
+++ b/Core/MIPS/ARM/ArmCompVFPU.cpp
@@ -718,4 +718,8 @@ namespace MIPSComp
 		DISABLE;
 	}
 
+	void Jit::Comp_Vmmul(u32 op) {
+		DISABLE;
+	}
+
 }
diff --git a/Core/MIPS/ARM/ArmJit.h b/Core/MIPS/ARM/ArmJit.h
index b2cb72beb..1e726db71 100644
--- a/Core/MIPS/ARM/ArmJit.h
+++ b/Core/MIPS/ARM/ArmJit.h
@@ -194,6 +194,7 @@ public:
 	void Comp_Vmtvc(u32 op);
 	void Comp_Vmmov(u32 op);
 	void Comp_VScl(u32 op);
+	void Comp_Vmmul(u32 op);
 
 	ArmJitBlockCache *GetBlockCache() { return &blocks; }
 
diff --git a/Core/MIPS/MIPSTables.cpp b/Core/MIPS/MIPSTables.cpp
index 8b347554e..f52773591 100644
--- a/Core/MIPS/MIPSTables.cpp
+++ b/Core/MIPS/MIPSTables.cpp
@@ -626,10 +626,10 @@ MIPSInstruction tableVFPU5[8] = //110111 xxx
 const MIPSInstruction tableVFPU6[32] = //111100 xxx
 {
 //0
-	INSTR("vmmul",&Jit::Comp_Generic, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
-	INSTR("vmmul",&Jit::Comp_Generic, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
-	INSTR("vmmul",&Jit::Comp_Generic, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
-	INSTR("vmmul",&Jit::Comp_Generic, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
+	INSTR("vmmul",&Jit::Comp_Vmmul, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
+	INSTR("vmmul",&Jit::Comp_Vmmul, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
+	INSTR("vmmul",&Jit::Comp_Vmmul, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
+	INSTR("vmmul",&Jit::Comp_Vmmul, Dis_MatrixMult, Int_Vmmul, IS_VFPU|OUT_EAT_PREFIX),
 
 	INSTR("v(h)tfm2",&Jit::Comp_Generic, Dis_Vtfm, Int_Vtfm, IS_VFPU|OUT_EAT_PREFIX),
 	INSTR("v(h)tfm2",&Jit::Comp_Generic, Dis_Vtfm, Int_Vtfm, IS_VFPU|OUT_EAT_PREFIX),
diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp
index e8a97249c..2c1c73333 100644
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@@ -783,4 +783,52 @@ void Jit::Comp_VScl(u32 op) {
 	fpr.ReleaseSpillLocks();
 }
 
+void Jit::Comp_Vmmul(u32 op) {
+	CONDITIONAL_DISABLE;
+
+	// TODO: This probably ignores prefixes?
+	if (js.MayHavePrefix())
+		DISABLE;
+
+	MatrixSize sz = GetMtxSize(op);
+	int n = GetMatrixSide(sz);
+
+	u8 sregs[16], tregs[16], dregs[16];
+	GetMatrixRegs(sregs, sz, _VS);
+	GetMatrixRegs(tregs, sz, _VT);
+	GetMatrixRegs(dregs, sz, _VD);
+
+	// TODO: test overlap, fix non-optimal.
+	u8 tempregs[16];
+	for (int a = 0; a < n; a++)
+	{
+		for (int b = 0; b < n; b++)
+		{
+			XORPS(XMM0, R(XMM0));
+			for (int c = 0; c < n; c++)
+			{
+				MOVSS(XMM1, fpr.V(sregs[b * 4 + c]));
+				MULSS(XMM1, fpr.V(tregs[a * 4 + c]));
+				ADDSS(XMM0, R(XMM1));
+			}
+			u8 temp = (u8) fpr.GetTempV();
+			fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
+			MOVSS(fpr.VX(temp), R(XMM0));
+			fpr.StoreFromRegisterV(temp);
+			tempregs[a * 4 + b] = temp;
+		}
+	}
+	for (int a = 0; a < n; a++)
+	{
+		for (int b = 0; b < n; b++)
+		{
+			u8 temp = tempregs[a * 4 + b];
+			fpr.MapRegV(temp, 0);
+			MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
+		}
+	}
+
+	fpr.ReleaseSpillLocks();
+}
+
 }
diff --git a/Core/MIPS/x86/Jit.h b/Core/MIPS/x86/Jit.h
index 8a1c8cd97..f7f1121bf 100644
--- a/Core/MIPS/x86/Jit.h
+++ b/Core/MIPS/x86/Jit.h
@@ -204,6 +204,7 @@ public:
 	void Comp_Vmtvc(u32 op);
 	void Comp_Vmmov(u32 op);
 	void Comp_VScl(u32 op);
+	void Comp_Vmmul(u32 op);
 
 	void Comp_DoNothing(u32 op);
 