From 98c185591600241f3dc6a1b0ca62e1d376ef5ebf Mon Sep 17 00:00:00 2001 From: cottonvibes Date: Tue, 21 Jul 2009 06:36:56 +0000 Subject: [PATCH] microVU: - More regalloc work/fixes - Implemented some untested SSE4.1 optimizations (can't test since I don't have an SSE4.1 CPU) pcsx2: - Added an SSE4 instruction to the legacy emitter (just a wrapper to the new emitter function). Note: Currently the tri-ace fix and the logical min-max code (the thing that made DaZ safe to use) are broken with mVU. Will fix later. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1547 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/x86/ix86/ix86_legacy_instructions.h | 1 + pcsx2/x86/ix86/ix86_legacy_sse.cpp | 1 + pcsx2/x86/microVU_IR.h | 30 ++++++++++------- pcsx2/x86/microVU_Misc.inl | 40 +++++++++++++++-------- pcsx2/x86/microVU_Upper.inl | 40 ++++++++--------------- 5 files changed, 62 insertions(+), 50 deletions(-) diff --git a/pcsx2/x86/ix86/ix86_legacy_instructions.h b/pcsx2/x86/ix86/ix86_legacy_instructions.h index ad91a6a51..1fe98cb92 100644 --- a/pcsx2/x86/ix86/ix86_legacy_instructions.h +++ b/pcsx2/x86/ix86/ix86_legacy_instructions.h @@ -1349,6 +1349,7 @@ extern void SSE4_DPPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8); extern void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8); extern void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8); extern void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8); +extern void SSE4_EXTRACTPS_XMM_to_M32(uptr to, x86SSERegType from, u8 imm8); extern void SSE4_BLENDPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8); extern void SSE4_BLENDVPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from); extern void SSE4_BLENDVPS_M128_to_XMM(x86SSERegType to, uptr from); diff --git a/pcsx2/x86/ix86/ix86_legacy_sse.cpp b/pcsx2/x86/ix86/ix86_legacy_sse.cpp index ed1bddcbc..66074ca08 100644 --- a/pcsx2/x86/ix86/ix86_legacy_sse.cpp +++ b/pcsx2/x86/ix86/ix86_legacy_sse.cpp @@ -364,6 +364,7 @@
emitterT void SSE4_PINSRD_R32_to_XMM(x86SSERegType to, x86IntRegType from, u8 im emitterT void SSE4_INSERTPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8) { xINSERTPS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); } emitterT void SSE4_EXTRACTPS_XMM_to_R32(x86IntRegType to, x86SSERegType from, u8 imm8) { xEXTRACTPS( xRegister32(to), xRegisterSSE(from), imm8 ); } +emitterT void SSE4_EXTRACTPS_XMM_to_M32(uptr to, x86SSERegType from, u8 imm8) { xEXTRACTPS( (u32*)to, xRegisterSSE(from), imm8 ); } emitterT void SSE4_DPPS_XMM_to_XMM(x86SSERegType to, x86SSERegType from, u8 imm8) { xDP.PS( xRegisterSSE(to), xRegisterSSE(from), imm8 ); } emitterT void SSE4_DPPS_M128_to_XMM(x86SSERegType to, uptr from, u8 imm8) { xDP.PS( xRegisterSSE(to), (void*)from, imm8 ); } diff --git a/pcsx2/x86/microVU_IR.h b/pcsx2/x86/microVU_IR.h index 732daf9bc..7e0ba4e85 100644 --- a/pcsx2/x86/microVU_IR.h +++ b/pcsx2/x86/microVU_IR.h @@ -161,6 +161,7 @@ struct microIR { // Reg Alloc //------------------------------------------------------------------ +void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW); void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW); void mVUloadReg(int reg, uptr offset, int xyzw); @@ -223,7 +224,7 @@ public: } void writeBackReg(int reg) { if ((xmmReg[reg].reg > 0) && xmmReg[reg].xyzw) { // Reg was modified and not Temp or vf0 - if (xmmReg[reg].reg == 32) SSE_MOVAPS_XMM_to_M128((uptr)&vuRegs->ACC.UL[0], reg); + if (xmmReg[reg].reg == 32) mVUsaveReg(reg, (uptr)&vuRegs->ACC.UL[0], xmmReg[reg].xyzw, 1); else mVUsaveReg(reg, (uptr)&vuRegs->VF[xmmReg[reg].reg].UL[0], xmmReg[reg].xyzw, 1); for (int i = 0; i < xmmTotal; i++) { if (i == reg) continue; @@ -241,20 +242,26 @@ public: clearReg(reg); // Clear Reg } void clearNeeded(int reg) { - // ToDo: Merge Regs Support xmmReg[reg].isNeeded = 0; if (xmmReg[reg].xyzw) { // Reg was modified if (xmmReg[reg].reg > 0) { - if (xmmReg[reg].xyzw < 0xf) writeBackReg(reg); // Always Write Back Partial Writes - if 
(xmmReg[reg].reg > 0) { - for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg - if (i == reg) continue; - if (xmmReg[i].reg == xmmReg[reg].reg) { - if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon::Error("microVU Error: clearNeeded()"); - clearReg(i); + int mergeRegs = 0; + if (xmmReg[reg].xyzw < 0xf) { mergeRegs = 1; } // Try to merge partial writes + for (int i = 0; i < xmmTotal; i++) { // Invalidate any other read-only regs of same vfReg + if (i == reg) continue; + if (xmmReg[i].reg == xmmReg[reg].reg) { + if (xmmReg[i].xyzw && xmmReg[i].xyzw < 0xf) DevCon::Error("microVU Error: clearNeeded() [%d]", params xmmReg[i].reg); + if (mergeRegs == 1) { + mVUmergeRegs(i, reg, xmmReg[reg].xyzw, 1); + xmmReg[i].xyzw = 0xf; + xmmReg[i].count = counter; + mergeRegs = 2; } + else clearReg(i); } } + if (mergeRegs == 2) clearReg(reg); // Clear Current Reg if Merged + else if (mergeRegs) writeBackReg(reg); // Write Back Partial Writes if couldn't merge } else clearReg(reg); // If Reg was temp or vf0, then invalidate itself } @@ -263,7 +270,7 @@ public: counter++; if (vfLoadReg >= 0) { // Search For Cached Regs for (int i = 0; i < xmmTotal; i++) { - if ((xmmReg[i].reg == vfLoadReg) && (!xmmReg[i].xyzw // Reg Was Not Modified + if ((xmmReg[i].reg == vfLoadReg) && (!xmmReg[i].xyzw // Reg Was Not Modified || (/*!xmmReg[i].isNeeded &&*/ xmmReg[i].reg && (xmmReg[i].xyzw==0xf)))) { // Reg Had All Vectors Modified and != VF0 int z = i; if (vfWriteReg >= 0) { // Reg will be modified @@ -296,7 +303,8 @@ public: writeBackReg(x); if (vfWriteReg >= 0) { // Reg Will Be Modified (allow partial reg loading) - if (vfLoadReg == 32) mVUloadReg(x, (uptr)&vuRegs->ACC.UL[0], xyzw); + if ((vfLoadReg == 0) && !(xyzw & 1)) { SSE2_PXOR_XMM_to_XMM(x, x); } + else if (vfLoadReg == 32) mVUloadReg(x, (uptr)&vuRegs->ACC.UL[0], xyzw); else if (vfLoadReg >= 0) mVUloadReg(x, (uptr)&vuRegs->VF[vfLoadReg].UL[0], xyzw); xmmReg[x].reg = vfWriteReg; xmmReg[x].xyzw = xyzw; diff 
--git a/pcsx2/x86/microVU_Misc.inl b/pcsx2/x86/microVU_Misc.inl index eb84e02a7..cdf6d9687 100644 --- a/pcsx2/x86/microVU_Misc.inl +++ b/pcsx2/x86/microVU_Misc.inl @@ -104,10 +104,16 @@ void mVUsaveReg(int reg, uptr offset, int xyzw, bool modXYZW) { return;*/ switch ( xyzw ) { - case 5: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY - SSE_MOVSS_XMM_to_M32(offset+4, reg); - SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW - SSE_MOVSS_XMM_to_M32(offset+12, reg); + case 5: if (cpucaps.hasStreamingSIMD4Extensions) { + SSE4_EXTRACTPS_XMM_to_M32(offset+4, reg, 1); + SSE4_EXTRACTPS_XMM_to_M32(offset+12, reg, 3); + } + else { + SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xe1); //WZXY + SSE_MOVSS_XMM_to_M32(offset+4, reg); + SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xff); //WWWW + SSE_MOVSS_XMM_to_M32(offset+12, reg); + } break; // YW case 6: SSE2_PSHUFD_XMM_to_XMM(reg, reg, 0xc9); SSE_MOVLPS_XMM_to_M64(offset+4, reg); @@ -203,25 +209,33 @@ void mVUsaveReg2(int reg, int gprReg, u32 offset, int xyzw) { } } -// Modifies the Source Reg! -void mVUmergeRegs(int dest, int src, int xyzw) { +// Modifies the Source Reg! 
(ToDo: Optimize modXYZW = 1 cases) +void mVUmergeRegs(int dest, int src, int xyzw, bool modXYZW = 0) { xyzw &= 0xf; if ( (dest != src) && (xyzw != 0) ) { - if ( cpucaps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf) ) { + if (cpucaps.hasStreamingSIMD4Extensions && (xyzw != 0x8) && (xyzw != 0xf)) { + if (modXYZW) { + if (xyzw == 1) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 3, 0)); return; } + else if (xyzw == 2) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 2, 0)); return; } + else if (xyzw == 4) { SSE4_INSERTPS_XMM_to_XMM(dest, src, _MM_MK_INSERTPS_NDX(0, 1, 0)); return; } + } xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3); SSE4_BLENDPS_XMM_to_XMM(dest, src, xyzw); } else { switch (xyzw) { - case 1: SSE_MOVHLPS_XMM_to_XMM(src, dest); - SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4); + case 1: if (modXYZW) mVUunpack_xyzw(src, src, 0); + SSE_MOVHLPS_XMM_to_XMM(src, dest); // src = Sw Sz Dw Dz + SSE_SHUFPS_XMM_to_XMM(dest, src, 0xc4); // 11 00 01 00 break; - case 2: SSE_MOVHLPS_XMM_to_XMM(src, dest); + case 2: if (modXYZW) mVUunpack_xyzw(src, src, 0); + SSE_MOVHLPS_XMM_to_XMM(src, dest); SSE_SHUFPS_XMM_to_XMM(dest, src, 0x64); break; case 3: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xe4); break; - case 4: SSE_MOVSS_XMM_to_XMM(src, dest); + case 4: if (modXYZW) mVUunpack_xyzw(src, src, 0); + SSE_MOVSS_XMM_to_XMM(src, dest); SSE2_MOVSD_XMM_to_XMM(dest, src); break; case 5: SSE_SHUFPS_XMM_to_XMM(dest, src, 0xd8); @@ -333,8 +347,8 @@ void MIN_MAX_(x86SSERegType to, x86SSERegType from, bool min) { // Warning: Modifies from and to's upper 3 vectors void MIN_MAX_SS(x86SSERegType to, x86SSERegType from, bool min) { SSE_SHUFPS_XMM_to_XMM (to, from, 0); - SSE2_PAND_M128_to_XMM (to, (uptr)MIN_MAX_MASK1); - SSE2_POR_M128_to_XMM (to, (uptr)MIN_MAX_MASK2); + SSE2_PAND_M128_to_XMM (to, (uptr)MIN_MAX_MASK1); + SSE2_POR_M128_to_XMM (to, (uptr)MIN_MAX_MASK2); SSE2_PSHUFD_XMM_to_XMM(from, to, 0xee); if (min) 
SSE2_MINPD_XMM_to_XMM(to, from); else SSE2_MAXPD_XMM_to_XMM(to, from); diff --git a/pcsx2/x86/microVU_Upper.inl b/pcsx2/x86/microVU_Upper.inl index 141fd0f9c..1f46c8d7a 100644 --- a/pcsx2/x86/microVU_Upper.inl +++ b/pcsx2/x86/microVU_Upper.inl @@ -106,12 +106,21 @@ void mVU_printOP(microVU* mVU, int opCase, char* opName, bool isACC) { opCase4 { if (isACC) { mVUlogACC(); } else { mVUlogFd(); } mVUlogQ(); } } +// Sets Up Pass1 Info for Normal, BC, I, and Q Cases +void setupPass1(microVU* mVU, int opCase, bool isACC, bool noFlagUpdate) { + opCase1 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); } + opCase2 { mVUanalyzeFMAC3(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); } + opCase3 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); } + opCase4 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); } + if (noFlagUpdate) { sFLAG.doFlag = 0; } +} + // Sets Up Ft Reg for Normal, BC, I, and Q Cases void setupFtReg(microVU* mVU, int& Ft, int opCase) { opCase1 { Ft = mVU->regAlloc->allocReg(_Ft_); } opCase2 { if (!_XYZW_SS) { - Ft = mVU->regAlloc->allocReg(_Ft_, 0, _X_Y_Z_W); + Ft = mVU->regAlloc->allocReg(_Ft_, 0, 0xf); mVUunpack_xyzw(Ft, Ft, _bc_); } else Ft = mVU->regAlloc->allocReg(_Ft_); @@ -122,13 +131,7 @@ void setupFtReg(microVU* mVU, int& Ft, int opCase) { // Normal FMAC Opcodes void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, char* opName) { - pass1 { - opCase1 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); } - opCase2 { mVUanalyzeFMAC3(mVU, ((isACC) ? 0 : _Fd_), _Fs_, _Ft_); } - opCase3 { mVUanalyzeFMAC1(mVU, ((isACC) ? 0 : _Fd_), _Fs_, 0); } - opCase4 { mVUanalyzeFMAC1(mVU, ((isACC) ? 
0 : _Fd_), _Fs_, 0); } - if ((opType == 3) || (opType == 4)) { sFLAG.doFlag = 0; } - } + pass1 { setupPass1(mVU, opCase, isACC, ((opType == 3) || (opType == 4))); } pass2 { int Fs, Ft, ACC; mVU->regAlloc->reset(); // Reset for Testing @@ -169,12 +172,7 @@ void mVU_FMACa(microVU* mVU, int recPass, int opCase, int opType, bool isACC, ch // MADDA/MSUBA Opcodes void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, char* opName) { - pass1 { - opCase1 { mVUanalyzeFMAC1(mVU, 0, _Fs_, _Ft_); } - opCase2 { mVUanalyzeFMAC3(mVU, 0, _Fs_, _Ft_); } - opCase3 { mVUanalyzeFMAC1(mVU, 0, _Fs_, 0); } - opCase4 { mVUanalyzeFMAC1(mVU, 0, _Fs_, 0); } - } + pass1 { setupPass1(mVU, opCase, 1, 0); } pass2 { int Fs, Ft, ACC; mVU->regAlloc->reset(); // Reset for Testing @@ -218,12 +216,7 @@ void mVU_FMACb(microVU* mVU, int recPass, int opCase, int opType, char* opName) // MADD Opcodes void mVU_FMACc(microVU* mVU, int recPass, int opCase, char* opName) { - pass1 { - opCase1 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, _Ft_); } - opCase2 { mVUanalyzeFMAC3(mVU, _Fd_, _Fs_, _Ft_); } - opCase3 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); } - opCase4 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); } - } + pass1 { setupPass1(mVU, opCase, 0, 0); } pass2 { int Fs, Ft, ACC; mVU->regAlloc->reset(); // Reset for Testing @@ -255,12 +248,7 @@ void mVU_FMACc(microVU* mVU, int recPass, int opCase, char* opName) { // MSUB Opcodes void mVU_FMACd(microVU* mVU, int recPass, int opCase, char* opName) { - pass1 { - opCase1 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, _Ft_); } - opCase2 { mVUanalyzeFMAC3(mVU, _Fd_, _Fs_, _Ft_); } - opCase3 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); } - opCase4 { mVUanalyzeFMAC1(mVU, _Fd_, _Fs_, 0); } - } + pass1 { setupPass1(mVU, opCase, 0, 0); } pass2 { int Fs, Ft, Fd; mVU->regAlloc->reset(); // Reset for Testing