Add SSE4.1 PMOVSX/PMOVZX emitters and use them in the x86 vertex decoder JIT.

- XEmitter::WriteSSEOp2 emits instructions from the three-byte 0F 38 opcode
  map; PSHUFB now goes through it instead of hand-writing the bytes.
- PMOVSXBW/BD/WD and PMOVZXBW/BD/WD are added, each guarded by cpu_info.bSSE4_1.
- The vertex decoder uses the new emitters when cpu_info.bSSE4_1 is set and
  keeps the unpack + shift sequence as the fallback.

Review fixes folded into this patch:
- The PMOVZX* PanicAlert messages named the PMOVSX* instructions.
- Jit_NormalS16Skin's fallback path had lost its PUNPCKLWD, so the shift-based
  sign extension ran on still-packed 16-bit pairs.

diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp
index ccf05586d..f053ae6e3 100644
--- a/Common/x64Emitter.cpp
+++ b/Common/x64Emitter.cpp
@@ -1156,6 +1156,22 @@ void XEmitter::WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg a
 	arg.WriteRest(this, extrabytes);
 }
 
+// Emits an instruction from the three-byte 0F 38 opcode map: optional prefix
+// (0x66 when packed with size 64, 0xF2/0xF3 when scalar), REX, 0F 38 sseOp, operand.
+void XEmitter::WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
+{
+	if (size == 64 && packed)
+		Write8(0x66); //this time, override goes upwards
+	if (!packed)
+		Write8(size == 64 ? 0xF2 : 0xF3);
+	arg.operandReg = regOp;
+	arg.WriteRex(this, 0, 0);
+	Write8(0x0F);
+	Write8(0x38);
+	Write8(sseOp);
+	arg.WriteRest(this, extrabytes);
+}
+
 void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);}
 void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);}
 
@@ -1351,6 +1367,48 @@ void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x61, tr
 void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x62, true, dest, arg);}
 //void PUNPCKLQDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x60, true, dest, arg);}
 
+void XEmitter::PMOVSXBW(X64Reg dest, const OpArg &arg) {
+	if (!cpu_info.bSSE4_1) {
+		PanicAlert("Trying to use PMOVSXBW on a system that doesn't support it. Bad programmer.");
+	}
+	WriteSSEOp2(64, 0x20, true, dest, arg); // 66 0F 38 20
+}
+
+void XEmitter::PMOVSXBD(X64Reg dest, const OpArg &arg) {
+	if (!cpu_info.bSSE4_1) {
+		PanicAlert("Trying to use PMOVSXBD on a system that doesn't support it. Bad programmer.");
+	}
+	WriteSSEOp2(64, 0x21, true, dest, arg); // 66 0F 38 21
+}
+
+void XEmitter::PMOVSXWD(X64Reg dest, const OpArg &arg) {
+	if (!cpu_info.bSSE4_1) {
+		PanicAlert("Trying to use PMOVSXWD on a system that doesn't support it. Bad programmer.");
+	}
+	WriteSSEOp2(64, 0x23, true, dest, arg); // 66 0F 38 23
+}
+
+void XEmitter::PMOVZXBW(X64Reg dest, const OpArg &arg) {
+	if (!cpu_info.bSSE4_1) {
+		PanicAlert("Trying to use PMOVZXBW on a system that doesn't support it. Bad programmer.");
+	}
+	WriteSSEOp2(64, 0x30, true, dest, arg); // 66 0F 38 30
+}
+
+void XEmitter::PMOVZXBD(X64Reg dest, const OpArg &arg) {
+	if (!cpu_info.bSSE4_1) {
+		PanicAlert("Trying to use PMOVZXBD on a system that doesn't support it. Bad programmer.");
+	}
+	WriteSSEOp2(64, 0x31, true, dest, arg); // 66 0F 38 31
+}
+
+void XEmitter::PMOVZXWD(X64Reg dest, const OpArg &arg) {
+	if (!cpu_info.bSSE4_1) {
+		PanicAlert("Trying to use PMOVZXWD on a system that doesn't support it. Bad programmer.");
+	}
+	WriteSSEOp2(64, 0x33, true, dest, arg); // 66 0F 38 33
+}
+
 void XEmitter::PSRLW(X64Reg reg, int shift) {
 	WriteSSEOp(64, 0x71, true, (X64Reg)2, R(reg));
 	Write8(shift);
@@ -1418,13 +1476,7 @@ void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {
 	if (!cpu_info.bSSSE3) {
 		PanicAlert("Trying to use PSHUFB on a system that doesn't support it. Bad programmer.");
 	}
-	Write8(0x66);
-	arg.operandReg = dest;
-	arg.WriteRex(this, 0, 0);
-	Write8(0x0f);
-	Write8(0x38);
-	Write8(0x00);
-	arg.WriteRest(this, 0);
+	WriteSSEOp2(64, 0x00, true, dest, arg); // same bytes as before: 66 [REX] 0F 38 00
 }
 
 void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);}
diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h
index 1cface46f..97b6990f4 100644
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@@ -263,6 +263,7 @@ private:
 	void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext);
 	void WriteMXCSR(OpArg arg, int ext);
 	void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
 	void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
 
 protected:
@@ -604,6 +605,13 @@ public:
 	void PUNPCKLWD(X64Reg dest, const OpArg &arg);
 	void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
 
+	void PMOVSXBW(X64Reg dest, const OpArg &arg);
+	void PMOVSXBD(X64Reg dest, const OpArg &arg);
+	void PMOVSXWD(X64Reg dest, const OpArg &arg);
+	void PMOVZXBW(X64Reg dest, const OpArg &arg);
+	void PMOVZXBD(X64Reg dest, const OpArg &arg);
+	void PMOVZXWD(X64Reg dest, const OpArg &arg);
+
 	void PAND(X64Reg dest, OpArg arg);
 	void PANDN(X64Reg dest, OpArg arg);
 	void PXOR(X64Reg dest, OpArg arg);
diff --git a/GPU/GLES/VertexDecoderX86.cpp b/GPU/GLES/VertexDecoderX86.cpp
index 39beb84e6..3f2440a48 100644
--- a/GPU/GLES/VertexDecoderX86.cpp
+++ b/GPU/GLES/VertexDecoderX86.cpp
@@ -718,8 +718,12 @@ void VertexDecoderJitCache::Jit_Color8888Morph() {
 	for (int n = 0; n < dec_->morphcount; ++n) {
 		const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
 		MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
-		PUNPCKLBW(reg, R(fpScratchReg4));
-		PUNPCKLWD(reg, R(fpScratchReg4));
+		if (cpu_info.bSSE4_1) {
+			PMOVZXBD(reg, R(reg));
+		} else {
+			PUNPCKLBW(reg, R(fpScratchReg4));
+			PUNPCKLWD(reg, R(fpScratchReg4));
+		}
 
 		CVTDQ2PS(reg, R(reg));
 
@@ -763,8 +767,12 @@ void VertexDecoderJitCache::Jit_Color4444Morph() {
 		POR(reg, R(fpScratchReg3));
 		PSRLW(reg, 4);
 
-		PUNPCKLBW(reg, R(fpScratchReg4));
-		PUNPCKLWD(reg, R(fpScratchReg4));
+		if (cpu_info.bSSE4_1) {
+			PMOVZXBD(reg, R(reg));
+		} else {
+			PUNPCKLBW(reg, R(fpScratchReg4));
+			PUNPCKLWD(reg, R(fpScratchReg4));
+		}
 
 		CVTDQ2PS(reg, R(reg));
 		MULPS(reg, R(XMM6));
@@ -957,10 +965,14 @@ void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
 void VertexDecoderJitCache::Jit_NormalS8Skin() {
 	XORPS(XMM3, R(XMM3));
 	MOVD_xmm(XMM1, MDisp(srcReg, dec_->nrmoff));
-	PUNPCKLBW(XMM1, R(XMM3));
-	PUNPCKLWD(XMM1, R(XMM3));
-	PSLLD(XMM1, 24);
-	PSRAD(XMM1, 24); // Ugly sign extension, can be done faster in SSE4
+	if (cpu_info.bSSE4_1) {
+		PMOVSXBD(XMM1, R(XMM1));
+	} else {
+		PUNPCKLBW(XMM1, R(XMM3));
+		PUNPCKLWD(XMM1, R(XMM3));
+		PSLLD(XMM1, 24);
+		PSRAD(XMM1, 24); // Shift-based sign extension for CPUs without SSE4.1
+	}
 	CVTDQ2PS(XMM3, R(XMM1));
 	MULPS(XMM3, M(&by128));
 	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
@@ -970,9 +982,13 @@ void VertexDecoderJitCache::Jit_NormalS8Skin() {
 void VertexDecoderJitCache::Jit_NormalS16Skin() {
 	XORPS(XMM3, R(XMM3));
 	MOVQ_xmm(XMM1, MDisp(srcReg, dec_->nrmoff));
-	PUNPCKLWD(XMM1, R(XMM3));
-	PSLLD(XMM1, 16);
-	PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4
+	if (cpu_info.bSSE4_1) {
+		PMOVSXWD(XMM1, R(XMM1));
+	} else {
+		PUNPCKLWD(XMM1, R(XMM3)); // Must widen to one value per dword before the shifts below
+		PSLLD(XMM1, 16);
+		PSRAD(XMM1, 16); // Shift-based sign extension for CPUs without SSE4.1
+	}
 	CVTDQ2PS(XMM3, R(XMM1));
 	MULPS(XMM3, M(&by32768));
 	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
@@ -1045,10 +1061,14 @@ void VertexDecoderJitCache::Jit_PosFloat() {
 void VertexDecoderJitCache::Jit_PosS8Skin() {
 	XORPS(XMM3, R(XMM3));
 	MOVD_xmm(XMM1, MDisp(srcReg, dec_->posoff));
-	PUNPCKLBW(XMM1, R(XMM3));
-	PUNPCKLWD(XMM1, R(XMM3));
-	PSLLD(XMM1, 24);
-	PSRAD(XMM1, 24); // Ugly sign extension, can be done faster in SSE4
+	if (cpu_info.bSSE4_1) {
+		PMOVSXBD(XMM1, R(XMM1));
+	} else {
+		PUNPCKLBW(XMM1, R(XMM3));
+		PUNPCKLWD(XMM1, R(XMM3));
+		PSLLD(XMM1, 24);
+		PSRAD(XMM1, 24); // Shift-based sign extension for CPUs without SSE4.1
+	}
 	CVTDQ2PS(XMM3, R(XMM1));
 	MULPS(XMM3, M(&by128));
 	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
@@ -1057,9 +1077,13 @@ void VertexDecoderJitCache::Jit_PosS8Skin() {
 void VertexDecoderJitCache::Jit_PosS16Skin() {
 	XORPS(XMM3, R(XMM3));
 	MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff));
-	PUNPCKLWD(XMM1, R(XMM3));
-	PSLLD(XMM1, 16);
-	PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4
+	if (cpu_info.bSSE4_1) {
+		PMOVSXWD(XMM1, R(XMM1));
+	} else {
+		PUNPCKLWD(XMM1, R(XMM3));
+		PSLLD(XMM1, 16);
+		PSRAD(XMM1, 16); // Shift-based sign extension for CPUs without SSE4.1
+	}
 	CVTDQ2PS(XMM3, R(XMM1));
 	MULPS(XMM3, M(&by32768));
 	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
@@ -1082,10 +1106,14 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
 		const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
 		// Okay, first convert to floats.
 		MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
-		PUNPCKLBW(reg, R(fpScratchReg4));
-		PUNPCKLWD(reg, R(fpScratchReg4));
-		PSLLD(reg, 24);
-		PSRAD(reg, 24); // Ugly sign extension, can be done faster in SSE4
+		if (cpu_info.bSSE4_1) {
+			PMOVSXBD(reg, R(reg));
+		} else {
+			PUNPCKLBW(reg, R(fpScratchReg4));
+			PUNPCKLWD(reg, R(fpScratchReg4));
+			PSLLD(reg, 24);
+			PSRAD(reg, 24); // Shift-based sign extension for CPUs without SSE4.1
+		}
 		CVTDQ2PS(reg, R(reg));
 
 		// Now, It's time to multiply by the weight and 1.0f/127.0f.
@@ -1116,9 +1144,13 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
 		const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
 		// Okay, first convert to floats.
 		MOVQ_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
-		PUNPCKLWD(reg, R(fpScratchReg4));
-		PSLLD(reg, 16);
-		PSRAD(reg, 16); // Ugly sign extension, can be done faster in SSE4
+		if (cpu_info.bSSE4_1) {
+			PMOVSXWD(reg, R(reg));
+		} else {
+			PUNPCKLWD(reg, R(fpScratchReg4));
+			PSLLD(reg, 16);
+			PSRAD(reg, 16); // Shift-based sign extension for CPUs without SSE4.1
+		}
 		CVTDQ2PS(reg, R(reg));
 
 		// Now, It's time to multiply by the weight and 1.0f/32767.0f.