vertexjit: Use SSE4.1 where available on x86.

Just because we can.
Unknown W. Brackets 2014-03-20 08:14:49 -07:00
parent 5d04f123b9
commit 632eec38e8
3 changed files with 121 additions and 32 deletions

View File

@ -1156,6 +1156,20 @@ void XEmitter::WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg a
arg.WriteRest(this, extrabytes);
}
void XEmitter::WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
{
if (size == 64 && packed)
Write8(0x66); // 0x66 is the mandatory prefix selecting the packed (XMM) form of these 0F 38 opcodes
if (!packed)
Write8(size == 64 ? 0xF2 : 0xF3);
arg.operandReg = regOp;
arg.WriteRex(this, 0, 0);
Write8(0x0F);
Write8(0x38);
Write8(sseOp);
arg.WriteRest(this, extrabytes);
}
void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);}
void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);}
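
For reference, a minimal sketch (not part of the commit) of the byte stream the new helper produces for a register-to-register operand, assuming both registers are XMM0-XMM7 so no REX byte is emitted: PMOVZXBD xmm1, xmm0, added below, goes through WriteSSEOp2(64, 0x31, true, XMM1, R(XMM0)) and comes out as 66 0F 38 31 C8.

#include <cstdint>
#include <vector>

// Hand-rolled equivalent of that byte stream (illustration only).
std::vector<uint8_t> EncodeSSE38RegReg(uint8_t opcode, int destXmm, int srcXmm) {
    std::vector<uint8_t> out;
    out.push_back(0x66);        // mandatory prefix for the packed form
    out.push_back(0x0F);        // two-byte escape...
    out.push_back(0x38);        // ...extended into the 0F 38 opcode map
    out.push_back(opcode);      // e.g. 0x31 for PMOVZXBD
    out.push_back(uint8_t(0xC0 | (destXmm << 3) | srcXmm));  // ModRM, mod=11 (reg-reg)
    return out;
}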
@ -1351,6 +1365,48 @@ void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x61, tr
void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x62, true, dest, arg);}
//void PUNPCKLQDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x60, true, dest, arg);}
void XEmitter::PMOVSXBW(X64Reg dest, const OpArg &arg) {
if (!cpu_info.bSSE4_1) {
PanicAlert("Trying to use PMOVSXBW on a system that doesn't support it. Bad programmer.");
}
WriteSSEOp2(64, 0x20, true, dest, arg);
}
void XEmitter::PMOVSXBD(X64Reg dest, const OpArg &arg) {
if (!cpu_info.bSSE4_1) {
PanicAlert("Trying to use PMOVSXBD on a system that doesn't support it. Bad programmer.");
}
WriteSSEOp2(64, 0x21, true, dest, arg);
}
void XEmitter::PMOVSXWD(X64Reg dest, const OpArg &arg) {
if (!cpu_info.bSSE4_1) {
PanicAlert("Trying to use PMOVSXWD on a system that doesn't support it. Bad programmer.");
}
WriteSSEOp2(64, 0x23, true, dest, arg);
}
void XEmitter::PMOVZXBW(X64Reg dest, const OpArg &arg) {
if (!cpu_info.bSSE4_1) {
PanicAlert("Trying to use PMOVSXBW on a system that doesn't support it. Bad programmer.");
}
WriteSSEOp2(64, 0x30, true, dest, arg);
}
void XEmitter::PMOVZXBD(X64Reg dest, const OpArg &arg) {
if (!cpu_info.bSSE4_1) {
PanicAlert("Trying to use PMOVSXBD on a system that doesn't support it. Bad programmer.");
}
WriteSSEOp2(64, 0x31, true, dest, arg);
}
void XEmitter::PMOVZXWD(X64Reg dest, const OpArg &arg) {
if (!cpu_info.bSSE4_1) {
PanicAlert("Trying to use PMOVSXWD on a system that doesn't support it. Bad programmer.");
}
WriteSSEOp2(64, 0x33, true, dest, arg);
}
void XEmitter::PSRLW(X64Reg reg, int shift) {
WriteSSEOp(64, 0x71, true, (X64Reg)2, R(reg));
Write8(shift);
@ -1418,13 +1474,7 @@ void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {
if (!cpu_info.bSSSE3) {
PanicAlert("Trying to use PSHUFB on a system that doesn't support it. Bad programmer.");
}
Write8(0x66);
arg.operandReg = dest;
arg.WriteRex(this, 0, 0);
Write8(0x0f);
Write8(0x38);
Write8(0x00);
arg.WriteRest(this, 0);
WriteSSEOp2(64, 0x00, true, dest, arg);
}
void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);}
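
For readers less familiar with the SSE4.1 instructions wrapped above, a scalar sketch of what the byte-to-dword pair does (illustration only, not part of the commit; the word-to-dword forms follow the same pattern with 16-bit sources):

#include <cstdint>

// PMOVSXBD: sign-extend each of the low four bytes into its own 32-bit lane.
void pmovsxbd_ref(const int8_t src[4], int32_t dst[4]) {
    for (int i = 0; i < 4; ++i)
        dst[i] = src[i];   // implicit sign extension
}

// PMOVZXBD: zero-extend each of the low four bytes into its own 32-bit lane.
void pmovzxbd_ref(const uint8_t src[4], uint32_t dst[4]) {
    for (int i = 0; i < 4; ++i)
        dst[i] = src[i];   // implicit zero extension
}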

View File

@ -263,6 +263,7 @@ private:
void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext);
void WriteMXCSR(OpArg arg, int ext);
void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
protected:
@ -604,6 +605,13 @@ public:
void PUNPCKLWD(X64Reg dest, const OpArg &arg);
void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
void PMOVSXBW(X64Reg dest, const OpArg &arg);
void PMOVSXBD(X64Reg dest, const OpArg &arg);
void PMOVSXWD(X64Reg dest, const OpArg &arg);
void PMOVZXBW(X64Reg dest, const OpArg &arg);
void PMOVZXBD(X64Reg dest, const OpArg &arg);
void PMOVZXWD(X64Reg dest, const OpArg &arg);
void PAND(X64Reg dest, OpArg arg);
void PANDN(X64Reg dest, OpArg arg);
void PXOR(X64Reg dest, OpArg arg);

View File

@ -718,8 +718,12 @@ void VertexDecoderJitCache::Jit_Color8888Morph() {
for (int n = 0; n < dec_->morphcount; ++n) {
const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
PUNPCKLBW(reg, R(fpScratchReg4));
PUNPCKLWD(reg, R(fpScratchReg4));
if (cpu_info.bSSE4_1) {
PMOVZXBD(reg, R(reg));
} else {
PUNPCKLBW(reg, R(fpScratchReg4));
PUNPCKLWD(reg, R(fpScratchReg4));
}
CVTDQ2PS(reg, R(reg));
@ -763,8 +767,12 @@ void VertexDecoderJitCache::Jit_Color4444Morph() {
POR(reg, R(fpScratchReg3));
PSRLW(reg, 4);
PUNPCKLBW(reg, R(fpScratchReg4));
PUNPCKLWD(reg, R(fpScratchReg4));
if (cpu_info.bSSE4_1) {
PMOVZXBD(reg, R(reg));
} else {
PUNPCKLBW(reg, R(fpScratchReg4));
PUNPCKLWD(reg, R(fpScratchReg4));
}
CVTDQ2PS(reg, R(reg));
MULPS(reg, R(XMM6));
@ -957,10 +965,14 @@ void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
void VertexDecoderJitCache::Jit_NormalS8Skin() {
XORPS(XMM3, R(XMM3));
MOVD_xmm(XMM1, MDisp(srcReg, dec_->nrmoff));
PUNPCKLBW(XMM1, R(XMM3));
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 24);
PSRAD(XMM1, 24); // Ugly sign extension, can be done faster in SSE4
if (cpu_info.bSSE4_1) {
PMOVSXBD(XMM1, R(XMM1));
} else {
PUNPCKLBW(XMM1, R(XMM3));
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 24);
PSRAD(XMM1, 24);
}
CVTDQ2PS(XMM3, R(XMM1));
MULPS(XMM3, M(&by128));
Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
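
The fallback kept above is the usual shift trick: the unpacks against the zeroed XMM3 spread each signed byte into its own 32-bit lane (zero-extended), and shifting left then arithmetic-shifting right by 24 restores the sign. A scalar sketch of the equivalence (illustration only, not part of the commit; the 16-bit paths below do the same with a shift count of 16):

#include <cstdint>

// One lane after PUNPCKLBW/PUNPCKLWD against a zeroed register: a signed
// byte sitting zero-extended in a 32-bit lane.
int32_t sign_extend_byte_lane(uint32_t lane) {
    return int32_t(lane << 24) >> 24;   // matches what PMOVSXBD produces from the packed bytes
}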
@ -970,9 +982,12 @@ void VertexDecoderJitCache::Jit_NormalS8Skin() {
void VertexDecoderJitCache::Jit_NormalS16Skin() {
XORPS(XMM3, R(XMM3));
MOVQ_xmm(XMM1, MDisp(srcReg, dec_->nrmoff));
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 16);
PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4
if (cpu_info.bSSE4_1) {
PMOVSXWD(XMM1, R(XMM1));
} else {
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 16);
PSRAD(XMM1, 16);
}
CVTDQ2PS(XMM3, R(XMM1));
MULPS(XMM3, M(&by32768));
Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
@ -1045,10 +1060,14 @@ void VertexDecoderJitCache::Jit_PosFloat() {
void VertexDecoderJitCache::Jit_PosS8Skin() {
XORPS(XMM3, R(XMM3));
MOVD_xmm(XMM1, MDisp(srcReg, dec_->posoff));
PUNPCKLBW(XMM1, R(XMM3));
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 24);
PSRAD(XMM1, 24); // Ugly sign extension, can be done faster in SSE4
if (cpu_info.bSSE4_1) {
PMOVSXBD(XMM1, R(XMM1));
} else {
PUNPCKLBW(XMM1, R(XMM3));
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 24);
PSRAD(XMM1, 24);
}
CVTDQ2PS(XMM3, R(XMM1));
MULPS(XMM3, M(&by128));
Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
@ -1057,9 +1076,13 @@ void VertexDecoderJitCache::Jit_PosS8Skin() {
void VertexDecoderJitCache::Jit_PosS16Skin() {
XORPS(XMM3, R(XMM3));
MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff));
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 16);
PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4
if (cpu_info.bSSE4_1) {
PMOVSXWD(XMM1, R(XMM1));
} else {
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 16);
PSRAD(XMM1, 16);
}
CVTDQ2PS(XMM3, R(XMM1));
MULPS(XMM3, M(&by32768));
Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
@ -1082,10 +1105,14 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
// Okay, first convert to floats.
MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
PUNPCKLBW(reg, R(fpScratchReg4));
PUNPCKLWD(reg, R(fpScratchReg4));
PSLLD(reg, 24);
PSRAD(reg, 24); // Ugly sign extension, can be done faster in SSE4
if (cpu_info.bSSE4_1) {
PMOVSXBD(reg, R(reg));
} else {
PUNPCKLBW(reg, R(fpScratchReg4));
PUNPCKLWD(reg, R(fpScratchReg4));
PSLLD(reg, 24);
PSRAD(reg, 24);
}
CVTDQ2PS(reg, R(reg));
// Now, it's time to multiply by the weight and 1.0f/127.0f.
@ -1116,9 +1143,13 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
// Okay, first convert to floats.
MOVQ_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
PUNPCKLWD(reg, R(fpScratchReg4));
PSLLD(reg, 16);
PSRAD(reg, 16); // Ugly sign extension, can be done faster in SSE4
if (cpu_info.bSSE4_1) {
PMOVSXWD(reg, R(reg));
} else {
PUNPCKLWD(reg, R(fpScratchReg4));
PSLLD(reg, 16);
PSRAD(reg, 16);
}
CVTDQ2PS(reg, R(reg));
// Now, it's time to multiply by the weight and 1.0f/32767.0f.