mirror of
https://github.com/libretro/ppsspp.git
synced 2024-11-25 01:00:01 +00:00
vertexjit: Use SSE4.1 where available on x86.
Just because we can.
This commit is contained in:
parent
5d04f123b9
commit
632eec38e8
@ -1156,6 +1156,20 @@ void XEmitter::WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg a
|
||||
arg.WriteRest(this, extrabytes);
|
||||
}
|
||||
|
||||
void XEmitter::WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
|
||||
{
|
||||
if (size == 64 && packed)
|
||||
Write8(0x66); //this time, override goes upwards
|
||||
if (!packed)
|
||||
Write8(size == 64 ? 0xF2 : 0xF3);
|
||||
arg.operandReg = regOp;
|
||||
arg.WriteRex(this, 0, 0);
|
||||
Write8(0x0F);
|
||||
Write8(0x38);
|
||||
Write8(sseOp);
|
||||
arg.WriteRest(this, extrabytes);
|
||||
}
|
||||
|
||||
// MOVD into an XMM register: encoded through WriteSSEOp with opcode 0x6E,
// packed, size 64, and no extra bytes.
void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg)
{
	WriteSSEOp(64, 0x6E, true, dest, arg, 0);
}
|
||||
// MOVD out of an XMM register: encoded through WriteSSEOp with opcode 0x7E,
// packed, size 64, and no extra bytes. The XMM source is passed as the
// register operand; arg is the destination.
void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src)
{
	WriteSSEOp(64, 0x7E, true, src, arg, 0);
}
|
||||
|
||||
@ -1351,6 +1365,48 @@ void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x61, tr
|
||||
// PUNPCKLDQ dest, arg: encoded through WriteSSEOp with opcode 0x62, packed, size 64.
void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg)
{
	WriteSSEOp(64, 0x62, true, dest, arg);
}
|
||||
//void PUNPCKLQDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x60, true, dest, arg);}
|
||||
|
||||
// PMOVSXBW dest, arg (SSE4.1): emitted via WriteSSEOp2 with opcode byte 0x20,
// packed, size 64.
// Raises a PanicAlert when cpu_info reports no SSE4.1 support; the emit call
// below is not skipped in that case.
void XEmitter::PMOVSXBW(X64Reg dest, const OpArg &arg)
{
	if (!cpu_info.bSSE4_1)
		PanicAlert("Trying to use PMOVSXBW on a system that doesn't support it. Bad programmer.");

	WriteSSEOp2(64, 0x20, true, dest, arg, 0);
}
|
||||
|
||||
// PMOVSXBD dest, arg (SSE4.1): emitted via WriteSSEOp2 with opcode byte 0x21,
// packed, size 64.
// Raises a PanicAlert when cpu_info reports no SSE4.1 support; the emit call
// below is not skipped in that case.
void XEmitter::PMOVSXBD(X64Reg dest, const OpArg &arg)
{
	if (!cpu_info.bSSE4_1)
		PanicAlert("Trying to use PMOVSXBD on a system that doesn't support it. Bad programmer.");

	WriteSSEOp2(64, 0x21, true, dest, arg, 0);
}
|
||||
|
||||
// PMOVSXWD dest, arg (SSE4.1): emitted via WriteSSEOp2 with opcode byte 0x23,
// packed, size 64.
// Raises a PanicAlert when cpu_info reports no SSE4.1 support; the emit call
// below is not skipped in that case.
void XEmitter::PMOVSXWD(X64Reg dest, const OpArg &arg)
{
	if (!cpu_info.bSSE4_1)
		PanicAlert("Trying to use PMOVSXWD on a system that doesn't support it. Bad programmer.");

	WriteSSEOp2(64, 0x23, true, dest, arg, 0);
}
|
||||
|
||||
// PMOVZXBW dest, arg (SSE4.1): emitted via WriteSSEOp2 with opcode byte 0x30,
// packed, size 64.
// Raises a PanicAlert when cpu_info reports no SSE4.1 support; the emit call
// below is not skipped in that case.
void XEmitter::PMOVZXBW(X64Reg dest, const OpArg &arg) {
	if (!cpu_info.bSSE4_1) {
		// The alert names this instruction (PMOVZXBW); it previously reported
		// PMOVSXBW, which pointed at the wrong emitter function.
		PanicAlert("Trying to use PMOVZXBW on a system that doesn't support it. Bad programmer.");
	}
	WriteSSEOp2(64, 0x30, true, dest, arg);
}
|
||||
|
||||
// PMOVZXBD dest, arg (SSE4.1): emitted via WriteSSEOp2 with opcode byte 0x31,
// packed, size 64.
// Raises a PanicAlert when cpu_info reports no SSE4.1 support; the emit call
// below is not skipped in that case.
void XEmitter::PMOVZXBD(X64Reg dest, const OpArg &arg) {
	if (!cpu_info.bSSE4_1) {
		// The alert names this instruction (PMOVZXBD); it previously reported
		// PMOVSXBD, which pointed at the wrong emitter function.
		PanicAlert("Trying to use PMOVZXBD on a system that doesn't support it. Bad programmer.");
	}
	WriteSSEOp2(64, 0x31, true, dest, arg);
}
|
||||
|
||||
// PMOVZXWD dest, arg (SSE4.1): emitted via WriteSSEOp2 with opcode byte 0x33,
// packed, size 64.
// Raises a PanicAlert when cpu_info reports no SSE4.1 support; the emit call
// below is not skipped in that case.
void XEmitter::PMOVZXWD(X64Reg dest, const OpArg &arg) {
	if (!cpu_info.bSSE4_1) {
		// The alert names this instruction (PMOVZXWD); it previously reported
		// PMOVSXWD, which pointed at the wrong emitter function.
		PanicAlert("Trying to use PMOVZXWD on a system that doesn't support it. Bad programmer.");
	}
	WriteSSEOp2(64, 0x33, true, dest, arg);
}
|
||||
|
||||
void XEmitter::PSRLW(X64Reg reg, int shift) {
|
||||
WriteSSEOp(64, 0x71, true, (X64Reg)2, R(reg));
|
||||
Write8(shift);
|
||||
@ -1418,13 +1474,7 @@ void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {
|
||||
if (!cpu_info.bSSSE3) {
|
||||
PanicAlert("Trying to use PSHUFB on a system that doesn't support it. Bad programmer.");
|
||||
}
|
||||
Write8(0x66);
|
||||
arg.operandReg = dest;
|
||||
arg.WriteRex(this, 0, 0);
|
||||
Write8(0x0f);
|
||||
Write8(0x38);
|
||||
Write8(0x00);
|
||||
arg.WriteRest(this, 0);
|
||||
WriteSSEOp2(64, 0x00, true, dest, arg);
|
||||
}
|
||||
|
||||
// PAND dest, arg: encoded through WriteSSEOp with opcode 0xDB, packed, size 64.
void XEmitter::PAND(X64Reg dest, OpArg arg)
{
	WriteSSEOp(64, 0xDB, true, dest, arg);
}
|
||||
|
@ -263,6 +263,7 @@ private:
|
||||
void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext);
|
||||
void WriteMXCSR(OpArg arg, int ext);
|
||||
void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
|
||||
|
||||
protected:
|
||||
@ -604,6 +605,13 @@ public:
|
||||
void PUNPCKLWD(X64Reg dest, const OpArg &arg);
|
||||
void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
|
||||
|
||||
void PMOVSXBW(X64Reg dest, const OpArg &arg);
|
||||
void PMOVSXBD(X64Reg dest, const OpArg &arg);
|
||||
void PMOVSXWD(X64Reg dest, const OpArg &arg);
|
||||
void PMOVZXBW(X64Reg dest, const OpArg &arg);
|
||||
void PMOVZXBD(X64Reg dest, const OpArg &arg);
|
||||
void PMOVZXWD(X64Reg dest, const OpArg &arg);
|
||||
|
||||
void PAND(X64Reg dest, OpArg arg);
|
||||
void PANDN(X64Reg dest, OpArg arg);
|
||||
void PXOR(X64Reg dest, OpArg arg);
|
||||
|
@ -718,8 +718,12 @@ void VertexDecoderJitCache::Jit_Color8888Morph() {
|
||||
for (int n = 0; n < dec_->morphcount; ++n) {
|
||||
const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
|
||||
MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + dec_->coloff));
|
||||
PUNPCKLBW(reg, R(fpScratchReg4));
|
||||
PUNPCKLWD(reg, R(fpScratchReg4));
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVZXBD(reg, R(reg));
|
||||
} else {
|
||||
PUNPCKLBW(reg, R(fpScratchReg4));
|
||||
PUNPCKLWD(reg, R(fpScratchReg4));
|
||||
}
|
||||
|
||||
CVTDQ2PS(reg, R(reg));
|
||||
|
||||
@ -763,8 +767,12 @@ void VertexDecoderJitCache::Jit_Color4444Morph() {
|
||||
POR(reg, R(fpScratchReg3));
|
||||
PSRLW(reg, 4);
|
||||
|
||||
PUNPCKLBW(reg, R(fpScratchReg4));
|
||||
PUNPCKLWD(reg, R(fpScratchReg4));
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVZXBD(reg, R(reg));
|
||||
} else {
|
||||
PUNPCKLBW(reg, R(fpScratchReg4));
|
||||
PUNPCKLWD(reg, R(fpScratchReg4));
|
||||
}
|
||||
|
||||
CVTDQ2PS(reg, R(reg));
|
||||
MULPS(reg, R(XMM6));
|
||||
@ -957,10 +965,14 @@ void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
|
||||
void VertexDecoderJitCache::Jit_NormalS8Skin() {
|
||||
XORPS(XMM3, R(XMM3));
|
||||
MOVD_xmm(XMM1, MDisp(srcReg, dec_->nrmoff));
|
||||
PUNPCKLBW(XMM1, R(XMM3));
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
PSLLD(XMM1, 24);
|
||||
PSRAD(XMM1, 24); // Ugly sign extension, can be done faster in SSE4
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVSXBD(XMM1, R(XMM1));
|
||||
} else {
|
||||
PUNPCKLBW(XMM1, R(XMM3));
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
PSLLD(XMM1, 24);
|
||||
PSRAD(XMM1, 24);
|
||||
}
|
||||
CVTDQ2PS(XMM3, R(XMM1));
|
||||
MULPS(XMM3, M(&by128));
|
||||
Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
|
||||
@ -970,9 +982,12 @@ void VertexDecoderJitCache::Jit_NormalS8Skin() {
|
||||
void VertexDecoderJitCache::Jit_NormalS16Skin() {
|
||||
XORPS(XMM3, R(XMM3));
|
||||
MOVQ_xmm(XMM1, MDisp(srcReg, dec_->nrmoff));
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
PSLLD(XMM1, 16);
|
||||
PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVSXWD(XMM1, R(XMM1));
|
||||
} else {
|
||||
PSLLD(XMM1, 16);
|
||||
PSRAD(XMM1, 16);
|
||||
}
|
||||
CVTDQ2PS(XMM3, R(XMM1));
|
||||
MULPS(XMM3, M(&by32768));
|
||||
Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
|
||||
@ -1045,10 +1060,14 @@ void VertexDecoderJitCache::Jit_PosFloat() {
|
||||
void VertexDecoderJitCache::Jit_PosS8Skin() {
|
||||
XORPS(XMM3, R(XMM3));
|
||||
MOVD_xmm(XMM1, MDisp(srcReg, dec_->posoff));
|
||||
PUNPCKLBW(XMM1, R(XMM3));
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
PSLLD(XMM1, 24);
|
||||
PSRAD(XMM1, 24); // Ugly sign extension, can be done faster in SSE4
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVSXBD(XMM1, R(XMM1));
|
||||
} else {
|
||||
PUNPCKLBW(XMM1, R(XMM3));
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
PSLLD(XMM1, 24);
|
||||
PSRAD(XMM1, 24);
|
||||
}
|
||||
CVTDQ2PS(XMM3, R(XMM1));
|
||||
MULPS(XMM3, M(&by128));
|
||||
Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
|
||||
@ -1057,9 +1076,13 @@ void VertexDecoderJitCache::Jit_PosS8Skin() {
|
||||
void VertexDecoderJitCache::Jit_PosS16Skin() {
|
||||
XORPS(XMM3, R(XMM3));
|
||||
MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff));
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
PSLLD(XMM1, 16);
|
||||
PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVSXWD(XMM1, R(XMM1));
|
||||
} else {
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
PSLLD(XMM1, 16);
|
||||
PSRAD(XMM1, 16);
|
||||
}
|
||||
CVTDQ2PS(XMM3, R(XMM1));
|
||||
MULPS(XMM3, M(&by32768));
|
||||
Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
|
||||
@ -1082,10 +1105,14 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
|
||||
const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
|
||||
// Okay, first convert to floats.
|
||||
MOVD_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
|
||||
PUNPCKLBW(reg, R(fpScratchReg4));
|
||||
PUNPCKLWD(reg, R(fpScratchReg4));
|
||||
PSLLD(reg, 24);
|
||||
PSRAD(reg, 24); // Ugly sign extension, can be done faster in SSE4
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVSXBD(reg, R(reg));
|
||||
} else {
|
||||
PUNPCKLBW(reg, R(fpScratchReg4));
|
||||
PUNPCKLWD(reg, R(fpScratchReg4));
|
||||
PSLLD(reg, 24);
|
||||
PSRAD(reg, 24);
|
||||
}
|
||||
CVTDQ2PS(reg, R(reg));
|
||||
|
||||
// Now, It's time to multiply by the weight and 1.0f/127.0f.
|
||||
@ -1116,9 +1143,13 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
|
||||
const X64Reg reg = first ? fpScratchReg : fpScratchReg2;
|
||||
// Okay, first convert to floats.
|
||||
MOVQ_xmm(reg, MDisp(srcReg, dec_->onesize_ * n + srcoff));
|
||||
PUNPCKLWD(reg, R(fpScratchReg4));
|
||||
PSLLD(reg, 16);
|
||||
PSRAD(reg, 16); // Ugly sign extension, can be done faster in SSE4
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVSXWD(reg, R(reg));
|
||||
} else {
|
||||
PUNPCKLWD(reg, R(fpScratchReg4));
|
||||
PSLLD(reg, 16);
|
||||
PSRAD(reg, 16);
|
||||
}
|
||||
CVTDQ2PS(reg, R(reg));
|
||||
|
||||
// Now, It's time to multiply by the weight and 1.0f/32767.0f.
|
||||
|
Loading…
Reference in New Issue
Block a user