diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 38be9f129..73a276145 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -673,6 +673,8 @@ private:
 	void Jit_WriteMorphColor(int outOff, bool checkAlpha = true);
 	void Jit_AnyS8ToFloat(int srcoff);
 	void Jit_AnyS16ToFloat(int srcoff);
+	void Jit_AnyU8ToFloat(int srcoff);
+	void Jit_AnyU16ToFloat(int srcoff);
 	void Jit_AnyS8Morph(int srcoff, int dstoff);
 	void Jit_AnyS16Morph(int srcoff, int dstoff);
 	void Jit_AnyFloatMorph(int srcoff, int dstoff);
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index f85c732ff..54a2c53ea 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -360,9 +360,28 @@ void VertexDecoderJitCache::Jit_WeightsU16() {
 }
 
 void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
+	int j = 0;
+
+	switch (dec_->nweights) {
+	case 4:
+		// We'll at least do the first 4 fast.
+	case 5:
+	case 6:
+	case 7:
+		j = 4;
+		Jit_AnyU8ToFloat(dec_->weightoff);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		break;
+	case 8:
+		Jit_AnyU8ToFloat(dec_->weightoff);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		Jit_AnyU8ToFloat(dec_->weightoff + 4);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
+		return;
+	}
+
 	// Basic implementation - a byte at a time. TODO: Optimize
-	int j;
-	for (j = 0; j < dec_->nweights; j++) {
+	for (; j < dec_->nweights; j++) {
 		MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j));
 		CVTSI2SS(fpScratchReg, R(tempReg1));
 		MULSS(fpScratchReg, M(&by128));
@@ -375,9 +394,28 @@ void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
 }
 
 void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
+	int j = 0;
+
+	switch (dec_->nweights) {
+	case 4:
+		// We'll at least do the first 4 fast.
+	case 5:
+	case 6:
+	case 7:
+		j = 4;
+		Jit_AnyU16ToFloat(dec_->weightoff);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		break;
+	case 8:
+		Jit_AnyU16ToFloat(dec_->weightoff);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
+		return;
+	}
+
 	// Basic implementation - a short at a time. TODO: Optimize
-	int j;
-	for (j = 0; j < dec_->nweights; j++) {
+	for (; j < dec_->nweights; j++) {
 		MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2));
 		CVTSI2SS(fpScratchReg, R(tempReg1));
 		MULSS(fpScratchReg, M(&by32768));
@@ -1145,6 +1183,35 @@ void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
 	MULPS(XMM3, M(&by32768));
 }
 
+void VertexDecoderJitCache::Jit_AnyU8ToFloat(int srcoff) {
+	if (!cpu_info.bSSE4_1) {
+		XORPS(XMM3, R(XMM3));
+	}
+	MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
+	if (cpu_info.bSSE4_1) {
+		PMOVZXBD(XMM1, R(XMM1));
+	} else {
+		PUNPCKLBW(XMM1, R(XMM3));
+		PUNPCKLWD(XMM1, R(XMM3));
+	}
+	CVTDQ2PS(XMM3, R(XMM1));
+	MULPS(XMM3, M(&by128));
+}
+
+void VertexDecoderJitCache::Jit_AnyU16ToFloat(int srcoff) {
+	if (!cpu_info.bSSE4_1) {
+		XORPS(XMM3, R(XMM3));
+	}
+	MOVQ_xmm(XMM1, MDisp(srcReg, srcoff));
+	if (cpu_info.bSSE4_1) {
+		PMOVZXWD(XMM1, R(XMM1));
+	} else {
+		PUNPCKLWD(XMM1, R(XMM3));
+	}
+	CVTDQ2PS(XMM3, R(XMM1));
+	MULPS(XMM3, M(&by32768));
+}
+
 void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
 	PXOR(fpScratchReg4, R(fpScratchReg4));