vertexjit: Optimize the d3d9 weights a bit.

Only used without skinning on, or with morph, of course.
This commit is contained in:
Unknown W. Brackets 2014-09-17 08:39:56 -07:00
parent 062bcceeee
commit 4c1061ff4a
2 changed files with 73 additions and 4 deletions

View File

@ -673,6 +673,8 @@ private:
void Jit_WriteMorphColor(int outOff, bool checkAlpha = true);
void Jit_AnyS8ToFloat(int srcoff);
void Jit_AnyS16ToFloat(int srcoff);
// Emits code that reads four unsigned 8-bit values at srcReg + srcoff, zero-extends
// them to 32-bit lanes, converts to float and multiplies by the by128 constant.
// Result is left in XMM3; XMM1 is used as scratch.
void Jit_AnyU8ToFloat(int srcoff);
// Emits code that reads four unsigned 16-bit values at srcReg + srcoff, zero-extends
// them to 32-bit lanes, converts to float and multiplies by the by32768 constant.
// Result is left in XMM3; XMM1 is used as scratch.
void Jit_AnyU16ToFloat(int srcoff);
void Jit_AnyS8Morph(int srcoff, int dstoff);
void Jit_AnyS16Morph(int srcoff, int dstoff);
void Jit_AnyFloatMorph(int srcoff, int dstoff);

View File

@ -360,9 +360,28 @@ void VertexDecoderJitCache::Jit_WeightsU16() {
}
void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
int j = 0;
switch (dec_->nweights) {
case 4:
// We'll at least do the first 4 fast.
case 5:
case 6:
case 7:
j = 4;
Jit_AnyU8ToFloat(dec_->weightoff);
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
break;
case 8:
Jit_AnyU8ToFloat(dec_->weightoff);
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
Jit_AnyU8ToFloat(dec_->weightoff + 4);
MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
return;
}
// Basic implementation - a byte at a time. TODO: Optimize
int j;
for (j = 0; j < dec_->nweights; j++) {
for (; j < dec_->nweights; j++) {
MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j));
CVTSI2SS(fpScratchReg, R(tempReg1));
MULSS(fpScratchReg, M(&by128));
@ -375,9 +394,28 @@ void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
}
void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
int j = 0;
switch (dec_->nweights) {
case 4:
// We'll at least do the first 4 fast.
case 5:
case 6:
case 7:
j = 4;
Jit_AnyU16ToFloat(dec_->weightoff);
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
break;
case 8:
Jit_AnyU16ToFloat(dec_->weightoff);
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2);
MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
return;
}
// Basic implementation - a short at a time. TODO: Optimize
int j;
for (j = 0; j < dec_->nweights; j++) {
for (; j < dec_->nweights; j++) {
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2));
CVTSI2SS(fpScratchReg, R(tempReg1));
MULSS(fpScratchReg, M(&by32768));
@ -1145,6 +1183,35 @@ void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
MULPS(XMM3, M(&by32768));
}
// Emits code that loads four unsigned bytes from srcReg + srcoff, widens each
// to a 32-bit lane with zero extension, converts the lanes to float, and
// multiplies them by the by128 constant. The result ends up in XMM3; XMM1 is
// used as a scratch register.
void VertexDecoderJitCache::Jit_AnyU8ToFloat(int srcoff) {
	const bool useSSE41 = cpu_info.bSSE4_1;

	// Without PMOVZXBD the widening is done by interleaving with zeros,
	// so XMM3 has to be cleared before it is used as the zero source.
	if (!useSSE41)
		XORPS(XMM3, R(XMM3));

	MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
	if (!useSSE41) {
		// bytes -> words -> dwords, upper halves filled from the zeroed XMM3.
		PUNPCKLBW(XMM1, R(XMM3));
		PUNPCKLWD(XMM1, R(XMM3));
	} else {
		PMOVZXBD(XMM1, R(XMM1));
	}

	// Integer lanes to float, then apply the scale factor.
	CVTDQ2PS(XMM3, R(XMM1));
	MULPS(XMM3, M(&by128));
}
// Emits code that loads four unsigned 16-bit values from srcReg + srcoff,
// widens each to a 32-bit lane with zero extension, converts the lanes to
// float, and multiplies them by the by32768 constant. The result ends up in
// XMM3; XMM1 is used as a scratch register.
void VertexDecoderJitCache::Jit_AnyU16ToFloat(int srcoff) {
	const bool useSSE41 = cpu_info.bSSE4_1;

	// The fallback path interleaves with zeros, so XMM3 must be cleared
	// before it is used as the zero source.
	if (!useSSE41)
		XORPS(XMM3, R(XMM3));

	MOVQ_xmm(XMM1, MDisp(srcReg, srcoff));
	if (!useSSE41) {
		// words -> dwords, upper halves filled from the zeroed XMM3.
		PUNPCKLWD(XMM1, R(XMM3));
	} else {
		PMOVZXWD(XMM1, R(XMM1));
	}

	// Integer lanes to float, then apply the scale factor.
	CVTDQ2PS(XMM3, R(XMM1));
	MULPS(XMM3, M(&by32768));
}
void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
PXOR(fpScratchReg4, R(fpScratchReg4));