mirror of
https://github.com/libretro/ppsspp.git
synced 2024-11-25 09:09:49 +00:00
vertexjit: Optimize the d3d9 weights a bit.
Only used without skinning on, or with morph, of course.
This commit is contained in:
parent
062bcceeee
commit
4c1061ff4a
@ -673,6 +673,8 @@ private:
|
||||
void Jit_WriteMorphColor(int outOff, bool checkAlpha = true);
|
||||
void Jit_AnyS8ToFloat(int srcoff);
|
||||
void Jit_AnyS16ToFloat(int srcoff);
|
||||
void Jit_AnyU8ToFloat(int srcoff);
|
||||
void Jit_AnyU16ToFloat(int srcoff);
|
||||
void Jit_AnyS8Morph(int srcoff, int dstoff);
|
||||
void Jit_AnyS16Morph(int srcoff, int dstoff);
|
||||
void Jit_AnyFloatMorph(int srcoff, int dstoff);
|
||||
|
@ -360,9 +360,28 @@ void VertexDecoderJitCache::Jit_WeightsU16() {
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
|
||||
int j = 0;
|
||||
|
||||
switch (dec_->nweights) {
|
||||
case 4:
|
||||
// We'll at least do the first 4 fast.
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
j = 4;
|
||||
Jit_AnyU8ToFloat(dec_->weightoff);
|
||||
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
|
||||
break;
|
||||
case 8:
|
||||
Jit_AnyU8ToFloat(dec_->weightoff);
|
||||
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
|
||||
Jit_AnyU8ToFloat(dec_->weightoff + 4);
|
||||
MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
|
||||
return;
|
||||
}
|
||||
|
||||
// Basic implementation - a byte at a time. TODO: Optimize
|
||||
int j;
|
||||
for (j = 0; j < dec_->nweights; j++) {
|
||||
for (; j < dec_->nweights; j++) {
|
||||
MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j));
|
||||
CVTSI2SS(fpScratchReg, R(tempReg1));
|
||||
MULSS(fpScratchReg, M(&by128));
|
||||
@ -375,9 +394,28 @@ void VertexDecoderJitCache::Jit_WeightsU8ToFloat() {
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
|
||||
int j = 0;
|
||||
|
||||
switch (dec_->nweights) {
|
||||
case 4:
|
||||
// We'll at least do the first 4 fast.
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
j = 4;
|
||||
Jit_AnyU16ToFloat(dec_->weightoff);
|
||||
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
|
||||
break;
|
||||
case 8:
|
||||
Jit_AnyU16ToFloat(dec_->weightoff);
|
||||
MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
|
||||
Jit_AnyU16ToFloat(dec_->weightoff + 4 * 2);
|
||||
MOVUPS(MDisp(dstReg, dec_->decFmt.w1off), XMM3);
|
||||
return;
|
||||
}
|
||||
|
||||
// Basic implementation - a short at a time. TODO: Optimize
|
||||
int j;
|
||||
for (j = 0; j < dec_->nweights; j++) {
|
||||
for (; j < dec_->nweights; j++) {
|
||||
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2));
|
||||
CVTSI2SS(fpScratchReg, R(tempReg1));
|
||||
MULSS(fpScratchReg, M(&by32768));
|
||||
@ -1145,6 +1183,35 @@ void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
|
||||
MULPS(XMM3, M(&by32768));
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_AnyU8ToFloat(int srcoff) {
|
||||
if (!cpu_info.bSSE4_1) {
|
||||
XORPS(XMM3, R(XMM3));
|
||||
}
|
||||
MOVD_xmm(XMM1, MDisp(srcReg, srcoff));
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVZXBD(XMM1, R(XMM1));
|
||||
} else {
|
||||
PUNPCKLBW(XMM1, R(XMM3));
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
}
|
||||
CVTDQ2PS(XMM3, R(XMM1));
|
||||
MULPS(XMM3, M(&by128));
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_AnyU16ToFloat(int srcoff) {
|
||||
if (!cpu_info.bSSE4_1) {
|
||||
XORPS(XMM3, R(XMM3));
|
||||
}
|
||||
MOVQ_xmm(XMM1, MDisp(srcReg, srcoff));
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVZXWD(XMM1, R(XMM1));
|
||||
} else {
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
}
|
||||
CVTDQ2PS(XMM3, R(XMM1));
|
||||
MULPS(XMM3, M(&by32768));
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
|
||||
MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
|
||||
PXOR(fpScratchReg4, R(fpScratchReg4));
|
||||
|
Loading…
Reference in New Issue
Block a user