vertexjit: Implement Morph pos/nrm variants on x86.

This commit is contained in:
Unknown W. Brackets 2014-03-02 20:11:40 -08:00
parent 8aceba732a
commit fb63dad54e
2 changed files with 125 additions and 0 deletions

View File

@ -245,6 +245,18 @@ public:
void Jit_PosS16Skin();
void Jit_PosFloatSkin();
// Shared morph emitters. Each one blends dec_->morphcount source vertices,
// weighted by gstate_c.morphWeights, reading the component at srcoff within
// each source vertex and storing the float result at dstoff in the output.
void Jit_AnyS8Morph(int srcoff, int dstoff);
void Jit_AnyS16Morph(int srcoff, int dstoff);
void Jit_AnyFloatMorph(int srcoff, int dstoff);
// Normal morph steps: forward the normal offsets to the shared emitters above.
void Jit_NormalS8Morph();
void Jit_NormalS16Morph();
void Jit_NormalFloatMorph();
// Position morph steps: forward the position offsets to the shared emitters above.
void Jit_PosS8Morph();
void Jit_PosS16Morph();
void Jit_PosFloatMorph();
private:
bool CompileStep(const VertexDecoder &dec, int i);
void Jit_ApplyWeights();

View File

@ -27,12 +27,18 @@ static float MEMORY_ALIGNED16(bones[16 * 8]);
using namespace Gen;
// Reciprocal scale factors, each replicated across four float lanes and
// declared through MEMORY_ALIGNED16 so they can be used directly as packed
// SSE memory operands (e.g. MULPS(reg, M(by127))).
static const float MEMORY_ALIGNED16(by127[4]) = {
	1.0f / 127.0f,
	1.0f / 127.0f,
	1.0f / 127.0f,
	1.0f / 127.0f,
};
static const float MEMORY_ALIGNED16(by128[4]) = {
	1.0f / 128.0f,
	1.0f / 128.0f,
	1.0f / 128.0f,
	1.0f / 128.0f,
};
static const float MEMORY_ALIGNED16(by256[4]) = {
	1.0f / 256.0f,
	1.0f / 256.0f,
	1.0f / 256.0f,
	1.0f / 256.0f,
};
static const float MEMORY_ALIGNED16(by32767[4]) = {
	1.0f / 32767.0f,
	1.0f / 32767.0f,
	1.0f / 32767.0f,
	1.0f / 32767.0f,
};
static const float MEMORY_ALIGNED16(by32768[4]) = {
	1.0f / 32768.0f,
	1.0f / 32768.0f,
	1.0f / 32768.0f,
	1.0f / 32768.0f,
};
@ -125,6 +131,14 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
{&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
{&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},
{&VertexDecoder::Step_NormalS8Morph, &VertexDecoderJitCache::Jit_NormalS8Morph},
{&VertexDecoder::Step_NormalS16Morph, &VertexDecoderJitCache::Jit_NormalS16Morph},
{&VertexDecoder::Step_NormalFloatMorph, &VertexDecoderJitCache::Jit_NormalFloatMorph},
{&VertexDecoder::Step_PosS8Morph, &VertexDecoderJitCache::Jit_PosS8Morph},
{&VertexDecoder::Step_PosS16Morph, &VertexDecoderJitCache::Jit_PosS16Morph},
{&VertexDecoder::Step_PosFloatMorph, &VertexDecoderJitCache::Jit_PosFloatMorph},
};
// TODO: This should probably be global...
@ -841,6 +855,105 @@ void VertexDecoderJitCache::Jit_PosFloatSkin() {
Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
}
// Emits code that morphs a signed 8-bit vector component.
//
// For each of the dec_->morphcount morph targets, the bytes at srcoff inside
// that target's vertex are widened to floats, scaled by the target's weight
// times 1/127, and summed. The sum is stored as floats at dstoff in the
// destination vertex.
//
// srcoff: byte offset of the component inside each source vertex.
// dstoff: byte offset of the result inside the decoded vertex.
void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
	const int targetCount = dec_->morphcount;

	// fpScratchReg is the running sum; tempReg1 addresses the weight table.
	// TODO: Optimize the first one to skip an ADDPS.
	XORPS(fpScratchReg, R(fpScratchReg));
	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));

	for (int target = 0; target < targetCount; ++target) {
		const int vertexDisp = dec_->onesize_ * target + srcoff;
		const int weightDisp = static_cast<int>(sizeof(float)) * target;

		// Widen the bytes to 32-bit lanes by interleaving with zero, then
		// shift up and arithmetically back down to restore the sign.
		// fpScratchReg3 must be re-zeroed each pass because it is reused for
		// the weight below.
		XORPS(fpScratchReg3, R(fpScratchReg3));
		MOVD_xmm(fpScratchReg2, MDisp(srcReg, vertexDisp));
		PUNPCKLBW(fpScratchReg2, R(fpScratchReg3));
		PUNPCKLWD(fpScratchReg2, R(fpScratchReg3));
		PSLLD(fpScratchReg2, 24);
		PSRAD(fpScratchReg2, 24); // Ugly sign extension, can be done faster in SSE4
		CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));

		// Fold the 1.0f/127.0f scale into this target's weight and broadcast
		// it. Only lane 0 of the loaded weights survives the shuffle.
		MOVUPS(fpScratchReg3, MDisp(tempReg1, weightDisp));
		MULPS(fpScratchReg3, M(by127));
		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));

		// Accumulate the weighted contribution.
		MULPS(fpScratchReg2, R(fpScratchReg3));
		ADDPS(fpScratchReg, R(fpScratchReg2));
	}

	// TODO: Is it okay that we're over-writing by 4 bytes? Probably...
	MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
}
// Emits code that morphs a signed 16-bit vector component.
//
// For each of the dec_->morphcount morph targets, the 16-bit values at srcoff
// inside that target's vertex are widened to floats, scaled by the target's
// weight times 1/32767, and summed. The sum is stored as floats at dstoff in
// the destination vertex.
//
// srcoff: byte offset of the component inside each source vertex.
// dstoff: byte offset of the result inside the decoded vertex.
void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
	const int targetCount = dec_->morphcount;

	// fpScratchReg is the running sum; tempReg1 addresses the weight table.
	// TODO: Optimize the first one to skip an ADDPS.
	XORPS(fpScratchReg, R(fpScratchReg));
	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));

	for (int target = 0; target < targetCount; ++target) {
		const int vertexDisp = dec_->onesize_ * target + srcoff;
		const int weightDisp = static_cast<int>(sizeof(float)) * target;

		// Widen the words to 32-bit lanes by interleaving with zero, then
		// shift up and arithmetically back down to restore the sign.
		// fpScratchReg3 must be re-zeroed each pass because it is reused for
		// the weight below.
		XORPS(fpScratchReg3, R(fpScratchReg3));
		MOVQ_xmm(fpScratchReg2, MDisp(srcReg, vertexDisp));
		PUNPCKLWD(fpScratchReg2, R(fpScratchReg3));
		PSLLD(fpScratchReg2, 16);
		PSRAD(fpScratchReg2, 16); // Ugly sign extension, can be done faster in SSE4
		CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));

		// Fold the 1.0f/32767.0f scale into this target's weight and
		// broadcast it. Only lane 0 of the loaded weights survives the shuffle.
		MOVUPS(fpScratchReg3, MDisp(tempReg1, weightDisp));
		MULPS(fpScratchReg3, M(by32767));
		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));

		// Accumulate the weighted contribution.
		MULPS(fpScratchReg2, R(fpScratchReg3));
		ADDPS(fpScratchReg, R(fpScratchReg2));
	}

	// TODO: Is it okay that we're over-writing by 4 bytes? Probably...
	MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
}
// Emits code that morphs a float vector component.
//
// For each of the dec_->morphcount morph targets, the floats at srcoff inside
// that target's vertex are multiplied by the target's weight and summed. The
// sum is stored at dstoff in the destination vertex.
//
// srcoff: byte offset of the component inside each source vertex.
// dstoff: byte offset of the result inside the decoded vertex.
void VertexDecoderJitCache::Jit_AnyFloatMorph(int srcoff, int dstoff) {
	const int targetCount = dec_->morphcount;

	// fpScratchReg is the running sum; tempReg1 addresses the weight table.
	// TODO: Optimize the first one to skip an ADDPS.
	XORPS(fpScratchReg, R(fpScratchReg));
	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));

	for (int target = 0; target < targetCount; ++target) {
		const int vertexDisp = dec_->onesize_ * target + srcoff;
		const int weightDisp = static_cast<int>(sizeof(float)) * target;

		// Load this target's values and its weight; only lane 0 of the
		// loaded weights survives the broadcast.
		MOVUPS(fpScratchReg2, MDisp(srcReg, vertexDisp));
		MOVUPS(fpScratchReg3, MDisp(tempReg1, weightDisp));
		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));

		// Accumulate the weighted contribution.
		MULPS(fpScratchReg2, R(fpScratchReg3));
		ADDPS(fpScratchReg, R(fpScratchReg2));
	}

	// TODO: Is it okay that we're over-writing by 4 bytes? Probably...
	MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
}
// Position morph step for signed 8-bit input: forwards the position source
// and destination offsets to the shared S8 morph emitter.
void VertexDecoderJitCache::Jit_PosS8Morph() {
	const int srcOffset = dec_->posoff;
	const int dstOffset = dec_->decFmt.posoff;
	Jit_AnyS8Morph(srcOffset, dstOffset);
}
// Position morph step for signed 16-bit input: forwards the position source
// and destination offsets to the shared S16 morph emitter.
void VertexDecoderJitCache::Jit_PosS16Morph() {
	const int srcOffset = dec_->posoff;
	const int dstOffset = dec_->decFmt.posoff;
	Jit_AnyS16Morph(srcOffset, dstOffset);
}
// Position morph step for float input: forwards the position source and
// destination offsets to the shared float morph emitter.
void VertexDecoderJitCache::Jit_PosFloatMorph() {
	const int srcOffset = dec_->posoff;
	const int dstOffset = dec_->decFmt.posoff;
	Jit_AnyFloatMorph(srcOffset, dstOffset);
}
// Normal morph step for signed 8-bit input: forwards the normal source and
// destination offsets to the shared S8 morph emitter.
void VertexDecoderJitCache::Jit_NormalS8Morph() {
	const int srcOffset = dec_->nrmoff;
	const int dstOffset = dec_->decFmt.nrmoff;
	Jit_AnyS8Morph(srcOffset, dstOffset);
}
// Normal morph step for signed 16-bit input: forwards the normal source and
// destination offsets to the shared S16 morph emitter.
void VertexDecoderJitCache::Jit_NormalS16Morph() {
	const int srcOffset = dec_->nrmoff;
	const int dstOffset = dec_->decFmt.nrmoff;
	Jit_AnyS16Morph(srcOffset, dstOffset);
}
// Normal morph step for float input: forwards the normal source and
// destination offsets to the shared float morph emitter.
void VertexDecoderJitCache::Jit_NormalFloatMorph() {
	const int srcOffset = dec_->nrmoff;
	const int dstOffset = dec_->decFmt.nrmoff;
	Jit_AnyFloatMorph(srcOffset, dstOffset);
}
bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
// See if we find a matching JIT function
for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {