vertexjit: Implement Morph pos/nrm variants on x86.

2024-11-29 11:20:40 +00:00 · 2014-03-02 20:11:40 -08:00 · 2014-03-02 20:11:40 -08:00 · fb63dad54e
commit fb63dad54e
parent 8aceba732a
2 changed files with 125 additions and 0 deletions
--- a/GPU/GLES/VertexDecoder.h
+++ b/GPU/GLES/VertexDecoder.h
@ -245,6 +245,18 @@ public:
 	void Jit_PosS16Skin();
 	void Jit_PosFloatSkin();

+	// Shared morph emitters.  Each one loops over dec_->morphcount frames, reads the
+	// component vector at srcoff within each frame, scales it by that frame's morph
+	// weight, sums the results, and stores the total at dstoff from the output pointer.
+	// The S8 and S16 variants also normalize by 1/127 and 1/32767 respectively.
+	void Jit_AnyS8Morph(int srcoff, int dstoff);
+	void Jit_AnyS16Morph(int srcoff, int dstoff);
+	void Jit_AnyFloatMorph(int srcoff, int dstoff);
+
+	// Normal variants: forward dec_->nrmoff / dec_->decFmt.nrmoff to the emitters above.
+	void Jit_NormalS8Morph();
+	void Jit_NormalS16Morph();
+	void Jit_NormalFloatMorph();
+
+	// Position variants: forward dec_->posoff / dec_->decFmt.posoff to the emitters above.
+	void Jit_PosS8Morph();
+	void Jit_PosS16Morph();
+	void Jit_PosFloatMorph();
+
 private:
 	bool CompileStep(const VertexDecoder &dec, int i);
 	void Jit_ApplyWeights();
--- a/GPU/GLES/VertexDecoderX86.cpp
+++ b/GPU/GLES/VertexDecoderX86.cpp
@ -27,12 +27,18 @@ static float MEMORY_ALIGNED16(bones[16 * 8]);

 using namespace Gen;

+// Reciprocal scale factors, replicated across four floats.
+// 1/127: used by the s8 morph path to normalize signed 8-bit components.
+static const float MEMORY_ALIGNED16( by127[4] ) = {
+	1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f
+};
 static const float MEMORY_ALIGNED16( by128[4] ) = {
 	1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f
 };
 static const float MEMORY_ALIGNED16( by256[4] ) = {
 	1.0f / 256, 1.0f / 256, 1.0f / 256, 1.0f / 256
 };
+// 1/32767: used by the s16 morph path to normalize signed 16-bit components.
+static const float MEMORY_ALIGNED16( by32767[4] ) = {
+	1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f,
+};
 static const float MEMORY_ALIGNED16( by32768[4] ) = {
 	1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f,
 };
@ -125,6 +131,14 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
 	{&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
 	{&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},
+
+	{&VertexDecoder::Step_NormalS8Morph, &VertexDecoderJitCache::Jit_NormalS8Morph},
+	{&VertexDecoder::Step_NormalS16Morph, &VertexDecoderJitCache::Jit_NormalS16Morph},
+	{&VertexDecoder::Step_NormalFloatMorph, &VertexDecoderJitCache::Jit_NormalFloatMorph},
+
+	{&VertexDecoder::Step_PosS8Morph, &VertexDecoderJitCache::Jit_PosS8Morph},
+	{&VertexDecoder::Step_PosS16Morph, &VertexDecoderJitCache::Jit_PosS16Morph},
+	{&VertexDecoder::Step_PosFloatMorph, &VertexDecoderJitCache::Jit_PosFloatMorph},
 };

 // TODO: This should probably be global...
@ -841,6 +855,105 @@ void VertexDecoderJitCache::Jit_PosFloatSkin() {
 	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
 }

+// Emits code that blends dec_->morphcount morph frames of signed 8-bit components.
+// For each frame n, four bytes are read at srcoff inside that frame (frames are
+// dec_->onesize_ bytes apart), sign-extended, converted to floats, scaled by
+// morphWeights[n] / 127 and accumulated.  The sum is stored as four floats at
+// dstoff from dstReg.  Clobbers tempReg1 and fpScratchReg..fpScratchReg3.
+void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
+	// TODO: Optimize the first one to skip an ADDPS.
+	XORPS(fpScratchReg, R(fpScratchReg));
+
+	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
+
+	for (int n = 0; n < dec_->morphcount; ++n) {
+		// Okay, first convert to floats.
+		XORPS(fpScratchReg3, R(fpScratchReg3));
+		MOVD_xmm(fpScratchReg2, MDisp(srcReg, dec_->onesize_ * n + srcoff));
+		PUNPCKLBW(fpScratchReg2, R(fpScratchReg3));
+		PUNPCKLWD(fpScratchReg2, R(fpScratchReg3));
+		PSLLD(fpScratchReg2, 24);
+		PSRAD(fpScratchReg2, 24); // Ugly sign extension, can be done faster in SSE4
+		CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
+
+		// Now, it's time to multiply by the weight and 1.0f/127.0f.
+		// Only lane 0 survives the broadcast below, so load and scale just that one
+		// float.  A 16-byte load here would read three floats past weight n (past the
+		// end of the weights for the last entries) and multiply unrelated data.
+		MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
+		MULSS(fpScratchReg3, M(by127));
+		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
+
+		MULPS(fpScratchReg2, R(fpScratchReg3));
+		ADDPS(fpScratchReg, R(fpScratchReg2));
+	}
+
+	// TODO: Is it okay that we're over-writing by 4 bytes?  Probably...
+	MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
+}
+
+// Emits code that blends dec_->morphcount morph frames of signed 16-bit components.
+// For each frame n, eight bytes are read at srcoff inside that frame (frames are
+// dec_->onesize_ bytes apart), sign-extended, converted to floats, scaled by
+// morphWeights[n] / 32767 and accumulated.  The sum is stored as four floats at
+// dstoff from dstReg.  Clobbers tempReg1 and fpScratchReg..fpScratchReg3.
+void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
+	// TODO: Optimize the first one to skip an ADDPS.
+	XORPS(fpScratchReg, R(fpScratchReg));
+
+	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
+
+	for (int n = 0; n < dec_->morphcount; ++n) {
+		// Okay, first convert to floats.
+		XORPS(fpScratchReg3, R(fpScratchReg3));
+		MOVQ_xmm(fpScratchReg2, MDisp(srcReg, dec_->onesize_ * n + srcoff));
+		PUNPCKLWD(fpScratchReg2, R(fpScratchReg3));
+		PSLLD(fpScratchReg2, 16);
+		PSRAD(fpScratchReg2, 16); // Ugly sign extension, can be done faster in SSE4
+		CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
+
+		// Now, it's time to multiply by the weight and 1.0f/32767.0f.
+		// Only lane 0 survives the broadcast below, so load and scale just that one
+		// float.  A 16-byte load here would read three floats past weight n (past the
+		// end of the weights for the last entries) and multiply unrelated data.
+		MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
+		MULSS(fpScratchReg3, M(by32767));
+		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
+
+		MULPS(fpScratchReg2, R(fpScratchReg3));
+		ADDPS(fpScratchReg, R(fpScratchReg2));
+	}
+
+	// TODO: Is it okay that we're over-writing by 4 bytes?  Probably...
+	MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
+}
+
+// Emits code that blends dec_->morphcount morph frames of float components.
+// For each frame n, four floats are read at srcoff inside that frame (frames are
+// dec_->onesize_ bytes apart), scaled by morphWeights[n] and accumulated.  The sum
+// is stored as four floats at dstoff from dstReg.
+// Clobbers tempReg1 and fpScratchReg..fpScratchReg3.
+void VertexDecoderJitCache::Jit_AnyFloatMorph(int srcoff, int dstoff) {
+	// TODO: Optimize the first one to skip an ADDPS.
+	XORPS(fpScratchReg, R(fpScratchReg));
+
+	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
+
+	for (int n = 0; n < dec_->morphcount; ++n) {
+		MOVUPS(fpScratchReg2, MDisp(srcReg, dec_->onesize_ * n + srcoff));
+		// Only lane 0 survives the broadcast below, so load just that one weight.
+		// A 16-byte load here would read three floats past weight n (past the end
+		// of the weights for the last entries).
+		MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
+		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
+		MULPS(fpScratchReg2, R(fpScratchReg3));
+		ADDPS(fpScratchReg, R(fpScratchReg2));
+	}
+
+	// TODO: Is it okay that we're over-writing by 4 bytes?  Probably...
+	MOVUPS(MDisp(dstReg, dstoff), fpScratchReg);
+}
+
+// Morphed s8 positions: reads at dec_->posoff, writes at dec_->decFmt.posoff.
+void VertexDecoderJitCache::Jit_PosS8Morph() {
+	const int inputOffset = dec_->posoff;
+	const int outputOffset = dec_->decFmt.posoff;
+	Jit_AnyS8Morph(inputOffset, outputOffset);
+}
+
+// Morphed s16 positions: reads at dec_->posoff, writes at dec_->decFmt.posoff.
+void VertexDecoderJitCache::Jit_PosS16Morph() {
+	const int inputOffset = dec_->posoff;
+	const int outputOffset = dec_->decFmt.posoff;
+	Jit_AnyS16Morph(inputOffset, outputOffset);
+}
+
+// Morphed float positions: reads at dec_->posoff, writes at dec_->decFmt.posoff.
+void VertexDecoderJitCache::Jit_PosFloatMorph() {
+	const int inputOffset = dec_->posoff;
+	const int outputOffset = dec_->decFmt.posoff;
+	Jit_AnyFloatMorph(inputOffset, outputOffset);
+}
+
+// Morphed s8 normals: reads at dec_->nrmoff, writes at dec_->decFmt.nrmoff.
+void VertexDecoderJitCache::Jit_NormalS8Morph() {
+	const int inputOffset = dec_->nrmoff;
+	const int outputOffset = dec_->decFmt.nrmoff;
+	Jit_AnyS8Morph(inputOffset, outputOffset);
+}
+
+// Morphed s16 normals: reads at dec_->nrmoff, writes at dec_->decFmt.nrmoff.
+void VertexDecoderJitCache::Jit_NormalS16Morph() {
+	const int inputOffset = dec_->nrmoff;
+	const int outputOffset = dec_->decFmt.nrmoff;
+	Jit_AnyS16Morph(inputOffset, outputOffset);
+}
+
+// Morphed float normals: reads at dec_->nrmoff, writes at dec_->decFmt.nrmoff.
+void VertexDecoderJitCache::Jit_NormalFloatMorph() {
+	const int inputOffset = dec_->nrmoff;
+	const int outputOffset = dec_->decFmt.nrmoff;
+	Jit_AnyFloatMorph(inputOffset, outputOffset);
+}
+
 bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
 	// See if we find a matching JIT function
 	for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {