Jit the most common of the "ToFloat" texcoord conversions

2025-02-26 00:45:49 +00:00 · 2014-09-10 10:52:59 +02:00 · 2014-09-10 10:52:59 +02:00 · 0727df6f0a
commit 0727df6f0a
parent 37e3cf362f
2 changed files with 36 additions and 0 deletions
--- a/GPU/Common/VertexDecoderArm.cpp
+++ b/GPU/Common/VertexDecoderArm.cpp
@ -51,6 +51,7 @@ static float MEMORY_ALIGNED16(boneMask[4]) = {1.0f, 1.0f, 1.0f, 0.0f};


 static const float by128 = 1.0f / 128.0f;
+static const float by16384 = 1.0f / 16384.0f;
 static const float by32768 = 1.0f / 32768.0f;

 using namespace ArmGen;
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@ -39,6 +39,10 @@ static const float MEMORY_ALIGNED16( by32768[4] ) = {
 static const u32 MEMORY_ALIGNED16( threeMasks[4] ) = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0};
 static const u32 MEMORY_ALIGNED16( aOne[4] ) = {0, 0, 0, 0x3F800000};

+static const float MEMORY_ALIGNED16(by16384[4]) = {
+	1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f,
+};
+
 #ifdef _M_X64
 #ifdef _WIN32
 static const X64Reg tempReg1 = RAX;
@ -90,6 +94,8 @@ static const JitLookup jitLookup[] = {

 	{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
 	{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
+	{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
+	{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
 	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
 	{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},

@ -98,6 +104,7 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},

 	{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
+	{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
 	{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},

@ -464,6 +471,26 @@ void VertexDecoderJitCache::Jit_TcU16() {
 	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
 }

+void VertexDecoderJitCache::Jit_TcU8ToFloat() {
+	// TODO: The first five instructions could be done in 1 or 2 in SSE4
+	MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff));
+	MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1));
+	CVTSI2SS(fpScratchReg, R(tempReg1));
+	CVTSI2SS(fpScratchReg2, R(tempReg2));
+	UNPCKLPS(fpScratchReg, R(fpScratchReg2));
+	MULPS(fpScratchReg, M(&by128));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
+}
+
+void VertexDecoderJitCache::Jit_TcU16ToFloat() {
+	PXOR(fpScratchReg2, R(fpScratchReg2));
+	MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
+	PUNPCKLWD(fpScratchReg, R(fpScratchReg2));
+	CVTDQ2PS(fpScratchReg, R(fpScratchReg));
+	MULPS(fpScratchReg, M(&by32768));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
+}
+
 void VertexDecoderJitCache::Jit_TcU16Double() {
 	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
 	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));
@ -525,6 +552,14 @@ void VertexDecoderJitCache::Jit_TcU16Through() {
 	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
 }

+void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
+	PXOR(fpScratchReg2, R(fpScratchReg2));
+	MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
+	PUNPCKLWD(fpScratchReg, R(fpScratchReg2));
+	CVTDQ2PS(fpScratchReg, R(fpScratchReg));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
+}
+
 void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
 	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
 	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));