Jit the most common of the "ToFloat" texcoord conversions

This commit is contained in:
Henrik Rydgard 2014-09-10 10:52:59 +02:00
parent 37e3cf362f
commit 0727df6f0a
2 changed files with 36 additions and 0 deletions

View File

@ -51,6 +51,7 @@ static float MEMORY_ALIGNED16(boneMask[4]) = {1.0f, 1.0f, 1.0f, 0.0f};
static const float by128 = 1.0f / 128.0f;
static const float by16384 = 1.0f / 16384.0f;
static const float by32768 = 1.0f / 32768.0f;
using namespace ArmGen;

View File

@ -39,6 +39,10 @@ static const float MEMORY_ALIGNED16( by32768[4] ) = {
static const u32 MEMORY_ALIGNED16( threeMasks[4] ) = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0};
static const u32 MEMORY_ALIGNED16( aOne[4] ) = {0, 0, 0, 0x3F800000};
static const float MEMORY_ALIGNED16(by16384[4]) = {
1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f, 1.0f / 16384.0f,
};
#ifdef _M_X64
#ifdef _WIN32
static const X64Reg tempReg1 = RAX;
@ -90,6 +94,8 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},
@ -98,6 +104,7 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
@ -464,6 +471,26 @@ void VertexDecoderJitCache::Jit_TcU16() {
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}
void VertexDecoderJitCache::Jit_TcU8ToFloat() {
// TODO: The first five instructions could be done in 1 or 2 in SSE4
MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff));
MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1));
CVTSI2SS(fpScratchReg, R(tempReg1));
CVTSI2SS(fpScratchReg2, R(tempReg2));
UNPCKLPS(fpScratchReg, R(fpScratchReg2));
MULPS(fpScratchReg, M(&by128));
MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
}
void VertexDecoderJitCache::Jit_TcU16ToFloat() {
PXOR(fpScratchReg2, R(fpScratchReg2));
MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
PUNPCKLWD(fpScratchReg, R(fpScratchReg2));
CVTDQ2PS(fpScratchReg, R(fpScratchReg));
MULPS(fpScratchReg, M(&by32768));
MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
}
void VertexDecoderJitCache::Jit_TcU16Double() {
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));
@ -525,6 +552,14 @@ void VertexDecoderJitCache::Jit_TcU16Through() {
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}
void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
PXOR(fpScratchReg2, R(fpScratchReg2));
MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
PUNPCKLWD(fpScratchReg, R(fpScratchReg2));
CVTDQ2PS(fpScratchReg, R(fpScratchReg));
MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
}
void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));