mirror of https://github.com/hrydgard/ppsspp.git
Save a couple of registers in the x86 vertex decoder jit by SIMD-ing prescale UV

parent 7e67476b00
commit 4f78eda23b
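The idea, sketched in plain C++ with SSE intrinsics rather than the emitter calls in the diff below (illustrative only; PrescaleUV and its arguments are made-up names): the u/v scale pair and the u/v offset pair each live in one XMM register, so both texture coordinates are prescaled with a single packed multiply-add instead of two scalar ones, freeing two XMM registers for other work.

#include <xmmintrin.h>

// scale  = { uscale,  vscale,  -, - }
// offset = { uoffset, voffset, -, - }
static inline __m128 PrescaleUV(__m128 uv, __m128 scale, __m128 offset) {
	return _mm_add_ps(_mm_mul_ps(uv, scale), offset);  // one MULPS + one ADDPS per vertex
}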
@@ -927,11 +927,11 @@ struct JitLookup {
 	JitStepFunction jitFunc;
 };
 
+#ifdef ARM
+
 static const float by128 = 1.0f / 128.0f;
 static const float by32768 = 1.0f / 32768.0f;
 
-#ifdef ARM
-
 using namespace ArmGen;
 
 static const ARMReg tempReg1 = R3;
@@ -1373,6 +1373,9 @@ void VertexDecoderJitCache::Jit_PosFloat() {
 
 using namespace Gen;
 
+static const float MEMORY_ALIGNED16( by128[4] ) = {1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f};
+static const float MEMORY_ALIGNED16( by32768[4] ) = {1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f};
+
 #ifdef _M_X64
 #ifdef _WIN32
 static const X64Reg tempReg1 = RAX;
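The by128[4] / by32768[4] tables added above are 4-wide and 16-byte aligned because the normalization factor is now applied with MULPS against a memory operand, which reads a full 16-byte vector and, on plain SSE, requires a 16-byte-aligned address. A rough standard-C++ equivalent, using alignas(16) as a stand-in for PPSSPP's MEMORY_ALIGNED16 macro (kBy128 and NormalizeU8Pair are illustrative names):

#include <xmmintrin.h>

alignas(16) static const float kBy128[4] = {
	1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f,
};

static inline __m128 NormalizeU8Pair(__m128 uv) {
	return _mm_mul_ps(uv, _mm_load_ps(kBy128));  // aligned 16-byte load, like MULPS xmm, [mem]
}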
@@ -1400,12 +1403,13 @@ static const X64Reg counterReg = ECX;
 
 // XMM0-XMM5 are volatile on Windows X64
 // XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms)
-static const X64Reg fpUscaleReg = XMM0;
-static const X64Reg fpVscaleReg = XMM1;
-static const X64Reg fpUoffsetReg = XMM2;
-static const X64Reg fpVoffsetReg = XMM3;
-static const X64Reg fpScratchReg = XMM4;
-static const X64Reg fpScratchReg2 = XMM5;
+static const X64Reg fpScaleReg = XMM0;
+static const X64Reg fpOffsetReg = XMM1;
+static const X64Reg fpScratchReg = XMM2;
+static const X64Reg fpScratchReg2 = XMM3;
+// We're gonna keep the current skinning matrix in 3 or 4 XMM regs. Fortunately we easily
+// have space for that now.
+
 
 // To debug, just comment them out one at a time until it works. We fall back
 // on the interpreter if the compiler fails.
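This reassignment is where the registers are saved: the four per-component values collapse into two packed vectors, so the prescale state occupies XMM0/XMM1 instead of XMM0-XMM3, the scratch registers move down to XMM2/XMM3, and XMM4/XMM5 become free (per the new comment, for the skinning matrix). A minimal sketch of the packed lane layout, with a hypothetical struct name:

#include <xmmintrin.h>

struct PackedUVState {
	__m128 scale;   // { uscale,  vscale,  unused, unused } -- was fpUscaleReg + fpVscaleReg
	__m128 offset;  // { uoffset, voffset, unused, unused } -- was fpUoffsetReg + fpVoffsetReg
};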
@@ -1495,16 +1499,16 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
 #else
 		MOV(32, R(tempReg1), Imm32((u32)(&gstate_c.uv)));
 #endif
-		MOVSS(fpUscaleReg, MDisp(tempReg1, 0));
-		MOVSS(fpVscaleReg, MDisp(tempReg1, 4));
-		MOVSS(fpUoffsetReg, MDisp(tempReg1, 8));
-		MOVSS(fpVoffsetReg, MDisp(tempReg1, 12));
+		MOVSS(fpScaleReg, MDisp(tempReg1, 0));
+		MOVSS(fpScratchReg, MDisp(tempReg1, 4));
+		UNPCKLPS(fpScaleReg, R(fpScratchReg));
+		MOVSS(fpOffsetReg, MDisp(tempReg1, 8));
+		MOVSS(fpScratchReg, MDisp(tempReg1, 12));
+		UNPCKLPS(fpOffsetReg, R(fpScratchReg));
 		if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
-			MULSS(fpUscaleReg, M((void *)&by128));
-			MULSS(fpVscaleReg, M((void *)&by128));
+			MULPS(fpScaleReg, M((void *)&by128));
 		} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
-			MULSS(fpUscaleReg, M((void *)&by32768));
-			MULSS(fpVscaleReg, M((void *)&by32768));
+			MULPS(fpScaleReg, M((void *)&by32768));
 		}
 	}
 
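Roughly what the new MOVSS + UNPCKLPS prologue computes, written with intrinsics instead of emitter calls (a sketch, not the emitted code; LoadPackedUV and the uv[4] parameter standing in for gstate_c.uv are assumptions). UNPCKLPS interleaves the low floats of its two operands, which is how two scalar loads become one packed pair:

#include <xmmintrin.h>

// uv[4] mirrors gstate_c.uv: { uscale, vscale, uoffset, voffset }.
static inline void LoadPackedUV(const float uv[4], __m128 *scale, __m128 *offset) {
	__m128 lo = _mm_load_ss(&uv[0]);       // MOVSS fpScaleReg, [tempReg1 + 0]
	__m128 hi = _mm_load_ss(&uv[1]);       // MOVSS fpScratchReg, [tempReg1 + 4]
	*scale = _mm_unpacklo_ps(lo, hi);      // UNPCKLPS -> { uscale, vscale, 0, 0 }
	lo = _mm_load_ss(&uv[2]);              // MOVSS fpOffsetReg, [tempReg1 + 8]
	hi = _mm_load_ss(&uv[3]);              // MOVSS fpScratchReg, [tempReg1 + 12]
	*offset = _mm_unpacklo_ps(lo, hi);     // UNPCKLPS -> { uoffset, voffset, 0, 0 }
}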
@@ -1611,43 +1615,34 @@ void VertexDecoderJitCache::Jit_TcFloat() {
 }
 
 void VertexDecoderJitCache::Jit_TcU8Prescale() {
-	// TODO: SIMD
+	// TODO: The first five instructions could be done in 1 or 2 in SSE4
 	MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff));
 	MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1));
 	CVTSI2SS(fpScratchReg, R(tempReg1));
 	CVTSI2SS(fpScratchReg2, R(tempReg2));
-	MULSS(fpScratchReg, R(fpUscaleReg));
-	MULSS(fpScratchReg2, R(fpVscaleReg));
-	ADDSS(fpScratchReg, R(fpUoffsetReg));
-	ADDSS(fpScratchReg2, R(fpVoffsetReg));
-	MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
-	MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2);
+	UNPCKLPS(fpScratchReg, R(fpScratchReg2));
+	MULPS(fpScratchReg, R(fpScaleReg));
+	ADDPS(fpScratchReg, R(fpOffsetReg));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
 }
 
 void VertexDecoderJitCache::Jit_TcU16Prescale() {
-	// TODO: SIMD
+	// TODO: The first five instructions could be done in 1 or 2 in SSE4 and probably in 3 in SSE2
 	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
 	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2));
 	CVTSI2SS(fpScratchReg, R(tempReg1));
 	CVTSI2SS(fpScratchReg2, R(tempReg2));
-	MULSS(fpScratchReg, R(fpUscaleReg));
-	MULSS(fpScratchReg2, R(fpVscaleReg));
-	ADDSS(fpScratchReg, R(fpUoffsetReg));
-	ADDSS(fpScratchReg2, R(fpVoffsetReg));
-	MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
-	MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2);
+	UNPCKLPS(fpScratchReg, R(fpScratchReg2));
+	MULPS(fpScratchReg, R(fpScaleReg));
+	ADDPS(fpScratchReg, R(fpOffsetReg));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
 }
 
 void VertexDecoderJitCache::Jit_TcFloatPrescale() {
-	// TODO: SIMD
-	MOVSS(fpScratchReg, MDisp(srcReg, dec_->tcoff));
-	MOVSS(fpScratchReg2, MDisp(srcReg, dec_->tcoff + 4));
-	MULSS(fpScratchReg, R(fpUscaleReg));
-	MULSS(fpScratchReg2, R(fpVscaleReg));
-	ADDSS(fpScratchReg, R(fpUoffsetReg));
-	ADDSS(fpScratchReg2, R(fpVoffsetReg));
-	MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
-	MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2);
+	MOVQ_xmm(fpScratchReg, MDisp(srcReg, dec_->tcoff));
+	MULPS(fpScratchReg, R(fpScaleReg));
+	ADDPS(fpScratchReg, R(fpOffsetReg));
+	MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg);
 }
 
 void VertexDecoderJitCache::Jit_TcU16Through() {
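For reference, a per-vertex view of what the reworked Jit_TcU16Prescale path computes, approximated with intrinsics (a sketch assuming scale/offset hold the packed pairs set up in Compile(); DecodeTcU16Prescale is an illustrative name, not PPSSPP code): the two scalar integer-to-float converts remain, but the multiply, add, and store each become one packed operation.

#include <xmmintrin.h>

static inline void DecodeTcU16Prescale(const unsigned short *src, float *dst,
                                       __m128 scale, __m128 offset) {
	// MOVZX is implicit in the unsigned short -> int promotion; CVTSI2SS follows.
	__m128 u = _mm_cvtsi32_ss(_mm_setzero_ps(), src[0]);
	__m128 v = _mm_cvtsi32_ss(_mm_setzero_ps(), src[1]);
	__m128 uv = _mm_unpacklo_ps(u, v);               // { u, v, 0, 0 }
	uv = _mm_add_ps(_mm_mul_ps(uv, scale), offset);  // one MULPS + one ADDPS
	_mm_storel_pi((__m64 *)dst, uv);                 // MOVQ: store u and v together
}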