diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index 4c485650a5..b3d173e42a 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -787,6 +787,9 @@ struct JitLookup { JitStepFunction jitFunc; }; +static const float by128 = 1.0f / 128.0f; +static const float by32768 = 1.0f / 32768.0f; + #ifdef ARM using namespace ArmGen; @@ -798,6 +801,12 @@ static const ARMReg scratchReg = R6; static const ARMReg srcReg = R0; static const ARMReg dstReg = R1; static const ARMReg counterReg = R2; +static const ARMReg fpScratchReg = S4; +static const ARMReg fpScratchReg2 = S5; +static const ARMReg fpUscaleReg = S0; +static const ARMReg fpVscaleReg = S1; +static const ARMReg fpUoffsetReg = S2; +static const ARMReg fpVoffsetReg = S3; static const JitLookup jitLookup[] = { {&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8}, @@ -808,6 +817,10 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16}, {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, + {&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale}, + {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale}, + {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale}, + {&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through}, {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough}, @@ -830,19 +843,40 @@ static const JitLookup jitLookup[] = { }; JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { - // return 0; - dec_ = &dec; const u8 *start = this->GetCodePtr(); - // TODO: Test and make work + bool prescaleStep = false; + // Look for prescaled texcoord steps + for (int i = 0; i < dec.numSteps_; i++) { + if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale || + dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale || + dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) { + prescaleStep = true; + } + } SetCC(CC_AL); PUSH(6, R4, R5, R6, R7, R8, _LR); - // Preserving our FP scratch register appears to improve stability. - VMOV(R7, S0); + // Keep the scale/offset in a few fp registers if we need it. 
+ if (prescaleStep) { + MOVI2R(R3, (u32)(&gstate_c.uv), scratchReg); + VLDR(fpUscaleReg, R3, 0); + VLDR(fpVscaleReg, R3, 4); + VLDR(fpUoffsetReg, R3, 8); + VLDR(fpVoffsetReg, R3, 12); + if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { + MOVI2F(fpScratchReg, by128, scratchReg); + VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg); + VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg); + } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { + MOVI2F(fpScratchReg, by32768, scratchReg); + VMUL(fpUscaleReg, fpUscaleReg, fpScratchReg); + VMUL(fpVscaleReg, fpVscaleReg, fpScratchReg); + } + } JumpTarget loopStart = GetCodePtr(); for (int i = 0; i < dec.numSteps_; i++) { @@ -861,17 +895,14 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { SUBS(counterReg, counterReg, 1); B_CC(CC_NEQ, loopStart); - VMOV(S0, R7); // restore our fp scratch - - EOR(R0, R0, R0); POP(6, R4, R5, R6, R7, R8, _PC); FlushIcache(); - // DisassembleArm(start, GetCodePtr() - start); - // char temp[1024] = {0}; - // dec.ToString(temp); - // INFO_LOG(HLE, "%s", temp); + DisassembleArm(start, GetCodePtr() - start); + char temp[1024] = {0}; + dec.ToString(temp); + INFO_LOG(HLE, "%s", temp); return (JittedVertexDecoder)start; } @@ -962,6 +993,51 @@ void VertexDecoderJitCache::Jit_TcFloatThrough() { STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4); } +void VertexDecoderJitCache::Jit_TcU8Prescale() { + // TODO: SIMD + LDRB(tempReg1, srcReg, dec_->tcoff); + LDRB(tempReg2, srcReg, dec_->tcoff + 1); + VMOV(fpScratchReg, tempReg1); + VMOV(fpScratchReg2, tempReg2); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT); + VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT); + // Could replace VMUL + VADD with VMLA but would require 2 more regs as we don't want to destroy fp*offsetReg. Later. 
+ VMUL(fpScratchReg, fpScratchReg, fpUscaleReg); + VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg); + VADD(fpScratchReg, fpScratchReg, fpUoffsetReg); + VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg); + VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff); + VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4); +} + +void VertexDecoderJitCache::Jit_TcU16Prescale() { + // TODO: SIMD + LDRH(tempReg1, srcReg, dec_->tcoff); + LDRH(tempReg2, srcReg, dec_->tcoff + 2); + VMOV(fpScratchReg, tempReg1); + VMOV(fpScratchReg2, tempReg2); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT); + VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT); + VMUL(fpScratchReg, fpScratchReg, fpUscaleReg); + VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg); + VADD(fpScratchReg, fpScratchReg, fpUoffsetReg); + VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg); + VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff); + VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4); +} + +void VertexDecoderJitCache::Jit_TcFloatPrescale() { + // TODO: SIMD + VLDR(fpScratchReg, srcReg, dec_->tcoff); + VLDR(fpScratchReg2, srcReg, dec_->tcoff + 4); + VMUL(fpScratchReg, fpScratchReg, fpUscaleReg); + VMUL(fpScratchReg2, fpScratchReg2, fpVscaleReg); + VADD(fpScratchReg, fpScratchReg, fpUoffsetReg); + VADD(fpScratchReg2, fpScratchReg2, fpVoffsetReg); + VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff); + VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4); +} + void VertexDecoderJitCache::Jit_Color8888() { LDR(tempReg1, srcReg, dec_->coloff); STR(tempReg1, dstReg, dec_->decFmt.c0off); @@ -1080,23 +1156,23 @@ void VertexDecoderJitCache::Jit_PosS8Through() { LDRSB(tempReg3, srcReg, dec_->posoff + 2); static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 }; for (int i = 0; i < 3; i++) { - VMOV(S0, tr[i]); - VCVT(S0, S0, TO_FLOAT | IS_SIGNED); - VSTR(S0, dstReg, dec_->decFmt.posoff + i * 4); + VMOV(fpScratchReg, tr[i]); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED); + VSTR(fpScratchReg, dstReg, dec_->decFmt.posoff + i * 4); } } // Through expands into floats, always. Might want to look at changing this. void VertexDecoderJitCache::Jit_PosS16Through() { + // TODO: SIMD LDRSH(tempReg1, srcReg, dec_->posoff); LDRSH(tempReg2, srcReg, dec_->posoff + 2); LDRSH(tempReg3, srcReg, dec_->posoff + 4); static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 }; - // TODO: SIMD for (int i = 0; i < 3; i++) { - VMOV(S0, tr[i]); - VCVT(S0, S0, TO_FLOAT | IS_SIGNED); - VSTR(S0, dstReg, dec_->decFmt.posoff + i * 4); + VMOV(fpScratchReg, tr[i]); + VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED); + VSTR(fpScratchReg, dstReg, dec_->decFmt.posoff + i * 4); } } @@ -1160,6 +1236,14 @@ static const X64Reg dstReg = EDI; static const X64Reg counterReg = ECX; #endif +// XMM0-XMM5 are volatile on Windows X64 +// XMM0-XMM7 are arguments (and thus volatile) on System V ABI (other x64 platforms) +static const X64Reg fpUscaleReg = XMM0; +static const X64Reg fpVscaleReg = XMM1; +static const X64Reg fpUoffsetReg = XMM2; +static const X64Reg fpVoffsetReg = XMM3; +static const X64Reg fpScratchReg = XMM4; +static const X64Reg fpScratchReg2 = XMM5; // To debug, just comment them out one at a time until it works. We fall back // on the interpreter if the compiler fails. 
@@ -1173,6 +1257,10 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16}, {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, + {&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale}, + {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale}, + {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale}, + {&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through}, {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough}, @@ -1194,6 +1282,13 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat}, }; +// TODO: This should probably be global... +#ifdef _M_X64 +#define PTRBITS 64 +#else +#define PTRBITS 32 +#endif + JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { dec_ = &dec; const u8 *start = this->GetCodePtr(); @@ -1211,6 +1306,44 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4)); MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8)); #endif + + // Save XMM4/XMM5 which apparently can be problematic? + // Actually, if they are, it must be a compiler bug because they SHOULD be ok. + // So I won't bother. + // SUB(PTRBITS, R(ESP), Imm8(32)); + // MOVUPS(MDisp(ESP, 0), XMM4); + // MOVUPS(MDisp(ESP, 16), XMM5); + + bool prescaleStep = false; + // Look for prescaled texcoord steps + for (int i = 0; i < dec.numSteps_; i++) { + if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale || + dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale || + dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) { + prescaleStep = true; + } + } + + // Keep the scale/offset in a few fp registers if we need it. + if (prescaleStep) { +#ifdef _M_X64 + MOV(64, R(tempReg1), Imm64((u64)(&gstate_c.uv))); +#else + MOV(32, R(tempReg1), Imm32((u32)(&gstate_c.uv))); +#endif + MOVSS(fpUscaleReg, MDisp(tempReg1, 0)); + MOVSS(fpVscaleReg, MDisp(tempReg1, 4)); + MOVSS(fpUoffsetReg, MDisp(tempReg1, 8)); + MOVSS(fpVoffsetReg, MDisp(tempReg1, 12)); + if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) { + MULSS(fpUscaleReg, M((void *)&by128)); + MULSS(fpVscaleReg, M((void *)&by128)); + } else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) { + MULSS(fpUscaleReg, M((void *)&by32768)); + MULSS(fpVscaleReg, M((void *)&by32768)); + } + } + // Let's not bother with a proper stack frame. We just grab the arguments and go. 
JumpTarget loopStart = GetCodePtr(); for (int i = 0; i < dec.numSteps_; i++) { @@ -1221,16 +1354,15 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { } } -#ifdef _M_X64 - ADD(64, R(srcReg), Imm32(dec.VertexSize())); - ADD(64, R(dstReg), Imm32(dec.decFmt.stride)); -#else - ADD(32, R(srcReg), Imm32(dec.VertexSize())); - ADD(32, R(dstReg), Imm32(dec.decFmt.stride)); -#endif + ADD(PTRBITS, R(srcReg), Imm32(dec.VertexSize())); + ADD(PTRBITS, R(dstReg), Imm32(dec.decFmt.stride)); SUB(32, R(counterReg), Imm8(1)); J_CC(CC_NZ, loopStart, true); + // MOVUPS(XMM4, MDisp(ESP, 0)); + // MOVUPS(XMM5, MDisp(ESP, 16)); + // ADD(PTRBITS, R(ESP), Imm8(32)); + #ifdef _M_IX86 // Restore register values POP(EBP); @@ -1305,6 +1437,46 @@ void VertexDecoderJitCache::Jit_TcFloat() { #endif } +void VertexDecoderJitCache::Jit_TcU8Prescale() { + // TODO: SIMD + MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->tcoff)); + MOVZX(32, 8, tempReg2, MDisp(srcReg, dec_->tcoff + 1)); + CVTSI2SS(fpScratchReg, R(tempReg1)); + CVTSI2SS(fpScratchReg2, R(tempReg2)); + MULSS(fpScratchReg, R(fpUscaleReg)); + MULSS(fpScratchReg2, R(fpVscaleReg)); + ADDSS(fpScratchReg, R(fpUoffsetReg)); + ADDSS(fpScratchReg2, R(fpVoffsetReg)); + MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); + MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2); +} + +void VertexDecoderJitCache::Jit_TcU16Prescale() { + // TODO: SIMD + MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff)); + MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->tcoff + 2)); + CVTSI2SS(fpScratchReg, R(tempReg1)); + CVTSI2SS(fpScratchReg2, R(tempReg2)); + MULSS(fpScratchReg, R(fpUscaleReg)); + MULSS(fpScratchReg2, R(fpVscaleReg)); + ADDSS(fpScratchReg, R(fpUoffsetReg)); + ADDSS(fpScratchReg2, R(fpVoffsetReg)); + MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); + MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2); +} + +void VertexDecoderJitCache::Jit_TcFloatPrescale() { + // TODO: SIMD + MOVSS(fpScratchReg, MDisp(srcReg, dec_->tcoff)); + MOVSS(fpScratchReg2, MDisp(srcReg, dec_->tcoff + 4)); + MULSS(fpScratchReg, R(fpUscaleReg)); + MULSS(fpScratchReg2, R(fpVscaleReg)); + ADDSS(fpScratchReg, R(fpUoffsetReg)); + ADDSS(fpScratchReg2, R(fpVoffsetReg)); + MOVSS(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); + MOVSS(MDisp(dstReg, dec_->decFmt.uvoff + 4), fpScratchReg2); +} + void VertexDecoderJitCache::Jit_TcU16Through() { MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff)); MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); @@ -1464,8 +1636,8 @@ void VertexDecoderJitCache::Jit_PosS8Through() { // TODO: SIMD for (int i = 0; i < 3; i++) { MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i)); - CVTSI2SS(XMM0, R(tempReg1)); - MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), XMM0); + CVTSI2SS(fpScratchReg, R(tempReg1)); + MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg); } } @@ -1474,8 +1646,8 @@ void VertexDecoderJitCache::Jit_PosS16Through() { // TODO: SIMD for (int i = 0; i < 3; i++) { MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff + i * 2)); - CVTSI2SS(XMM0, R(tempReg1)); - MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), XMM0); + CVTSI2SS(fpScratchReg, R(tempReg1)); + MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg); } } diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h index 2e496fd871..940920252f 100644 --- a/GPU/GLES/VertexDecoder.h +++ b/GPU/GLES/VertexDecoder.h @@ -57,6 +57,7 @@ public: void DecodeVerts(u8 *decoded, const void *verts, int indexLowerBound, int indexUpperBound) const; 
bool hasColor() const { return col != 0; } + bool hasTexcoord() const { return tc != 0; } int VertexSize() const { return size; } // PSP format size void Step_WeightsU8() const; @@ -190,6 +191,10 @@ public: void Jit_TcU16(); void Jit_TcFloat(); + void Jit_TcU8Prescale(); + void Jit_TcU16Prescale(); + void Jit_TcFloatPrescale(); + void Jit_TcU16Through(); void Jit_TcFloatThrough(); @@ -211,4 +216,4 @@ public: private: bool CompileStep(const VertexDecoder &dec, int i); const VertexDecoder *dec_; -}; \ No newline at end of file +};
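
Some notes on the above, with sketches rather than anything authoritative.

The prescale-step scan is open-coded identically in the ARM and x86 Compile() implementations. It could be hoisted into a small query on VertexDecoder; here is a sketch with a hypothetical helper name (the loop body is lifted verbatim from the patch):

```cpp
// Hypothetical helper on VertexDecoder, not part of this patch. Both
// Compile() implementations could then replace the open-coded scan with
//   bool prescaleStep = dec.HasPrescaledTexcoordStep();
bool VertexDecoder::HasPrescaledTexcoordStep() const {
	for (int i = 0; i < numSteps_; i++) {
		if (steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
		    steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
		    steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
			return true;
		}
	}
	return false;
}
```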
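The per-vertex work in the new prescale steps is one multiply-add per component: the by128/by32768 constants fold the u8/u16 normalization into the UV scale once, before the decode loop, so no per-vertex divide or extra multiply is paid. Note also that the scale/offset loads are emitted inside the jitted function, ahead of loopStart, so gstate_c.uv is read at call time and a compiled decoder stays valid across draws with different UV transforms. A scalar C++ sketch of the u16 case, assuming gstate_c.uv holds four consecutive floats in the order implied by the 0/4/8/12 load offsets in Compile():

```cpp
#include <cstdint>

// Assumed layout, matching the VLDR/MOVSS offsets 0/4/8/12 in Compile().
struct UVScale {
	float uScale, vScale;
	float uOff, vOff;
};

// Scalar equivalent of Jit_TcU16Prescale: the 1/32768 normalization is
// folded into the scale once per call (as Compile() does with by32768),
// leaving a single mul-add per component inside the loop.
static void DecodeTcU16Prescale(const uint16_t *src, float *dst, const UVScale &uv) {
	const float uscale = uv.uScale * (1.0f / 32768.0f);
	const float vscale = uv.vScale * (1.0f / 32768.0f);
	dst[0] = src[0] * uscale + uv.uOff;
	dst[1] = src[1] * vscale + uv.vOff;
}
```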
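On the VMLA remark in Jit_TcU8Prescale: VMLA accumulates into its destination (Sd += Sn * Sm), so the destination must start out holding the offset, and because fpUoffsetReg/fpVoffsetReg have to stay live across vertices, each iteration would first copy them into two extra scratch registers. A sketch of that variant, assuming two more free single-precision registers (hypothetical fpScratchReg3/fpScratchReg4, e.g. S6/S7) and a three-operand VMLA in the emitter:

```cpp
// Hypothetical VMLA tail for Jit_TcU8Prescale; assumes fpScratchReg3 and
// fpScratchReg4 are free and that VMLA(Sd, Sn, Sm) is available.
VMOV(fpScratchReg3, fpUoffsetReg);                // start from uOff
VMOV(fpScratchReg4, fpVoffsetReg);                // start from vOff
VMLA(fpScratchReg3, fpScratchReg, fpUscaleReg);   // += u * uScale
VMLA(fpScratchReg4, fpScratchReg2, fpVscaleReg);  // += v * vScale
VSTR(fpScratchReg3, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg4, dstReg, dec_->decFmt.uvoff + 4);
```

This trades the two VADDs for two VMOVs, so the instruction count is unchanged; it only pays off if the VMLAs schedule better than the VMUL/VADD pairs, which is presumably why the patch defers it with "Later."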
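Finally, the entry-point shape can be read off the argument registers: srcReg/dstReg/counterReg map to R0/R1/R2 on ARM, and the x86 path loads the same three values from the stack. The typedef below is inferred from that, not copied from the header, so treat it as an assumption:

```cpp
// Inferred signature of the jitted entry point: (src, dst, count).
typedef void (*JittedVertexDecoder)(const u8 *srcVerts, u8 *decodedVerts, int count);
```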