mirror of
https://github.com/libretro/ppsspp.git
synced 2025-01-26 03:04:20 +00:00
arm: JIT through-mode 16-bit texcoords.
This vertex format is popular, and jitting it makes decoding such vertices much faster.
This commit is contained in:
parent
257f8dbbc6
commit
7699fa55de
@ -127,7 +127,7 @@ static const JitLookup jitLookup[] = {
|
||||
{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
|
||||
|
||||
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
|
||||
// {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
|
||||
{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
|
||||
|
||||
{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
|
||||
{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
|
||||
@ -568,6 +568,43 @@ void VertexDecoderJitCache::Jit_TcFloat() {
|
||||
STR(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
|
||||
LDRH(tempReg1, srcReg, dec_->tcoff);
|
||||
LDRH(tempReg2, srcReg, dec_->tcoff + 2);
|
||||
|
||||
MOVP2R(scratchReg, &gstate_c.vertBounds.minU);
|
||||
|
||||
auto updateSide = [&](ARMReg r, CCFlags cc, u32 off) {
|
||||
LDRH(tempReg3, scratchReg, off);
|
||||
CMP(r, tempReg3);
|
||||
SetCC(cc);
|
||||
STRH(r, scratchReg, off);
|
||||
SetCC(CC_AL);
|
||||
};
|
||||
|
||||
// TODO: Can this actually be fast? Hmm, floats aren't better.
|
||||
updateSide(tempReg1, CC_LT, offsetof(KnownVertexBounds, minU));
|
||||
updateSide(tempReg1, CC_GT, offsetof(KnownVertexBounds, maxU));
|
||||
updateSide(tempReg2, CC_LT, offsetof(KnownVertexBounds, minV));
|
||||
updateSide(tempReg2, CC_GT, offsetof(KnownVertexBounds, maxV));
|
||||
|
||||
if (cpu_info.bNEON) {
|
||||
ADD(scratchReg, srcReg, dec_->tcoff);
|
||||
VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
|
||||
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
|
||||
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
|
||||
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
|
||||
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
|
||||
} else {
|
||||
VMOV(fpScratchReg, tempReg1);
|
||||
VMOV(fpScratchReg2, tempReg2);
|
||||
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
|
||||
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
|
||||
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
|
||||
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
|
||||
}
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_TcFloatThrough() {
|
||||
LDR(tempReg1, srcReg, dec_->tcoff);
|
||||
LDR(tempReg2, srcReg, dec_->tcoff + 4);
|
||||
|
@ -101,7 +101,7 @@ static const JitLookup jitLookup[] = {
|
||||
{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
|
||||
|
||||
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
|
||||
// {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
|
||||
{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
|
||||
|
||||
{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
|
||||
{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
|
||||
@ -579,6 +579,26 @@ void VertexDecoderJitCache::Jit_Color5551() {
|
||||
CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
|
||||
LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
|
||||
LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);
|
||||
|
||||
auto updateSide = [&](ARM64Reg src, CCFlags cc, ARM64Reg dst) {
|
||||
CMP(src, dst);
|
||||
CSEL(dst, src, dst, cc);
|
||||
};
|
||||
|
||||
updateSide(tempReg1, CC_LT, boundsMinUReg);
|
||||
updateSide(tempReg1, CC_GT, boundsMaxUReg);
|
||||
updateSide(tempReg2, CC_LT, boundsMinVReg);
|
||||
updateSide(tempReg2, CC_GT, boundsMaxVReg);
|
||||
|
||||
fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
|
||||
fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
|
||||
fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
|
||||
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_TcFloatThrough() {
|
||||
LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
|
||||
STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
|
||||
|
Loading…
x
Reference in New Issue
Block a user