Vertex JIT: Add some missing functions to ARM/ARM64 decoders.

This commit is contained in:
Henrik Rydgard 2017-01-25 19:44:17 +01:00
parent f9d2fdac4c
commit 98e0ccf1e1
4 changed files with 74 additions and 5 deletions

View File

@ -120,6 +120,8 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},
{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
@ -646,6 +648,28 @@ void VertexDecoderJitCache::Jit_TcU8Prescale() {
}
}
void VertexDecoderJitCache::Jit_TcU8ToFloat() {
if (cpu_info.bNEON) {
// TODO: Needs testing
ADD(scratchReg, srcReg, dec_->tcoff);
VLD1_lane(I_16, neonScratchReg, scratchReg, 0, false);
VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
} else {
LDRB(tempReg1, srcReg, dec_->tcoff);
LDRB(tempReg2, srcReg, dec_->tcoff + 1);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
}
void VertexDecoderJitCache::Jit_TcU16Prescale() {
if (cpu_info.bNEON) {
// TODO: Needs testing
@ -673,6 +697,27 @@ void VertexDecoderJitCache::Jit_TcU16Prescale() {
}
}
void VertexDecoderJitCache::Jit_TcU16ToFloat() {
if (cpu_info.bNEON) {
// TODO: Needs testing
ADD(scratchReg, srcReg, dec_->tcoff);
VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
ADD(scratchReg2, dstReg, dec_->decFmt.uvoff);
VST1(F_32, neonScratchReg, scratchReg2, 1, ALIGN_NONE);
} else {
LDRH(tempReg1, srcReg, dec_->tcoff);
LDRH(tempReg2, srcReg, dec_->tcoff + 2);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
VSTR(fpScratchReg, dstReg, dec_->decFmt.uvoff);
VSTR(fpScratchReg2, dstReg, dec_->decFmt.uvoff + 4);
}
}
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
if (cpu_info.bNEON) {
ADD(scratchReg, srcReg, dec_->tcoff);

View File

@ -92,11 +92,15 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
{&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double},
{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
@ -631,6 +635,14 @@ void VertexDecoderJitCache::Jit_TcU8Prescale() {
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
void VertexDecoderJitCache::Jit_TcU8ToFloat() {
fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff);
fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit
fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
void VertexDecoderJitCache::Jit_TcU16Prescale() {
fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
@ -640,6 +652,13 @@ void VertexDecoderJitCache::Jit_TcU16Prescale() {
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
void VertexDecoderJitCache::Jit_TcU16ToFloat() {
fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
fp.LDUR(64, neonScratchRegD, srcReg, dec_->tcoff);
fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA

View File

@ -106,6 +106,11 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
{&VertexDecoder::Step_TcU8MorphToFloat, &VertexDecoderJitCache::Jit_TcU8MorphToFloat},
{&VertexDecoder::Step_TcU16MorphToFloat, &VertexDecoderJitCache::Jit_TcU16MorphToFloat},
{&VertexDecoder::Step_TcFloatMorph, &VertexDecoderJitCache::Jit_TcFloatMorph},
@ -113,11 +118,6 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_TcU16PrescaleMorph, &VertexDecoderJitCache::Jit_TcU16PrescaleMorph},
{&VertexDecoder::Step_TcFloatPrescaleMorph, &VertexDecoderJitCache::Jit_TcFloatPrescaleMorph},
{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
{&VertexDecoder::Step_NormalS8ToFloat, &VertexDecoderJitCache::Jit_NormalS8ToFloat},
{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
@ -244,6 +244,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
JumpTarget loopStart = GetCodePtr();
for (int i = 0; i < dec.numSteps_; i++) {
if (!CompileStep(dec, i)) {
EndWrite();
// Reset the code ptr and return zero to indicate that we failed.
SetCodePtr(const_cast<u8 *>(start));
return 0;

View File

@ -128,6 +128,10 @@ DrawEngineGLES::DrawEngineGLES()
dcid_(0),
fboTexNeedBind_(false),
fboTexBound_(false) {
decOptions_.expandAllWeightsToFloat = false;
decOptions_.expand8BitNormalsToFloat = false;
decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL;
bufferDecimationCounter_ = VERTEXCACHE_NAME_DECIMATION_INTERVAL;
// Allocate nicely aligned memory. Maybe graphics drivers will