From 5496b3d3b14ed9c447bcd3e375be2151a4a643ad Mon Sep 17 00:00:00 2001
From: Henrik Rydgard
Date: Thu, 19 Mar 2015 23:04:46 +0100
Subject: [PATCH] ARM64: Some minor vertex decoder work. Hm, I think SCVTF will actually divide by 128.0, not 127.0 :/

---
 GPU/Common/VertexDecoderArm64.cpp  | 91 +++++++++++++++++++++++++++++-
 GPU/Common/VertexDecoderCommon.cpp |  9 ++-
 GPU/Common/VertexDecoderCommon.h   |  3 +
 3 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index 438369b6bf..d6d285b188 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -53,7 +53,7 @@ static const ARM64Reg fpUVoffsetReg = D1;
 static const ARM64Reg neonScratchReg = D2;
 static const ARM64Reg neonScratchReg2 = D3;
 
-static const ARM64Reg neonScratchRegQ = Q1; // Overlaps with all the scratch regs
+static const ARM64Reg neonScratchRegQ = Q1;
 
 // Everything above S6 is fair game for skinning
 
@@ -85,15 +85,18 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
 	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
-
+	*/
 	{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
+	/*
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
 	{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
+	*/
 
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
 	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
 
+	/*
 	{&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
 	{&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
 	{&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},
@@ -105,9 +108,13 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},
 
 	{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
+	*/
 	{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
+	/*
 	{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},
 	*/
+	{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
+	{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
 	{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
 	/*
 	{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
@@ -240,6 +247,13 @@ void VertexDecoderJitCache::Jit_TcU16() {
 	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
 }
 
+void VertexDecoderJitCache::Jit_TcU16Through() {
+	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
+	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);
+	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16));
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
+}
+
 void VertexDecoderJitCache::Jit_TcFloat() {
 	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
 	LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 4);
@@ -247,6 +261,20 @@ void VertexDecoderJitCache::Jit_TcFloat() {
 	STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.uvoff + 4);
 }
 
+void VertexDecoderJitCache::Jit_PosS8() {
+	Jit_AnyS8ToFloat(dec_->posoff);
+	STR(INDEX_UNSIGNED, src[0], dstReg, dec_->decFmt.posoff);
+	STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 4);
+	STR(INDEX_UNSIGNED, src[2], dstReg, dec_->decFmt.posoff + 8);
+}
+
+void VertexDecoderJitCache::Jit_PosS16() {
+	Jit_AnyS16ToFloat(dec_->posoff);
+	STR(INDEX_UNSIGNED, src[0], dstReg, dec_->decFmt.posoff);
+	STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 4);
+	STR(INDEX_UNSIGNED, src[2], dstReg, dec_->decFmt.posoff + 8);
+}
+
 // Just copy 12 bytes.
 void VertexDecoderJitCache::Jit_PosFloat() {
 	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff);
@@ -256,3 +284,62 @@ void VertexDecoderJitCache::Jit_PosFloat() {
 	STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.posoff + 4);
 	STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.posoff + 8);
 }
+
+void VertexDecoderJitCache::Jit_PosS16Through() {
+	LDRSH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff);
+	LDRSH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 2);
+	LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 4);
+	fp.SCVTF(fpScratchReg, tempReg1);
+	fp.SCVTF(fpScratchReg2, tempReg2);
+	fp.SCVTF(fpScratchReg3, tempReg3);
+	STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff);
+	STR(INDEX_UNSIGNED, fpScratchReg2, dstReg, dec_->decFmt.posoff + 4);
+	STR(INDEX_UNSIGNED, fpScratchReg3, dstReg, dec_->decFmt.posoff + 8);
+}
+
+void VertexDecoderJitCache::Jit_NormalS8() {
+	LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff);
+	LDRB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 1);
+	LDRB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 2);
+	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 8));
+	ORR(tempReg1, tempReg1, tempReg3, ArithOption(tempReg3, ST_LSL, 16));
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff);
+}
+
+// Copy 6 bytes and then 2 zeroes.
+void VertexDecoderJitCache::Jit_NormalS16() {
+	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff);
+	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 2);
+	LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 4);
+	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16));
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff);
+	STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 4);
+}
+
+void VertexDecoderJitCache::Jit_NormalFloat() {
+	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff);
+	LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 4);
+	LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 8);
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff);
+	STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.nrmoff + 4);
+	STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 8);
+}
+
+void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
+	// TODO: NEONize. In that case we'll leave all three floats in one register instead, so callers must change too.
+	LDRSB(INDEX_UNSIGNED, tempReg1, srcReg, srcoff);
+	LDRSB(INDEX_UNSIGNED, tempReg2, srcReg, srcoff + 1);
+	LDRSB(INDEX_UNSIGNED, tempReg3, srcReg, srcoff + 2);
+	fp.SCVTF(src[0], tempReg1, 7);
+	fp.SCVTF(src[1], tempReg2, 7);
+	fp.SCVTF(src[2], tempReg3, 7);
+}
+
+void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
+	LDRSH(INDEX_UNSIGNED, tempReg1, srcReg, srcoff);
+	LDRSH(INDEX_UNSIGNED, tempReg2, srcReg, srcoff + 2);
+	LDRSH(INDEX_UNSIGNED, tempReg3, srcReg, srcoff + 4);
+	fp.SCVTF(src[0], tempReg1, 15);
+	fp.SCVTF(src[1], tempReg2, 15);
+	fp.SCVTF(src[2], tempReg3, 15);
+}
diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index 977ee420f6..7f107b23f9 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -134,7 +134,8 @@ void PrintDecodedVertex(VertexReader &vtx) {
 	printf("P: %f %f %f\n", pos[0], pos[1], pos[2]);
 }
 
-VertexDecoder::VertexDecoder() : jitted_(0), decoded_(nullptr), ptr_(nullptr) {
+VertexDecoder::VertexDecoder() : jitted_(0), decoded_(nullptr), ptr_(nullptr)
+{
 }
 
 void VertexDecoder::Step_WeightsU8() const
@@ -1093,7 +1094,11 @@ int VertexDecoder::ToString(char *output) const {
 	return output - start;
 }
 
-VertexDecoderJitCache::VertexDecoderJitCache() {
+VertexDecoderJitCache::VertexDecoderJitCache()
+#ifdef ARM64
+ : fp(this)
+#endif
+{
 	// 256k should be enough.
 	AllocCodeSpace(1024 * 64 * 4);
 
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 426c1ae418..d39d82da6e 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -683,4 +683,7 @@ private:
 	void Jit_AnyFloatMorph(int srcoff, int dstoff);
 
 	const VertexDecoder *dec_;
+#ifdef ARM64
+	Arm64Gen::ARM64FloatEmitter fp;
+#endif
 };