From 5496b3d3b14ed9c447bcd3e375be2151a4a643ad Mon Sep 17 00:00:00 2001
From: Henrik Rydgard <hrydgard@gmail.com>
Date: Thu, 19 Mar 2015 23:04:46 +0100
Subject: [PATCH] ARM64: Some minor vertex decoder work. Hm, I think SCVTF will
 actually divide by 128.0, not 127.0 :/

---
 GPU/Common/VertexDecoderArm64.cpp  | 91 +++++++++++++++++++++++++++++-
 GPU/Common/VertexDecoderCommon.cpp |  9 ++-
 GPU/Common/VertexDecoderCommon.h   |  3 +
 3 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index 438369b6bf..d6d285b188 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -53,7 +53,7 @@ static const ARM64Reg fpUVoffsetReg = D1;
 static const ARM64Reg neonScratchReg = D2;
 static const ARM64Reg neonScratchReg2 = D3;
 
-static const ARM64Reg neonScratchRegQ = Q1;  // Overlaps with all the scratch regs
+static const ARM64Reg neonScratchRegQ = Q1;
 
 // Everything above S6 is fair game for skinning
 
@@ -85,15 +85,18 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
 	{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
 	{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
-
+	*/
 	{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
+	/*
 	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
 	{&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble},
 
+	*/
 	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
 	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
 	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
 
+	/*
 	{&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
 	{&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
 	{&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},
@@ -105,9 +108,13 @@ static const JitLookup jitLookup[] = {
 	{&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},
 
 	{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
+	*/
 	{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
+	/*
 	{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},
 	*/
+	{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
+	{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
 	{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
 	/*
 	{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
@@ -240,6 +247,13 @@ void VertexDecoderJitCache::Jit_TcU16() {
 	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
 }
 
+void VertexDecoderJitCache::Jit_TcU16Through() {  // Emits code copying two 16-bit texcoord values as one packed word.
+	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);  // Halfword at srcReg + tcoff.
+	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2);  // Halfword at srcReg + tcoff + 2.
+	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16));  // tempReg1 |= tempReg2 << 16.
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);  // Write the packed pair at dstReg + decFmt.uvoff.
+}
+
 void VertexDecoderJitCache::Jit_TcFloat() {
 	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff);
 	LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 4);
@@ -247,6 +261,20 @@ void VertexDecoderJitCache::Jit_TcFloat() {
 	STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.uvoff + 4);
 }
 
+void VertexDecoderJitCache::Jit_PosS8() {  // Emits code converting a 3 x s8 position to three floats in the output vertex.
+	Jit_AnyS8ToFloat(dec_->posoff);  // Leaves the converted components in src[0..2].
+	fp.STR(32, INDEX_UNSIGNED, src[0], dstReg, dec_->decFmt.posoff);  // src[] are FP regs: the integer STR would store a GPR instead.
+	fp.STR(32, INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 4);
+	fp.STR(32, INDEX_UNSIGNED, src[2], dstReg, dec_->decFmt.posoff + 8);
+}
+
+void VertexDecoderJitCache::Jit_PosS16() {  // Emits code converting a 3 x s16 position to three floats in the output vertex.
+	Jit_AnyS16ToFloat(dec_->posoff);  // Leaves the converted components in src[0..2].
+	fp.STR(32, INDEX_UNSIGNED, src[0], dstReg, dec_->decFmt.posoff);  // src[] are FP regs: the integer STR would store a GPR instead.
+	fp.STR(32, INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 4);
+	fp.STR(32, INDEX_UNSIGNED, src[2], dstReg, dec_->decFmt.posoff + 8);
+}
+
 // Just copy 12 bytes.
 void VertexDecoderJitCache::Jit_PosFloat() {
 	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff);
@@ -256,3 +284,62 @@ void VertexDecoderJitCache::Jit_PosFloat() {
 	STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.posoff + 4);
 	STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.posoff + 8);
 }
+
+void VertexDecoderJitCache::Jit_PosS16Through() {  // Emits code converting a 16-bit through-mode position to three unscaled floats.
+	LDRSH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff);  // First component, sign-extended.
+	LDRSH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 2);  // Second component, sign-extended.
+	LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 4);  // Third component, loaded zero-extended (treated as unsigned).
+	fp.SCVTF(fpScratchReg, tempReg1);  // Integer -> float, no fixed-point scaling.
+	fp.SCVTF(fpScratchReg2, tempReg2);
+	fp.SCVTF(fpScratchReg3, tempReg3);
+	fp.STR(32, INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff);  // FP regs need the float emitter's store, not the integer STR.
+	fp.STR(32, INDEX_UNSIGNED, fpScratchReg2, dstReg, dec_->decFmt.posoff + 4);
+	fp.STR(32, INDEX_UNSIGNED, fpScratchReg3, dstReg, dec_->decFmt.posoff + 8);
+}
+
+void VertexDecoderJitCache::Jit_NormalS8() {  // Emits code copying a 3-byte normal as one packed word.
+	LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff);  // Byte 0 (zero-extended).
+	LDRB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 1);  // Byte 1.
+	LDRB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 2);  // Byte 2.
+	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 8));  // tempReg1 |= byte1 << 8.
+	ORR(tempReg1, tempReg1, tempReg3, ArithOption(tempReg3, ST_LSL, 16));  // tempReg1 |= byte2 << 16; bits above 23 stay zero.
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff);  // Write the packed value at dstReg + decFmt.nrmoff.
+}
+
+// Copy 6 bytes and then 2 zeroes.
+void VertexDecoderJitCache::Jit_NormalS16() {  // Emits code copying a 3 x 16-bit normal, padding the last slot with zero.
+	LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff);  // Halfword 0 (zero-extended).
+	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 2);  // Halfword 1.
+	LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 4);  // Halfword 2; upper bits are zero and become the padding.
+	ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16));  // tempReg1 |= halfword1 << 16.
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff);  // First two components.
+	STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 4);  // Third component followed by the zero padding.
+}
+
+void VertexDecoderJitCache::Jit_NormalFloat() {  // Emits code copying the normal as three words, unmodified.
+	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff);  // Load the words at nrmoff, nrmoff + 4 and nrmoff + 8...
+	LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 4);
+	LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 8);
+	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff);  // ...and store them at the matching decFmt.nrmoff offsets.
+	STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.nrmoff + 4);
+	STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 8);
+}
+
+void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {  // Emits code converting 3 signed bytes at srcReg + srcoff into floats in src[0..2].
+	// TODO: NEONize. In that case we'll leave all three floats in one register instead, so callers must change too.
+	LDRSB(INDEX_UNSIGNED, tempReg1, srcReg, srcoff);  // Sign-extending byte loads.
+	LDRSB(INDEX_UNSIGNED, tempReg2, srcReg, srcoff + 1);
+	LDRSB(INDEX_UNSIGNED, tempReg3, srcReg, srcoff + 2);
+	fp.SCVTF(src[0], tempReg1, 7);  // Presumably 7 fractional bits, i.e. value / 128.0 (see commit message) - confirm against the emitter.
+	fp.SCVTF(src[1], tempReg2, 7);
+	fp.SCVTF(src[2], tempReg3, 7);
+}
+
+void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {  // Emits code converting 3 signed halfwords at srcReg + srcoff into floats in src[0..2].
+	LDRSH(INDEX_UNSIGNED, tempReg1, srcReg, srcoff);  // Sign-extending halfword loads.
+	LDRSH(INDEX_UNSIGNED, tempReg2, srcReg, srcoff + 2);
+	LDRSH(INDEX_UNSIGNED, tempReg3, srcReg, srcoff + 4);
+	fp.SCVTF(src[0], tempReg1, 15);  // Presumably 15 fractional bits, i.e. value / 32768.0 - confirm against the emitter.
+	fp.SCVTF(src[1], tempReg2, 15);
+	fp.SCVTF(src[2], tempReg3, 15);
+}
diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
index 977ee420f6..7f107b23f9 100644
--- a/GPU/Common/VertexDecoderCommon.cpp
+++ b/GPU/Common/VertexDecoderCommon.cpp
@@ -134,7 +134,8 @@ void PrintDecodedVertex(VertexReader &vtx) {
 	printf("P: %f %f %f\n", pos[0], pos[1], pos[2]);
 }
 
-VertexDecoder::VertexDecoder() : jitted_(0), decoded_(nullptr), ptr_(nullptr) {
+VertexDecoder::VertexDecoder() : jitted_(0), decoded_(nullptr), ptr_(nullptr)
+{
 }
 
 void VertexDecoder::Step_WeightsU8() const
@@ -1093,7 +1094,11 @@ int VertexDecoder::ToString(char *output) const {
 	return output - start;
 }
 
-VertexDecoderJitCache::VertexDecoderJitCache() {
+VertexDecoderJitCache::VertexDecoderJitCache()
+#ifdef ARM64
+ : fp(this)
+#endif
+{
 	// 256k should be enough.
 	AllocCodeSpace(1024 * 64 * 4);
 
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 426c1ae418..d39d82da6e 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -683,4 +683,7 @@ private:
 	void Jit_AnyFloatMorph(int srcoff, int dstoff);
 
 	const VertexDecoder *dec_;
+#ifdef ARM64
+	Arm64Gen::ARM64FloatEmitter fp;
+#endif
 };