From b3fdfc01c844f5b08ae496a509fa79e90f668388 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?=
Date: Wed, 6 Nov 2013 12:17:41 +0100
Subject: [PATCH] ARM vtx dec: Avoid all unaligned accesses entirely.

Seeing so much contradictory information on the support and performance
of these.
---
 GPU/GLES/VertexDecoder.cpp | 59 +++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp
index c32edb05a..96b410142 100644
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@@ -936,7 +936,9 @@ void VertexDecoderJitCache::Jit_TcU8() {
 }
 
 void VertexDecoderJitCache::Jit_TcU16() {
-	LDR(tempReg1, srcReg, dec_->tcoff);
+	LDRH(tempReg1, srcReg, dec_->tcoff);
+	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
 }
 
@@ -948,7 +950,9 @@ void VertexDecoderJitCache::Jit_TcFloat() {
 }
 
 void VertexDecoderJitCache::Jit_TcU16Through() {
-	LDR(tempReg1, srcReg, dec_->tcoff);  // possibly unaligned access
+	LDRH(tempReg1, srcReg, dec_->tcoff);
+	LDRH(tempReg2, srcReg, dec_->tcoff + 2);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.uvoff);
 }
 
@@ -1033,23 +1037,33 @@ void VertexDecoderJitCache::Jit_Color5551() {
 	STR(tempReg2, dstReg, dec_->decFmt.c0off);
 }
 
-// Copy 3 bytes and then a zero. Might as well copy four.
 void VertexDecoderJitCache::Jit_NormalS8() {
-	LDR(tempReg1, srcReg, dec_->nrmoff);
-	ANDI2R(tempReg1, tempReg1, 0x00FFFFFF, scratchReg);
+	LDRB(tempReg1, srcReg, dec_->nrmoff);
+	LDRB(tempReg2, srcReg, dec_->nrmoff + 1);
+	LDRB(tempReg3, srcReg, dec_->nrmoff + 2);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
+	ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
+
+	// Copy 3 bytes and then a zero. Might as well copy four.
+	// LDR(tempReg1, srcReg, dec_->nrmoff);
+	// ANDI2R(tempReg1, tempReg1, 0x00FFFFFF, scratchReg);
+	// STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
 }
 
 // Copy 6 bytes and then 2 zeroes.
 void VertexDecoderJitCache::Jit_NormalS16() {
-	LDR(tempReg1, srcReg, dec_->nrmoff);
-	LDRH(tempReg2, srcReg, dec_->nrmoff + 4);
+	LDRH(tempReg1, srcReg, dec_->nrmoff);
+	LDRH(tempReg2, srcReg, dec_->nrmoff + 2);
+	LDRH(tempReg3, srcReg, dec_->nrmoff + 4);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.nrmoff);
-	STR(tempReg2, dstReg, dec_->decFmt.nrmoff + 4);
+	STR(tempReg3, dstReg, dec_->decFmt.nrmoff + 4);
 }
 
 void VertexDecoderJitCache::Jit_NormalFloat() {
 	// Might not be aligned to 4, so we can't use LDMIA.
+	// Actually - not true: This will always be aligned. TODO
 	LDR(tempReg1, srcReg, dec_->nrmoff);
 	LDR(tempReg2, srcReg, dec_->nrmoff + 4);
 	LDR(tempReg3, srcReg, dec_->nrmoff + 8);
@@ -1061,9 +1075,12 @@ void VertexDecoderJitCache::Jit_NormalFloat() {
 // Through expands into floats, always. Might want to look at changing this.
 void VertexDecoderJitCache::Jit_PosS8Through() {
 	// TODO: SIMD
+	LDRSB(tempReg1, srcReg, dec_->posoff);
+	LDRSB(tempReg2, srcReg, dec_->posoff + 1);
+	LDRSB(tempReg3, srcReg, dec_->posoff + 2);
+	static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
 	for (int i = 0; i < 3; i++) {
-		LDRSB(tempReg1, srcReg, dec_->posoff + i);
-		VMOV(S0, tempReg1);
+		VMOV(S0, tr[i]);
 		VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
 		VSTR(S0, dstReg, dec_->decFmt.posoff + i * 4);
 	}
@@ -1071,10 +1088,13 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
 
 // Through expands into floats, always. Might want to look at changing this.
 void VertexDecoderJitCache::Jit_PosS16Through() {
+	LDRSH(tempReg1, srcReg, dec_->posoff);
+	LDRSH(tempReg2, srcReg, dec_->posoff + 2);
+	LDRSH(tempReg3, srcReg, dec_->posoff + 4);
+	static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
 	// TODO: SIMD
 	for (int i = 0; i < 3; i++) {
-		LDRSH(tempReg1, srcReg, dec_->posoff + i * 2);
-		VMOV(S0, tempReg1);
+		VMOV(S0, tr[i]);
 		VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
 		VSTR(S0, dstReg, dec_->decFmt.posoff + i * 4);
 	}
@@ -1082,17 +1102,22 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
 
 // Copy 3 bytes and then a zero. Might as well copy four.
 void VertexDecoderJitCache::Jit_PosS8() {
-	LDR(tempReg1, srcReg, dec_->posoff);
-	ANDI2R(tempReg1, tempReg1, 0x00FFFFFF, scratchReg);
+	LDRB(tempReg1, srcReg, dec_->posoff);
+	LDRB(tempReg2, srcReg, dec_->posoff + 1);
+	LDRB(tempReg3, srcReg, dec_->posoff + 2);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
+	ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.posoff);
 }
 
 // Copy 6 bytes and then 2 zeroes.
 void VertexDecoderJitCache::Jit_PosS16() {
-	LDR(tempReg1, srcReg, dec_->posoff);
-	LDRH(tempReg2, srcReg, dec_->posoff + 4);
+	LDRH(tempReg1, srcReg, dec_->posoff);
+	LDRH(tempReg2, srcReg, dec_->posoff + 2);
+	LDRH(tempReg3, srcReg, dec_->posoff + 4);
+	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
 	STR(tempReg1, dstReg, dec_->decFmt.posoff);
-	STR(tempReg2, dstReg, dec_->decFmt.posoff + 4);
+	STR(tempReg3, dstReg, dec_->decFmt.posoff + 4);
 }
 
 // Just copy 12 bytes.
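
Note on the technique, for readers who don't speak the JIT emitter fluently:
each potentially unaligned 32-bit LDR above is replaced by narrow loads
(LDRB/LDRH) merged together with shifted ORRs, since byte loads never need
alignment and halfword loads only need 2-byte alignment. A minimal scalar
C++ sketch of the same idea (illustrative only, not code from the patch;
helper names are hypothetical):

	#include <cstdint>
	#include <cstring>

	// Mirrors the LDRB + ORR-with-LSL sequences in Jit_NormalS8/Jit_PosS8:
	// assemble a little-endian word from individual bytes, leaving the top
	// byte zero ("copy 3 bytes and then a zero"). No alignment required.
	static inline uint32_t Load24LE(const uint8_t *p) {
		return (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16);
	}

	// Mirrors the LDRH + ORR pairs in Jit_TcU16/Jit_NormalS16/Jit_PosS16:
	// two halfword loads merged into one word. Assumes 2-byte alignment,
	// which the 16-bit vertex formats provide.
	static inline uint32_t Load32From16LE(const uint8_t *p) {
		uint16_t lo, hi;
		memcpy(&lo, p, 2);      // compilers lower these to halfword loads
		memcpy(&hi, p + 2, 2);
		return (uint32_t)lo | ((uint32_t)hi << 16);
	}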
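The through-mode position paths use the same narrow loads but sign-extend
(LDRSB/LDRSH) and convert each component to float in VFP (VMOV + VCVT +
VSTR). In scalar C++ terms, what the emitted code computes is roughly the
following (hypothetical reference helpers, assuming src points at
vertex + posoff and dst at decoded + decFmt.posoff):

	#include <cstdint>
	#include <cstring>

	// What Jit_PosS8Through emits: three sign-extended byte loads, each
	// converted to float and stored.
	static void PosS8ThroughRef(const uint8_t *src, float *dst) {
		for (int i = 0; i < 3; i++)
			dst[i] = (float)(int8_t)src[i];
	}

	// What Jit_PosS16Through emits: sign-extended halfword loads at
	// offsets 0/2/4, each converted to float and stored.
	static void PosS16ThroughRef(const uint8_t *src, float *dst) {
		for (int i = 0; i < 3; i++) {
			int16_t v;
			memcpy(&v, src + i * 2, sizeof(v));  // alignment-safe load
			dst[i] = (float)v;
		}
	}

Note that the for loops in the JIT functions run at compile (JIT) time:
each emits three unrolled VMOV/VCVT/VSTR groups, one per component, reading
from the registers listed in the tr[] table.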