From 880697f40a22b846576bfc52e402fe89848599c0 Mon Sep 17 00:00:00 2001
From: KentuckyCompass
Date: Mon, 25 May 2015 18:04:52 -0700
Subject: [PATCH] Add NEON versions of the CheckAlpha family

---
 GPU/Common/TextureDecoder.cpp     | 109 +++++++++++++++------------
 GPU/Common/TextureDecoder.h       |  14 ++--
 GPU/Common/TextureDecoderNEON.cpp | 119 ++++++++++++++++++++++++++++++
 GPU/Common/TextureDecoderNEON.h   |   4 +
 4 files changed, 191 insertions(+), 55 deletions(-)

diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp
index fe0adf3f9..342d6952d 100644
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@@ -347,8 +347,10 @@ CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w,
 	const int w4 = w / 4;
 	const int stride4 = stride / 4;
 
+	// Have alpha values == 0 been seen?
 	__m128i hasZeroCursor = _mm_setzero_si128();
 	for (int y = 0; y < h; ++y) {
+		// Have alpha values > 0 and < 0xFF been seen?
 		__m128i hasAnyCursor = _mm_setzero_si128();
 
 		for (int i = 0; i < w4; ++i) {
@@ -420,29 +422,28 @@ CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w,
 }
 
 CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
-	const __m128i zero = _mm_setzero_si128();
+	const __m128i mask = _mm_set1_epi16(1);
 
 	const __m128i *p = (const __m128i *)pixelData;
 	const int w8 = w / 8;
 	const int stride8 = stride / 8;
 
-	__m128i hasZeroCursor = _mm_setzero_si128();
+	__m128i bits = mask;
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w8; ++i) {
-			const __m128i a = _mm_slli_epi16(_mm_load_si128(&p[i]), 15);
-
-			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
-			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+			const __m128i a = _mm_load_si128(&p[i]);
+			bits = _mm_and_si128(bits, a);
 		}
+
+		__m128i result = _mm_xor_si128(bits, mask);
+		if (CombineSSEBitsToDWORD(result) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride8;
 	}
 
-	// Now let's sum up the bits.
-	if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
 
 CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) {
@@ -486,39 +487,42 @@ CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w,
 }
 
 CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) {
-	const __m128i zero = _mm_setzero_si128();
+	const __m128i mask = _mm_set1_epi16((short)0x8000);
 
 	const __m128i *p = (const __m128i *)pixelData;
 	const int w8 = w / 8;
 	const int stride8 = stride / 8;
 
-	__m128i hasZeroCursor = _mm_setzero_si128();
+	__m128i bits = mask;
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w8; ++i) {
-			const __m128i a = _mm_srli_epi16(_mm_load_si128(&p[i]), 15);
-
-			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
-			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+			const __m128i a = _mm_load_si128(&p[i]);
+			bits = _mm_and_si128(bits, a);
 		}
+
+		__m128i result = _mm_xor_si128(bits, mask);
+		if (CombineSSEBitsToDWORD(result) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride8;
 	}
 
-	// Now let's sum up the bits.
-	if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
 #endif
 
 CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 4 pixels (almost always the case.)
+	// Use SIMD if aligned to 16 bytes / 4 pixels (almost always the case.)
 	if ((w & 3) == 0 && (stride & 3) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
-	}
+#elif defined(ARM) || defined(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
+		}
 #endif
+	}
 
 	u32 hitZeroAlpha = 0;
 
@@ -543,12 +547,16 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w
 }
 
 CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
 	if ((w & 7) == 0 && (stride & 7) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
-	}
+#elif defined(ARM) || defined(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
+		}
 #endif
+	}
 
 	u32 hitZeroAlpha = 0;
 
@@ -576,12 +584,16 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w
 }
 
 CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
 	if ((w & 7) == 0 && (stride & 7) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
-	}
+#elif defined(ARM) || defined(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
+		}
 #endif
+	}
 
 	u32 hitZeroAlpha = 0;
 
@@ -589,19 +601,20 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w
 	const int w2 = (w + 1) / 2;
 	const int stride2 = (stride + 1) / 2;
 
+	u32 bits = 0x00010001;
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w2; ++i) {
-			u32 a = p[i] & 0x00010001;
-			hitZeroAlpha |= a ^ 0x00010001;
+			bits &= p[i];
 		}
+
+		if ((bits ^ 0x00010001) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride2;
 	}
 
-	if (hitZeroAlpha) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
 
 CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
@@ -645,7 +658,7 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
 	}
 #endif
 
-	u32 hitZeroAlpha = 0;
+	u32 bits = 0x80008000;
 
 	const u32 *p = pixelData;
 	const int w2 = (w + 1) / 2;
@@ -653,15 +666,15 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
 
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w2; ++i) {
-			u32 a = p[i] & 0x80008000;
-			hitZeroAlpha |= a ^ 0x80008000;
+			bits &= p[i];
 		}
+
+		if ((bits ^ 0x80008000) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride;
 	}
 
-	if (hitZeroAlpha) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h
index d6ff74e52..fd9b75025 100644
--- a/GPU/Common/TextureDecoder.h
+++ b/GPU/Common/TextureDecoder.h
@@ -17,6 +17,13 @@
 
 #pragma once
 
+enum CheckAlphaResult {
+	// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
+	CHECKALPHA_FULL = 0,
+	CHECKALPHA_ANY = 4,
+	CHECKALPHA_ZERO = 8,
+};
+
 #include "Common/Common.h"
 #include "Core/MemMap.h"
 #include "GPU/ge_constants.h"
@@ -75,13 +82,6 @@ extern ReliableHash64Func DoReliableHash64;
 typedef u32 ReliableHashType;
 #endif
 
-enum CheckAlphaResult {
-	// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
-	CHECKALPHA_FULL = 0,
-	CHECKALPHA_ANY = 4,
-	CHECKALPHA_ZERO = 8,
-};
-
 CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h);
 CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h);
 CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h);
diff --git a/GPU/Common/TextureDecoderNEON.cpp b/GPU/Common/TextureDecoderNEON.cpp
index c1fb50a7e..b801096d3 100644
--- a/GPU/Common/TextureDecoderNEON.cpp
+++ b/GPU/Common/TextureDecoderNEON.cpp
@@ -243,3 +243,122 @@ u32 ReliableHash32NEON(const void *input, size_t len, u32 seed) {
 
 	return h32;
 }
+
+static inline bool VectorIsNonZeroNEON(const uint32x4_t &v) {
+	u64 low = vgetq_lane_u64(vreinterpretq_u64_u32(v), 0);
+	u64 high = vgetq_lane_u64(vreinterpretq_u64_u32(v), 1);
+
+	return (low | high) != 0;
+}
+
+static inline bool VectorIsNonZeroNEON(const uint16x8_t &v) {
+	u64 low = vgetq_lane_u64(vreinterpretq_u64_u16(v), 0);
+	u64 high = vgetq_lane_u64(vreinterpretq_u64_u16(v), 1);
+
+	return (low | high) != 0;
+}
+
+CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h) {
+	const uint32x4_t zero = vdupq_n_u32(0);
+	const uint32x4_t full = vdupq_n_u32(0xFF);
+
+	const u32 *p = (const u32 *)pixelData;
+
+	// Have alpha values == 0 been seen?
+	uint32x4_t foundAZero = zero;
+
+	for (int y = 0; y < h; ++y) {
+		// Have alpha values > 0 and < 0xFF been seen?
+		uint32x4_t foundFraction = zero;
+
+		for (int i = 0; i < w; i += 4) {
+			const uint32x4_t a = vshrq_n_u32(vld1q_u32(&p[i]), 24);
+
+			const uint32x4_t isZero = vceqq_u32(a, zero);
+			foundAZero = vorrq_u32(foundAZero, isZero);
+
+			// If a = FF, isNotFull will be 0 -> foundFraction will be 0.
+			// If a = 00, a & isNotFull will be 0 -> foundFraction will be 0.
+			// In any other case, foundFraction will have some bits set.
+			const uint32x4_t isNotFull = vcltq_u32(a, full);
+			foundFraction = vorrq_u32(foundFraction, vandq_u32(a, isNotFull));
+		}
+		p += stride;
+
+		// We check for ANY early, in case we can skip the rest of the rows.
+		if (VectorIsNonZeroNEON(foundFraction)) {
+			return CHECKALPHA_ANY;
+		}
+	}
+
+	// Now let's sum up the bits.
+	if (VectorIsNonZeroNEON(foundAZero)) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h) {
+	const uint16x8_t zero = vdupq_n_u16(0);
+	const uint16x8_t full = vdupq_n_u16(0xF);
+
+	const u16 *p = (const u16 *)pixelData;
+
+	// Have alpha values == 0 been seen?
+	uint16x8_t foundAZero = zero;
+
+	for (int y = 0; y < h; ++y) {
+		// Have alpha values > 0 and < 0xF been seen?
+		uint16x8_t foundFraction = zero;
+
+		for (int i = 0; i < w; i += 8) {
+			const uint16x8_t a = vshrq_n_u16(vld1q_u16(&p[i]), 12);
+
+			const uint16x8_t isZero = vceqq_u16(a, zero);
+			foundAZero = vorrq_u16(foundAZero, isZero);
+
+			// If a = F, isNotFull will be 0 -> foundFraction will be 0.
+			// If a = 0, a & isNotFull will be 0 -> foundFraction will be 0.
+			// In any other case, foundFraction will have some bits set.
+			const uint16x8_t isNotFull = vcltq_u16(a, full);
+			foundFraction = vorrq_u16(foundFraction, vandq_u16(a, isNotFull));
+		}
+		p += stride;
+
+		// We check for ANY early, in case we can skip the rest of the rows.
+		if (VectorIsNonZeroNEON(foundFraction)) {
+			return CHECKALPHA_ANY;
+		}
+	}
+
+	// Now let's sum up the bits.
+	if (VectorIsNonZeroNEON(foundAZero)) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h) {
+	const u16 *p = (const u16 *)pixelData;
+
+	const uint16x8_t mask = vdupq_n_u16(1);
+	uint16x8_t bits = vdupq_n_u16(1);
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w; i += 8) {
+			const uint16x8_t a = vld1q_u16(&p[i]);
+
+			bits = vandq_u16(bits, a);
+		}
+
+		uint16x8_t result = veorq_u16(bits, mask);
+		if (VectorIsNonZeroNEON(result)) {
+			return CHECKALPHA_ZERO;
+		}
+
+		p += stride;
+	}
+
+	return CHECKALPHA_FULL;
+}
diff --git a/GPU/Common/TextureDecoderNEON.h b/GPU/Common/TextureDecoderNEON.h
index 1003dc54f..cba7df5cd 100644
--- a/GPU/Common/TextureDecoderNEON.h
+++ b/GPU/Common/TextureDecoderNEON.h
@@ -20,3 +20,7 @@ u32 QuickTexHashNEON(const void *checkp, u32 size);
 void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
 
 u32 ReliableHash32NEON(const void *input, size_t len, u32 seed);
+
+CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h);
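
The 1555/5551 paths in this patch all lean on one observation: with 1-bit alpha there is no "partial transparency" case, so instead of OR-ing together per-pixel mismatches (the old hasZeroCursor / hitZeroAlpha approach), each loop ANDs every pixel into an accumulator seeded with the alpha mask and tests the surviving bit once per row, which permits an early exit on the first row containing a transparent pixel. A minimal scalar sketch of that pattern, for reference only; this helper and its names (AllAlphaSet5551Sketch, kAlphaMask) are illustrative, not part of the patch or the codebase:

#include <cstdint>

// Mirrors the RGBA5551 checks in the patch: alpha is the top bit of each
// 16-bit pixel. Returns true only if every pixel has its alpha bit set
// (CHECKALPHA_FULL), false as soon as any pixel clears it (CHECKALPHA_ZERO).
static bool AllAlphaSet5551Sketch(const uint16_t *p, int stride, int w, int h) {
	const uint16_t kAlphaMask = 0x8000;
	// Seeded with only the alpha bit; AND can clear it but never set another.
	uint16_t bits = kAlphaMask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w; ++i) {
			bits &= p[i];
		}
		// Once any pixel has cleared the alpha bit, no later AND can restore
		// it, so the answer is final and the remaining rows can be skipped.
		if ((bits ^ kAlphaMask) != 0) {
			return false;
		}
		p += stride;  // stride is in pixels, matching the functions above.
	}
	return true;
}

The SSE2 and NEON versions are this same loop widened to eight pixels per iteration; CombineSSEBitsToDWORD and VectorIsNonZeroNEON supply the per-row "did any lane lose the bit?" test.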