Merge branch 'CheckAlphaNEON' of git://github.com/KentuckyCompass/ppsspp into KentuckyCompass-CheckAlphaNEON

2024-11-25 01:00:01 +00:00 · 2015-05-31 12:00:10 +02:00 · 2015-05-31 12:00:10 +02:00 · 7b50ec7b75
commit 7b50ec7b75
parent bc96afa14b 880697f40a
4 changed files with 191 additions and 55 deletions
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@ -347,8 +347,10 @@ CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w,
 	const int w4 = w / 4;
 	const int stride4 = stride / 4;

+	// Have alpha values == 0 been seen?
 	__m128i hasZeroCursor = _mm_setzero_si128();
 	for (int y = 0; y < h; ++y) {
+		// Have alpha values > 0 and < 0xFF been seen?
 		__m128i hasAnyCursor = _mm_setzero_si128();

 		for (int i = 0; i < w4; ++i) {
@ -420,29 +422,28 @@ CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w,
 }

 CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
+	// Tests bit 0 of every 16-bit pixel, eight pixels per 128-bit load.
+	// Returns CHECKALPHA_ZERO right after the first row that leaves that bit
+	// clear in any lane; returns CHECKALPHA_FULL if every row passes.
-	const __m128i zero = _mm_setzero_si128();
+	// Bit 0 set in each 16-bit lane: the bit under test.
+	const __m128i mask = _mm_set1_epi16(1);

+	// w and stride are divided by 8 because each __m128i holds eight 16-bit pixels.
 	const __m128i *p = (const __m128i *)pixelData;
 	const int w8 = w / 8;
 	const int stride8 = stride / 8;

-	__m128i hasZeroCursor = _mm_setzero_si128();
+	// Running AND over everything loaded so far. It is never reset, so once a
+	// lane's bit is cleared it stays cleared.
+	__m128i bits = mask;
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w8; ++i) {
-			const __m128i a = _mm_slli_epi16(_mm_load_si128(&p[i]), 15);
-
-			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
-			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+			// _mm_load_si128 is the aligned load, so each row must be 16-byte aligned.
+			const __m128i a = _mm_load_si128(&p[i]);
+			bits = _mm_and_si128(bits, a);
 		}
+
+		// Checked once per row so the scan can stop early: any lane that no longer
+		// matches the mask saw a pixel with the tested bit clear.
+		__m128i result = _mm_xor_si128(bits, mask);
+		if (CombineSSEBitsToDWORD(result) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride8;
 	}

-	// Now let's sum up the bits.
-	if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }

 CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) {
@ -486,39 +487,42 @@ CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w,
 }

 CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) {
+	// Tests bit 15 of every 16-bit pixel, eight pixels per 128-bit load.
+	// Returns CHECKALPHA_ZERO right after the first row that leaves that bit
+	// clear in any lane; returns CHECKALPHA_FULL if every row passes.
-	const __m128i zero = _mm_setzero_si128();
+	// Bit 15 (0x8000) set in each 16-bit lane: the bit under test.
+	const __m128i mask = _mm_set1_epi16((short)0x8000);

+	// w and stride are divided by 8 because each __m128i holds eight 16-bit pixels.
 	const __m128i *p = (const __m128i *)pixelData;
 	const int w8 = w / 8;
 	const int stride8 = stride / 8;

-	__m128i hasZeroCursor = _mm_setzero_si128();
+	// Running AND over everything loaded so far. It is never reset, so once a
+	// lane's bit is cleared it stays cleared.
+	__m128i bits = mask;
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w8; ++i) {
-			const __m128i a = _mm_srli_epi16(_mm_load_si128(&p[i]), 15);
-
-			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
-			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+			// _mm_load_si128 is the aligned load, so each row must be 16-byte aligned.
+			const __m128i a = _mm_load_si128(&p[i]);
+			bits = _mm_and_si128(bits, a);
 		}
+
+		// Checked once per row so the scan can stop early: any lane that no longer
+		// matches the mask saw a pixel with the tested bit clear.
+		__m128i result = _mm_xor_si128(bits, mask);
+		if (CombineSSEBitsToDWORD(result) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride8;
 	}

-	// Now let's sum up the bits.
-	if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
 #endif

 CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 4 pixels (almost always the case.)
+	// Use SIMD if aligned to 16 bytes / 4 pixels (almost always the case.)
 	if ((w & 3) == 0 && (stride & 3) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
-	}
+#elif defined(ARM) || defined(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
+		}
 #endif
+	}

 	u32 hitZeroAlpha = 0;

@ -543,12 +547,16 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w
 }

 CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
 	if ((w & 7) == 0 && (stride & 7) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
-	}
+#elif defined(ARM) || defined(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
+		}
 #endif
+	}

 	u32 hitZeroAlpha = 0;

@ -576,12 +584,16 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w
 }

 CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
 	if ((w & 7) == 0 && (stride & 7) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
-	}
+#elif defined(ARM) || defined(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
+		}
 #endif
+	}

 	u32 hitZeroAlpha = 0;

@ -589,19 +601,20 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w
 	const int w2 = (w + 1) / 2;
 	const int stride2 = (stride + 1) / 2;

+	u32 bits = 0x00010001;
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w2; ++i) {
-			u32 a = p[i] & 0x00010001;
-			hitZeroAlpha |= a ^ 0x00010001;
+			bits &= p[i];
 		}
+
+		if ((bits ^ 0x00010001) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride2;
 	}

-	if (hitZeroAlpha) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }

 CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
@ -645,7 +658,7 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
 	}
 #endif

-	u32 hitZeroAlpha = 0;
+	u32 bits = 0x80008000;

 	const u32 *p = pixelData;
 	const int w2 = (w + 1) / 2;
@ -653,15 +666,15 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w

 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w2; ++i) {
-			u32 a = p[i] & 0x80008000;
-			hitZeroAlpha |= a ^ 0x80008000;
+			bits &= p[i];
 		}
+
+		if ((bits ^ 0x80008000) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride;
 	}

-	if (hitZeroAlpha) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
--- a/GPU/Common/TextureDecoder.h
+++ b/GPU/Common/TextureDecoder.h
@ -17,6 +17,13 @@

 #pragma once

+// Result of scanning a texture's alpha values with one of the CheckAlpha* functions.
+enum CheckAlphaResult {
+	// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
+	// Returned when the scan found neither a zero alpha nor a partial alpha.
+	CHECKALPHA_FULL = 0,
+	// Returned when an alpha strictly between zero and full was found.
+	CHECKALPHA_ANY = 4,
+	// Returned when a zero alpha was found (and no partial alpha, in the checks that look for one).
+	CHECKALPHA_ZERO = 8,
+};
+
 #include "Common/Common.h"
 #include "Core/MemMap.h"
 #include "GPU/ge_constants.h"
@ -75,13 +82,6 @@ extern ReliableHash64Func DoReliableHash64;
 typedef u32 ReliableHashType;
 #endif

-enum CheckAlphaResult {
-	// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
-	CHECKALPHA_FULL = 0,
-	CHECKALPHA_ANY = 4,
-	CHECKALPHA_ZERO = 8,
-};
-
 CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h);
 CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h);
 CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h);
--- a/GPU/Common/TextureDecoderNEON.cpp
+++ b/GPU/Common/TextureDecoderNEON.cpp
@ -243,3 +243,122 @@ u32 ReliableHash32NEON(const void *input, size_t len, u32 seed) {

 	return h32;
 }
+
+// Returns true if any bit in the 128-bit vector is set.
+// The vector is reinterpreted as two 64-bit lanes, which are ORed together and tested.
+static inline bool VectorIsNonZeroNEON(const uint32x4_t &v) {
+	u64 low = vgetq_lane_u64(vreinterpretq_u64_u32(v), 0);
+	u64 high = vgetq_lane_u64(vreinterpretq_u64_u32(v), 1);
+
+	return (low | high) != 0;
+}
+
+// Overload for eight 16-bit lanes: returns true if any bit in the 128-bit vector is set.
+// The vector is reinterpreted as two 64-bit lanes, which are ORed together and tested.
+static inline bool VectorIsNonZeroNEON(const uint16x8_t &v) {
+	u64 low = vgetq_lane_u64(vreinterpretq_u64_u16(v), 0);
+	u64 high = vgetq_lane_u64(vreinterpretq_u64_u16(v), 1);
+
+	return (low | high) != 0;
+}
+
+// Classifies the alpha (top 8 bits of each 32-bit pixel) of a w x h image whose rows
+// are stride pixels apart. Pixels are read four at a time, so w is expected to be a
+// multiple of 4; the dispatcher in TextureDecoder.cpp only calls this when
+// (w & 3) == 0 and (stride & 3) == 0.
+// Returns CHECKALPHA_ANY as soon as a row holds an alpha strictly between 0 and 0xFF,
+// otherwise CHECKALPHA_ZERO if any alpha was 0, otherwise CHECKALPHA_FULL.
+CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h) {
+	const uint32x4_t zero = vdupq_n_u32(0);
+	const uint32x4_t full = vdupq_n_u32(0xFF);
+
+	const u32 *p = (const u32 *)pixelData;
+
+	// Have alpha values == 0 been seen?  Accumulated over the whole image.
+	uint32x4_t foundAZero = zero;
+
+	for (int y = 0; y < h; ++y) {
+		// Have alpha values > 0 and < 0xFF been seen?  Reset for every row.
+		uint32x4_t foundFraction = zero;
+
+		for (int i = 0; i < w; i += 4) {
+			// Shift the top byte down so each lane holds just the alpha, 0..0xFF.
+			const uint32x4_t a = vshrq_n_u32(vld1q_u32(&p[i]), 24);
+
+			const uint32x4_t isZero = vceqq_u32(a, zero);
+			foundAZero = vorrq_u32(foundAZero, isZero);
+
+			// If a = FF, isNotFull will be 0 -> nothing is added to foundFraction.
+			// If a = 00, a & isNotFull will be 0 -> nothing is added to foundFraction.
+			// In any other case, foundFraction will have some bits set.
+			const uint32x4_t isNotFull = vcltq_u32(a, full);
+			foundFraction = vorrq_u32(foundFraction, vandq_u32(a, isNotFull));
+		}
+		p += stride;
+
+		// We check any early, in case we can skip the rest of the rows.
+		if (VectorIsNonZeroNEON(foundFraction)) {
+			return CHECKALPHA_ANY;
+		}
+	}
+
+	// No partial alpha anywhere: report whether a zero alpha was seen.
+	if (VectorIsNonZeroNEON(foundAZero)) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+// Classifies the alpha (top 4 bits of each 16-bit pixel) of a w x h image whose rows
+// are stride pixels apart. Pixels are read eight at a time, so w is expected to be a
+// multiple of 8; the dispatcher in TextureDecoder.cpp only calls this when
+// (w & 7) == 0 and (stride & 7) == 0.
+// Returns CHECKALPHA_ANY as soon as a row holds an alpha strictly between 0 and 0xF,
+// otherwise CHECKALPHA_ZERO if any alpha was 0, otherwise CHECKALPHA_FULL.
+CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h) {
+	const uint16x8_t zero = vdupq_n_u16(0);
+	const uint16x8_t full = vdupq_n_u16(0xF);
+
+	// Viewed as 16-bit pixels, so i and stride below count pixels.
+	const u16 *p = (const u16 *)pixelData;
+
+	// Have alpha values == 0 been seen?  Accumulated over the whole image.
+	uint16x8_t foundAZero = zero;
+
+	for (int y = 0; y < h; ++y) {
+		// Have alpha values > 0 and < 0xF been seen?  Reset for every row.
+		uint16x8_t foundFraction = zero;
+
+		for (int i = 0; i < w; i += 8) {
+			// Shift the top nibble down so each lane holds just the alpha, 0..0xF.
+			const uint16x8_t a = vshrq_n_u16(vld1q_u16(&p[i]), 12);
+
+			const uint16x8_t isZero = vceqq_u16(a, zero);
+			foundAZero = vorrq_u16(foundAZero, isZero);
+
+			// If a = F, isNotFull will be 0 -> nothing is added to foundFraction.
+			// If a = 0, a & isNotFull will be 0 -> nothing is added to foundFraction.
+			// In any other case, foundFraction will have some bits set.
+			const uint16x8_t isNotFull = vcltq_u16(a, full);
+			foundFraction = vorrq_u16(foundFraction, vandq_u16(a, isNotFull));
+		}
+		p += stride;
+
+		// We check any early, in case we can skip the rest of the rows.
+		if (VectorIsNonZeroNEON(foundFraction)) {
+			return CHECKALPHA_ANY;
+		}
+	}
+
+	// No partial alpha anywhere: report whether a zero alpha was seen.
+	if (VectorIsNonZeroNEON(foundAZero)) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+// Tests bit 0 of every 16-bit pixel of a w x h image whose rows are stride pixels apart.
+// Pixels are read eight at a time, so w is expected to be a multiple of 8; the
+// dispatcher in TextureDecoder.cpp only calls this when (w & 7) == 0 and (stride & 7) == 0.
+// Returns CHECKALPHA_ZERO right after the first row that leaves that bit clear in any
+// lane; returns CHECKALPHA_FULL if every row passes.
+CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h) {
+	// Viewed as 16-bit pixels, so i and stride below count pixels.
+	const u16 *p = (const u16 *)pixelData;
+
+	// mask has bit 0 set in each lane. bits starts equal to it and is the running AND
+	// of everything loaded; it is never reset, so a cleared bit stays cleared.
+	const uint16x8_t mask = vdupq_n_u16(1);
+	uint16x8_t bits = vdupq_n_u16(1);
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w; i += 8) {
+			const uint16x8_t a = vld1q_u16(&p[i]);
+
+			bits = vandq_u16(bits, a);
+		}
+
+		// Checked once per row so the scan can stop early: any lane that no longer
+		// matches the mask saw a pixel with bit 0 clear.
+		uint16x8_t result = veorq_u16(bits, mask);
+		if (VectorIsNonZeroNEON(result)) {
+			return CHECKALPHA_ZERO;
+		}
+
+		p += stride;
+	}
+
+	return CHECKALPHA_FULL;
+}
--- a/GPU/Common/TextureDecoderNEON.h
+++ b/GPU/Common/TextureDecoderNEON.h
@ -20,3 +20,7 @@
 u32 QuickTexHashNEON(const void *checkp, u32 size);
 void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
 u32 ReliableHash32NEON(const void *input, size_t len, u32 seed);
+
+// NEON alpha scans. Each reads a fixed number of pixels per step (4 for RGBA8888,
+// 8 for the 16-bit formats), so w should be a multiple of that count.
+// stride is the distance between rows, in pixels.
+CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h);