From 880697f40a22b846576bfc52e402fe89848599c0 Mon Sep 17 00:00:00 2001
From: KentuckyCompass
Date: Mon, 25 May 2015 18:04:52 -0700
Subject: [PATCH] Add NEON versions of the CheckAlpha family

---
 GPU/Common/TextureDecoder.cpp     | 109 +++++++++++++++------------
 GPU/Common/TextureDecoder.h       |  14 ++--
 GPU/Common/TextureDecoderNEON.cpp | 119 ++++++++++++++++++++++++++++++
 GPU/Common/TextureDecoderNEON.h   |   4 +
 4 files changed, 191 insertions(+), 55 deletions(-)

diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp
index fe0adf3f9..342d6952d 100644
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@@ -347,8 +347,10 @@ CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w,
 	const int w4 = w / 4;
 	const int stride4 = stride / 4;
 
+	// Have alpha values == 0 been seen?
 	__m128i hasZeroCursor = _mm_setzero_si128();
 	for (int y = 0; y < h; ++y) {
+		// Have alpha values > 0 and < 0xFF been seen?
 		__m128i hasAnyCursor = _mm_setzero_si128();
 
 		for (int i = 0; i < w4; ++i) {
@@ -420,29 +422,28 @@ CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w,
 }
 
 CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
-	const __m128i zero = _mm_setzero_si128();
+	const __m128i mask = _mm_set1_epi16(1);
 
 	const __m128i *p = (const __m128i *)pixelData;
 	const int w8 = w / 8;
 	const int stride8 = stride / 8;
 
-	__m128i hasZeroCursor = _mm_setzero_si128();
+	__m128i bits = mask;
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w8; ++i) {
-			const __m128i a = _mm_slli_epi16(_mm_load_si128(&p[i]), 15);
-
-			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
-			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+			const __m128i a = _mm_load_si128(&p[i]);
+			bits = _mm_and_si128(bits, a);
 		}
+
+		__m128i result = _mm_xor_si128(bits, mask);
+		if (CombineSSEBitsToDWORD(result) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride8;
 	}
 
-	// Now let's sum up the bits.
-	if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
 
 CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) {
@@ -486,39 +487,42 @@ CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w,
 }
 
 CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) {
-	const __m128i zero = _mm_setzero_si128();
+	const __m128i mask = _mm_set1_epi16((short)0x8000);
 
 	const __m128i *p = (const __m128i *)pixelData;
 	const int w8 = w / 8;
 	const int stride8 = stride / 8;
 
-	__m128i hasZeroCursor = _mm_setzero_si128();
+	__m128i bits = mask;
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w8; ++i) {
-			const __m128i a = _mm_srli_epi16(_mm_load_si128(&p[i]), 15);
-
-			const __m128i isZero = _mm_cmpeq_epi16(a, zero);
-			hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
+			const __m128i a = _mm_load_si128(&p[i]);
+			bits = _mm_and_si128(bits, a);
 		}
+
+		__m128i result = _mm_xor_si128(bits, mask);
+		if (CombineSSEBitsToDWORD(result) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride8;
 	}
 
-	// Now let's sum up the bits.
-	if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
 #endif
 
 CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 4 pixels (almost always the case.)
+	// Use SIMD if aligned to 16 bytes / 4 pixels (almost always the case.)
 	if ((w & 3) == 0 && (stride & 3) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
-	}
+#elif defined(ARM) || defined(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
+		}
 #endif
+	}
 
 	u32 hitZeroAlpha = 0;
 
@@ -543,12 +547,16 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w
 }
 
 CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
 	if ((w & 7) == 0 && (stride & 7) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
-	}
+#elif defined(ARM) || defined(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
+		}
 #endif
+	}
 
 	u32 hitZeroAlpha = 0;
 
@@ -576,12 +584,16 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w
 }
 
 CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) {
-#ifdef _M_SSE
-	// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
+	// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
 	if ((w & 7) == 0 && (stride & 7) == 0) {
+#ifdef _M_SSE
 		return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
-	}
+#elif defined(ARM) || defined(ARM64)
+		if (cpu_info.bNEON) {
+			return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
+		}
 #endif
+	}
 
 	u32 hitZeroAlpha = 0;
 
@@ -589,19 +601,20 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w
 	const int w2 = (w + 1) / 2;
 	const int stride2 = (stride + 1) / 2;
 
+	u32 bits = 0x00010001;
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w2; ++i) {
-			u32 a = p[i] & 0x00010001;
-			hitZeroAlpha |= a ^ 0x00010001;
+			bits &= p[i];
 		}
+
+		if ((bits ^ 0x00010001) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride2;
 	}
 
-	if (hitZeroAlpha) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
 
 CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
@@ -645,7 +658,7 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
 	}
 #endif
 
-	u32 hitZeroAlpha = 0;
+	u32 bits = 0x80008000;
 
 	const u32 *p = pixelData;
 	const int w2 = (w + 1) / 2;
@@ -653,15 +666,15 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
 
 	for (int y = 0; y < h; ++y) {
 		for (int i = 0; i < w2; ++i) {
-			u32 a = p[i] & 0x80008000;
-			hitZeroAlpha |= a ^ 0x80008000;
+			bits &= p[i];
 		}
+
+		if ((bits ^ 0x80008000) != 0) {
+			return CHECKALPHA_ZERO;
+		}
+
 		p += stride;
 	}
 
-	if (hitZeroAlpha) {
-		return CHECKALPHA_ZERO;
-	} else {
-		return CHECKALPHA_FULL;
-	}
+	return CHECKALPHA_FULL;
 }
diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h
index d6ff74e52..fd9b75025 100644
--- a/GPU/Common/TextureDecoder.h
+++ b/GPU/Common/TextureDecoder.h
@@ -17,6 +17,13 @@
 
 #pragma once
 
+enum CheckAlphaResult {
+	// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
+	CHECKALPHA_FULL = 0,
+	CHECKALPHA_ANY = 4,
+	CHECKALPHA_ZERO = 8,
+};
+
 #include "Common/Common.h"
 #include "Core/MemMap.h"
 #include "GPU/ge_constants.h"
@@ -75,13 +82,6 @@ extern ReliableHash64Func DoReliableHash64;
 typedef u32 ReliableHashType;
 #endif
 
-enum CheckAlphaResult {
-	// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
-	CHECKALPHA_FULL = 0,
-	CHECKALPHA_ANY = 4,
-	CHECKALPHA_ZERO = 8,
-};
-
 CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h);
 CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h);
 CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h);
diff --git a/GPU/Common/TextureDecoderNEON.cpp b/GPU/Common/TextureDecoderNEON.cpp
index c1fb50a7e..b801096d3 100644
--- a/GPU/Common/TextureDecoderNEON.cpp
+++ b/GPU/Common/TextureDecoderNEON.cpp
@@ -243,3 +243,122 @@ u32 ReliableHash32NEON(const void *input, size_t len, u32 seed) {
 
 	return h32;
 }
+
+static inline bool VectorIsNonZeroNEON(const uint32x4_t &v) {
+	u64 low = vgetq_lane_u64(vreinterpretq_u64_u32(v), 0);
+	u64 high = vgetq_lane_u64(vreinterpretq_u64_u32(v), 1);
+
+	return (low | high) != 0;
+}
+
+static inline bool VectorIsNonZeroNEON(const uint16x8_t &v) {
+	u64 low = vgetq_lane_u64(vreinterpretq_u64_u16(v), 0);
+	u64 high = vgetq_lane_u64(vreinterpretq_u64_u16(v), 1);
+
+	return (low | high) != 0;
+}
+
+CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h) {
+	const uint32x4_t zero = vdupq_n_u32(0);
+	const uint32x4_t full = vdupq_n_u32(0xFF);
+
+	const u32 *p = (const u32 *)pixelData;
+
+	// Have alpha values == 0 been seen?
+	uint32x4_t foundAZero = zero;
+
+	for (int y = 0; y < h; ++y) {
+		// Have alpha values > 0 and < 0xFF been seen?
+		uint32x4_t foundFraction = zero;
+
+		for (int i = 0; i < w; i += 4) {
+			const uint32x4_t a = vshrq_n_u32(vld1q_u32(&p[i]), 24);
+
+			const uint32x4_t isZero = vceqq_u32(a, zero);
+			foundAZero = vorrq_u32(foundAZero, isZero);
+
+			// If a = FF, isNotFull will be 0 -> foundFraction will be 0.
+			// If a = 00, a & isNotFull will be 0 -> foundFraction will be 0.
+			// In any other case, foundFraction will have some bits set.
+			const uint32x4_t isNotFull = vcltq_u32(a, full);
+			foundFraction = vorrq_u32(foundFraction, vandq_u32(a, isNotFull));
+		}
+		p += stride;
+
+		// We check for ANY early, in case we can skip the rest of the rows.
+		if (VectorIsNonZeroNEON(foundFraction)) {
+			return CHECKALPHA_ANY;
+		}
+	}
+
+	// Now let's sum up the bits.
+	if (VectorIsNonZeroNEON(foundAZero)) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h) {
+	const uint16x8_t zero = vdupq_n_u16(0);
+	const uint16x8_t full = vdupq_n_u16(0xF);
+
+	const u16 *p = (const u16 *)pixelData;
+
+	// Have alpha values == 0 been seen?
+	uint16x8_t foundAZero = zero;
+
+	for (int y = 0; y < h; ++y) {
+		// Have alpha values > 0 and < 0xF been seen?
+		uint16x8_t foundFraction = zero;
+
+		for (int i = 0; i < w; i += 8) {
+			const uint16x8_t a = vshrq_n_u16(vld1q_u16(&p[i]), 12);
+
+			const uint16x8_t isZero = vceqq_u16(a, zero);
+			foundAZero = vorrq_u16(foundAZero, isZero);
+
+			// If a = F, isNotFull will be 0 -> foundFraction will be 0.
+			// If a = 0, a & isNotFull will be 0 -> foundFraction will be 0.
+			// In any other case, foundFraction will have some bits set.
+			const uint16x8_t isNotFull = vcltq_u16(a, full);
+			foundFraction = vorrq_u16(foundFraction, vandq_u16(a, isNotFull));
+		}
+		p += stride;
+
+		// We check for ANY early, in case we can skip the rest of the rows.
+		if (VectorIsNonZeroNEON(foundFraction)) {
+			return CHECKALPHA_ANY;
+		}
+	}
+
+	// Now let's sum up the bits.
+	if (VectorIsNonZeroNEON(foundAZero)) {
+		return CHECKALPHA_ZERO;
+	} else {
+		return CHECKALPHA_FULL;
+	}
+}
+
+CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h) {
+	const u16 *p = (const u16 *)pixelData;
+
+	const uint16x8_t mask = vdupq_n_u16(1);
+	uint16x8_t bits = vdupq_n_u16(1);
+	for (int y = 0; y < h; ++y) {
+		for (int i = 0; i < w; i += 8) {
+			const uint16x8_t a = vld1q_u16(&p[i]);
+
+			bits = vandq_u16(bits, a);
+		}
+
+		uint16x8_t result = veorq_u16(bits, mask);
+		if (VectorIsNonZeroNEON(result)) {
+			return CHECKALPHA_ZERO;
+		}
+
+		p += stride;
+	}
+
+	return CHECKALPHA_FULL;
+}
diff --git a/GPU/Common/TextureDecoderNEON.h b/GPU/Common/TextureDecoderNEON.h
index 1003dc54f..cba7df5cd 100644
--- a/GPU/Common/TextureDecoderNEON.h
+++ b/GPU/Common/TextureDecoderNEON.h
@@ -20,3 +20,7 @@ u32 QuickTexHashNEON(const void *checkp, u32 size);
 void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
 
 u32 ReliableHash32NEON(const void *input, size_t len, u32 seed);
+
+CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h);
+CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h);
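
The 1555/5551 paths in this patch all lean on one observation: with 1-bit alpha there is no "partial transparency" case, so instead of OR-ing together per-pixel mismatches (the old hasZeroCursor / hitZeroAlpha approach), each loop ANDs every pixel into an accumulator seeded with the alpha mask and tests the surviving bit once per row, which permits an early exit on the first row containing a transparent pixel. A minimal scalar sketch of that pattern, for reference only; this helper and its names (AllAlphaSet5551Sketch, kAlphaMask) are illustrative, not part of the patch or the codebase:

#include <cstdint>

// Mirrors the RGBA5551 checks in the patch: alpha is the top bit of each
// 16-bit pixel. Returns true only if every pixel has its alpha bit set
// (CHECKALPHA_FULL), false as soon as any pixel clears it (CHECKALPHA_ZERO).
static bool AllAlphaSet5551Sketch(const uint16_t *p, int stride, int w, int h) {
	const uint16_t kAlphaMask = 0x8000;
	// Seeded with only the alpha bit; AND can clear it but never set another.
	uint16_t bits = kAlphaMask;
	for (int y = 0; y < h; ++y) {
		for (int i = 0; i < w; ++i) {
			bits &= p[i];
		}
		// Once any pixel has cleared the alpha bit, no later AND can restore
		// it, so the answer is final and the remaining rows can be skipped.
		if ((bits ^ kAlphaMask) != 0) {
			return false;
		}
		p += stride;  // stride is in pixels, matching the functions above.
	}
	return true;
}

The SSE2 and NEON versions are this same loop widened to eight pixels per iteration; CombineSSEBitsToDWORD and VectorIsNonZeroNEON supply the per-row "did any lane lose the bit?" test.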