Merge branch 'CheckAlphaNEON' of git://github.com/KentuckyCompass/ppsspp into KentuckyCompass-CheckAlphaNEON

This commit is contained in:
Henrik Rydgård 2015-05-31 12:00:10 +02:00
commit 7b50ec7b75
4 changed files with 191 additions and 55 deletions

View File

@ -347,8 +347,10 @@ CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w,
const int w4 = w / 4;
const int stride4 = stride / 4;
// Have alpha values == 0 been seen?
__m128i hasZeroCursor = _mm_setzero_si128();
for (int y = 0; y < h; ++y) {
// Have alpha values > 0 and < 0xFF been seen?
__m128i hasAnyCursor = _mm_setzero_si128();
for (int i = 0; i < w4; ++i) {
@ -420,29 +422,28 @@ CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w,
}
CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
const __m128i zero = _mm_setzero_si128();
const __m128i mask = _mm_set1_epi16(1);
const __m128i *p = (const __m128i *)pixelData;
const int w8 = w / 8;
const int stride8 = stride / 8;
__m128i hasZeroCursor = _mm_setzero_si128();
__m128i bits = mask;
for (int y = 0; y < h; ++y) {
for (int i = 0; i < w8; ++i) {
const __m128i a = _mm_slli_epi16(_mm_load_si128(&p[i]), 15);
const __m128i isZero = _mm_cmpeq_epi16(a, zero);
hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
const __m128i a = _mm_load_si128(&p[i]);
bits = _mm_and_si128(bits, a);
}
__m128i result = _mm_xor_si128(bits, mask);
if (CombineSSEBitsToDWORD(result) != 0) {
return CHECKALPHA_ZERO;
}
p += stride8;
}
// Now let's sum up the bits.
if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
return CHECKALPHA_ZERO;
} else {
return CHECKALPHA_FULL;
}
return CHECKALPHA_FULL;
}
CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) {
@ -486,39 +487,42 @@ CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w,
}
CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) {
const __m128i zero = _mm_setzero_si128();
const __m128i mask = _mm_set1_epi16((short)0x8000);
const __m128i *p = (const __m128i *)pixelData;
const int w8 = w / 8;
const int stride8 = stride / 8;
__m128i hasZeroCursor = _mm_setzero_si128();
__m128i bits = mask;
for (int y = 0; y < h; ++y) {
for (int i = 0; i < w8; ++i) {
const __m128i a = _mm_srli_epi16(_mm_load_si128(&p[i]), 15);
const __m128i isZero = _mm_cmpeq_epi16(a, zero);
hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
const __m128i a = _mm_load_si128(&p[i]);
bits = _mm_and_si128(bits, a);
}
__m128i result = _mm_xor_si128(bits, mask);
if (CombineSSEBitsToDWORD(result) != 0) {
return CHECKALPHA_ZERO;
}
p += stride8;
}
// Now let's sum up the bits.
if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
return CHECKALPHA_ZERO;
} else {
return CHECKALPHA_FULL;
}
return CHECKALPHA_FULL;
}
#endif
CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) {
#ifdef _M_SSE
// Use SSE if aligned to 16 bytes / 4 pixels (almost always the case.)
// Use SIMD if aligned to 16 bytes / 4 pixels (almost always the case.)
if ((w & 3) == 0 && (stride & 3) == 0) {
#ifdef _M_SSE
return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
}
#elif defined(ARM) || defined(ARM64)
if (cpu_info.bNEON) {
return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
}
#endif
}
u32 hitZeroAlpha = 0;
@ -543,12 +547,16 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w
}
CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) {
#ifdef _M_SSE
// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
if ((w & 7) == 0 && (stride & 7) == 0) {
#ifdef _M_SSE
return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
}
#elif defined(ARM) || defined(ARM64)
if (cpu_info.bNEON) {
return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
}
#endif
}
u32 hitZeroAlpha = 0;
@ -576,12 +584,16 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w
}
CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) {
#ifdef _M_SSE
// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
if ((w & 7) == 0 && (stride & 7) == 0) {
#ifdef _M_SSE
return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
}
#elif defined(ARM) || defined(ARM64)
if (cpu_info.bNEON) {
return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
}
#endif
}
u32 hitZeroAlpha = 0;
@ -589,19 +601,20 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w
const int w2 = (w + 1) / 2;
const int stride2 = (stride + 1) / 2;
u32 bits = 0x00010001;
for (int y = 0; y < h; ++y) {
for (int i = 0; i < w2; ++i) {
u32 a = p[i] & 0x00010001;
hitZeroAlpha |= a ^ 0x00010001;
bits &= p[i];
}
if ((bits ^ 0x00010001) != 0) {
return CHECKALPHA_ZERO;
}
p += stride2;
}
if (hitZeroAlpha) {
return CHECKALPHA_ZERO;
} else {
return CHECKALPHA_FULL;
}
return CHECKALPHA_FULL;
}
CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
@ -645,7 +658,7 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
}
#endif
u32 hitZeroAlpha = 0;
u32 bits = 0x80008000;
const u32 *p = pixelData;
const int w2 = (w + 1) / 2;
@ -653,15 +666,15 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
for (int y = 0; y < h; ++y) {
for (int i = 0; i < w2; ++i) {
u32 a = p[i] & 0x80008000;
hitZeroAlpha |= a ^ 0x80008000;
bits &= p[i];
}
if ((bits ^ 0x80008000) != 0) {
return CHECKALPHA_ZERO;
}
p += stride;
}
if (hitZeroAlpha) {
return CHECKALPHA_ZERO;
} else {
return CHECKALPHA_FULL;
}
return CHECKALPHA_FULL;
}

View File

@ -17,6 +17,13 @@
#pragma once
enum CheckAlphaResult {
// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
CHECKALPHA_FULL = 0,
CHECKALPHA_ANY = 4,
CHECKALPHA_ZERO = 8,
};
#include "Common/Common.h"
#include "Core/MemMap.h"
#include "GPU/ge_constants.h"
@ -75,13 +82,6 @@ extern ReliableHash64Func DoReliableHash64;
typedef u32 ReliableHashType;
#endif
enum CheckAlphaResult {
// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
CHECKALPHA_FULL = 0,
CHECKALPHA_ANY = 4,
CHECKALPHA_ZERO = 8,
};
CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h);
CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h);
CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h);

View File

@ -243,3 +243,122 @@ u32 ReliableHash32NEON(const void *input, size_t len, u32 seed) {
return h32;
}
static inline bool VectorIsNonZeroNEON(const uint32x4_t &v) {
u64 low = vgetq_lane_u64(vreinterpretq_u64_u32(v), 0);
u64 high = vgetq_lane_u64(vreinterpretq_u64_u32(v), 1);
return (low | high) != 0;
}
static inline bool VectorIsNonZeroNEON(const uint16x8_t &v) {
u64 low = vgetq_lane_u64(vreinterpretq_u64_u16(v), 0);
u64 high = vgetq_lane_u64(vreinterpretq_u64_u16(v), 1);
return (low | high) != 0;
}
CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h) {
const uint32x4_t zero = vdupq_n_u32(0);
const uint32x4_t full = vdupq_n_u32(0xFF);
const u32 *p = (const u32 *)pixelData;
// Have alpha values == 0 been seen?
uint32x4_t foundAZero = zero;
for (int y = 0; y < h; ++y) {
// Have alpha values > 0 and < 0xFF been seen?
uint32x4_t foundFraction = zero;
for (int i = 0; i < w; i += 4) {
const uint32x4_t a = vshrq_n_u32(vld1q_u32(&p[i]), 24);
const uint32x4_t isZero = vceqq_u32(a, zero);
foundAZero = vorrq_u32(foundAZero, isZero);
// If a = FF, isNotFull will be 0 -> foundFraction will be 0.
// If a = 00, a & isNotFull will be 0 -> foundFraction will be 0.
// In any other case, foundFraction will have some bits set.
const uint32x4_t isNotFull = vcltq_u32(a, full);
foundFraction = vorrq_u32(foundFraction, vandq_u32(a, isNotFull));
}
p += stride;
// We check any early, in case we can skip the rest of the rows.
if (VectorIsNonZeroNEON(foundFraction)) {
return CHECKALPHA_ANY;
}
}
// Now let's sum up the bits.
if (VectorIsNonZeroNEON(foundAZero)) {
return CHECKALPHA_ZERO;
} else {
return CHECKALPHA_FULL;
}
}
CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h) {
const uint16x8_t zero = vdupq_n_u16(0);
const uint16x8_t full = vdupq_n_u16(0xF);
const u16 *p = (const u16 *)pixelData;
// Have alpha values == 0 been seen?
uint16x8_t foundAZero = zero;
for (int y = 0; y < h; ++y) {
// Have alpha values > 0 and < 0xFF been seen?
uint16x8_t foundFraction = zero;
for (int i = 0; i < w; i += 8) {
const uint16x8_t a = vshrq_n_u16(vld1q_u16(&p[i]), 12);
const uint16x8_t isZero = vceqq_u16(a, zero);
foundAZero = vorrq_u16(foundAZero, isZero);
// If a = F, isNotFull will be 0 -> foundFraction will be 0.
// If a = 0, a & isNotFull will be 0 -> foundFraction will be 0.
// In any other case, foundFraction will have some bits set.
const uint16x8_t isNotFull = vcltq_u16(a, full);
foundFraction = vorrq_u16(foundFraction, vandq_u16(a, isNotFull));
}
p += stride;
// We check any early, in case we can skip the rest of the rows.
if (VectorIsNonZeroNEON(foundFraction)) {
return CHECKALPHA_ANY;
}
}
// Now let's sum up the bits.
if (VectorIsNonZeroNEON(foundAZero)) {
return CHECKALPHA_ZERO;
} else {
return CHECKALPHA_FULL;
}
}
CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h) {
const u16 *p = (const u16 *)pixelData;
const uint16x8_t mask = vdupq_n_u16(1);
uint16x8_t bits = vdupq_n_u16(1);
for (int y = 0; y < h; ++y) {
for (int i = 0; i < w; i += 8) {
const uint16x8_t a = vld1q_u16(&p[i]);
bits = vandq_u16(bits, a);
}
uint16x8_t result = veorq_u16(bits, mask);
if (VectorIsNonZeroNEON(result)) {
return CHECKALPHA_ZERO;
}
p += stride;
}
return CHECKALPHA_FULL;
}

View File

@ -20,3 +20,7 @@
u32 QuickTexHashNEON(const void *checkp, u32 size);
void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
u32 ReliableHash32NEON(const void *input, size_t len, u32 seed);
CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h);
CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h);
CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h);