mirror of
https://github.com/libretro/ppsspp.git
synced 2024-11-25 01:00:01 +00:00
Merge branch 'CheckAlphaNEON' of git://github.com/KentuckyCompass/ppsspp into KentuckyCompass-CheckAlphaNEON
This commit is contained in:
commit
7b50ec7b75
@ -347,8 +347,10 @@ CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w,
|
||||
const int w4 = w / 4;
|
||||
const int stride4 = stride / 4;
|
||||
|
||||
// Have alpha values == 0 been seen?
|
||||
__m128i hasZeroCursor = _mm_setzero_si128();
|
||||
for (int y = 0; y < h; ++y) {
|
||||
// Have alpha values > 0 and < 0xFF been seen?
|
||||
__m128i hasAnyCursor = _mm_setzero_si128();
|
||||
|
||||
for (int i = 0; i < w4; ++i) {
|
||||
@ -420,29 +422,28 @@ CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w,
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i mask = _mm_set1_epi16(1);
|
||||
|
||||
const __m128i *p = (const __m128i *)pixelData;
|
||||
const int w8 = w / 8;
|
||||
const int stride8 = stride / 8;
|
||||
|
||||
__m128i hasZeroCursor = _mm_setzero_si128();
|
||||
__m128i bits = mask;
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w8; ++i) {
|
||||
const __m128i a = _mm_slli_epi16(_mm_load_si128(&p[i]), 15);
|
||||
|
||||
const __m128i isZero = _mm_cmpeq_epi16(a, zero);
|
||||
hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
|
||||
const __m128i a = _mm_load_si128(&p[i]);
|
||||
bits = _mm_and_si128(bits, a);
|
||||
}
|
||||
|
||||
__m128i result = _mm_xor_si128(bits, mask);
|
||||
if (CombineSSEBitsToDWORD(result) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
}
|
||||
|
||||
p += stride8;
|
||||
}
|
||||
|
||||
// Now let's sum up the bits.
|
||||
if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) {
|
||||
@ -486,39 +487,42 @@ CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w,
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i mask = _mm_set1_epi16((short)0x8000);
|
||||
|
||||
const __m128i *p = (const __m128i *)pixelData;
|
||||
const int w8 = w / 8;
|
||||
const int stride8 = stride / 8;
|
||||
|
||||
__m128i hasZeroCursor = _mm_setzero_si128();
|
||||
__m128i bits = mask;
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w8; ++i) {
|
||||
const __m128i a = _mm_srli_epi16(_mm_load_si128(&p[i]), 15);
|
||||
|
||||
const __m128i isZero = _mm_cmpeq_epi16(a, zero);
|
||||
hasZeroCursor = _mm_or_si128(hasZeroCursor, isZero);
|
||||
const __m128i a = _mm_load_si128(&p[i]);
|
||||
bits = _mm_and_si128(bits, a);
|
||||
}
|
||||
|
||||
__m128i result = _mm_xor_si128(bits, mask);
|
||||
if (CombineSSEBitsToDWORD(result) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
}
|
||||
|
||||
p += stride8;
|
||||
}
|
||||
|
||||
// Now let's sum up the bits.
|
||||
if (CombineSSEBitsToDWORD(hasZeroCursor) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) {
|
||||
#ifdef _M_SSE
|
||||
// Use SSE if aligned to 16 bytes / 4 pixels (almost always the case.)
|
||||
// Use SIMD if aligned to 16 bytes / 4 pixels (almost always the case.)
|
||||
if ((w & 3) == 0 && (stride & 3) == 0) {
|
||||
#ifdef _M_SSE
|
||||
return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h);
|
||||
}
|
||||
#elif defined(ARM) || defined(ARM64)
|
||||
if (cpu_info.bNEON) {
|
||||
return CheckAlphaRGBA8888NEON(pixelData, stride, w, h);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
u32 hitZeroAlpha = 0;
|
||||
|
||||
@ -543,12 +547,16 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) {
|
||||
#ifdef _M_SSE
|
||||
// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
|
||||
// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
|
||||
if ((w & 7) == 0 && (stride & 7) == 0) {
|
||||
#ifdef _M_SSE
|
||||
return CheckAlphaABGR4444SSE2(pixelData, stride, w, h);
|
||||
}
|
||||
#elif defined(ARM) || defined(ARM64)
|
||||
if (cpu_info.bNEON) {
|
||||
return CheckAlphaABGR4444NEON(pixelData, stride, w, h);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
u32 hitZeroAlpha = 0;
|
||||
|
||||
@ -576,12 +584,16 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) {
|
||||
#ifdef _M_SSE
|
||||
// Use SSE if aligned to 16 bytes / 8 pixels (usually the case.)
|
||||
// Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.)
|
||||
if ((w & 7) == 0 && (stride & 7) == 0) {
|
||||
#ifdef _M_SSE
|
||||
return CheckAlphaABGR1555SSE2(pixelData, stride, w, h);
|
||||
}
|
||||
#elif defined(ARM) || defined(ARM64)
|
||||
if (cpu_info.bNEON) {
|
||||
return CheckAlphaABGR1555NEON(pixelData, stride, w, h);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
u32 hitZeroAlpha = 0;
|
||||
|
||||
@ -589,19 +601,20 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w
|
||||
const int w2 = (w + 1) / 2;
|
||||
const int stride2 = (stride + 1) / 2;
|
||||
|
||||
u32 bits = 0x00010001;
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w2; ++i) {
|
||||
u32 a = p[i] & 0x00010001;
|
||||
hitZeroAlpha |= a ^ 0x00010001;
|
||||
bits &= p[i];
|
||||
}
|
||||
|
||||
if ((bits ^ 0x00010001) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
}
|
||||
|
||||
p += stride2;
|
||||
}
|
||||
|
||||
if (hitZeroAlpha) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) {
|
||||
@ -645,7 +658,7 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
|
||||
}
|
||||
#endif
|
||||
|
||||
u32 hitZeroAlpha = 0;
|
||||
u32 bits = 0x80008000;
|
||||
|
||||
const u32 *p = pixelData;
|
||||
const int w2 = (w + 1) / 2;
|
||||
@ -653,15 +666,15 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w
|
||||
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w2; ++i) {
|
||||
u32 a = p[i] & 0x80008000;
|
||||
hitZeroAlpha |= a ^ 0x80008000;
|
||||
bits &= p[i];
|
||||
}
|
||||
|
||||
if ((bits ^ 0x80008000) != 0) {
|
||||
return CHECKALPHA_ZERO;
|
||||
}
|
||||
|
||||
p += stride;
|
||||
}
|
||||
|
||||
if (hitZeroAlpha) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
|
@ -17,6 +17,13 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
enum CheckAlphaResult {
|
||||
// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
|
||||
CHECKALPHA_FULL = 0,
|
||||
CHECKALPHA_ANY = 4,
|
||||
CHECKALPHA_ZERO = 8,
|
||||
};
|
||||
|
||||
#include "Common/Common.h"
|
||||
#include "Core/MemMap.h"
|
||||
#include "GPU/ge_constants.h"
|
||||
@ -75,13 +82,6 @@ extern ReliableHash64Func DoReliableHash64;
|
||||
typedef u32 ReliableHashType;
|
||||
#endif
|
||||
|
||||
enum CheckAlphaResult {
|
||||
// These are intended to line up with TexCacheEntry::STATUS_ALPHA_UNKNOWN, etc.
|
||||
CHECKALPHA_FULL = 0,
|
||||
CHECKALPHA_ANY = 4,
|
||||
CHECKALPHA_ZERO = 8,
|
||||
};
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h);
|
||||
CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h);
|
||||
CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h);
|
||||
|
@ -243,3 +243,122 @@ u32 ReliableHash32NEON(const void *input, size_t len, u32 seed) {
|
||||
|
||||
return h32;
|
||||
}
|
||||
|
||||
static inline bool VectorIsNonZeroNEON(const uint32x4_t &v) {
|
||||
u64 low = vgetq_lane_u64(vreinterpretq_u64_u32(v), 0);
|
||||
u64 high = vgetq_lane_u64(vreinterpretq_u64_u32(v), 1);
|
||||
|
||||
return (low | high) != 0;
|
||||
}
|
||||
|
||||
static inline bool VectorIsNonZeroNEON(const uint16x8_t &v) {
|
||||
u64 low = vgetq_lane_u64(vreinterpretq_u64_u16(v), 0);
|
||||
u64 high = vgetq_lane_u64(vreinterpretq_u64_u16(v), 1);
|
||||
|
||||
return (low | high) != 0;
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h) {
|
||||
const uint32x4_t zero = vdupq_n_u32(0);
|
||||
const uint32x4_t full = vdupq_n_u32(0xFF);
|
||||
|
||||
const u32 *p = (const u32 *)pixelData;
|
||||
|
||||
// Have alpha values == 0 been seen?
|
||||
uint32x4_t foundAZero = zero;
|
||||
|
||||
for (int y = 0; y < h; ++y) {
|
||||
// Have alpha values > 0 and < 0xFF been seen?
|
||||
uint32x4_t foundFraction = zero;
|
||||
|
||||
for (int i = 0; i < w; i += 4) {
|
||||
const uint32x4_t a = vshrq_n_u32(vld1q_u32(&p[i]), 24);
|
||||
|
||||
const uint32x4_t isZero = vceqq_u32(a, zero);
|
||||
foundAZero = vorrq_u32(foundAZero, isZero);
|
||||
|
||||
// If a = FF, isNotFull will be 0 -> foundFraction will be 0.
|
||||
// If a = 00, a & isNotFull will be 0 -> foundFraction will be 0.
|
||||
// In any other case, foundFraction will have some bits set.
|
||||
const uint32x4_t isNotFull = vcltq_u32(a, full);
|
||||
foundFraction = vorrq_u32(foundFraction, vandq_u32(a, isNotFull));
|
||||
}
|
||||
p += stride;
|
||||
|
||||
// We check any early, in case we can skip the rest of the rows.
|
||||
if (VectorIsNonZeroNEON(foundFraction)) {
|
||||
return CHECKALPHA_ANY;
|
||||
}
|
||||
}
|
||||
|
||||
// Now let's sum up the bits.
|
||||
if (VectorIsNonZeroNEON(foundAZero)) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h) {
|
||||
const uint16x8_t zero = vdupq_n_u16(0);
|
||||
const uint16x8_t full = vdupq_n_u16(0xF);
|
||||
|
||||
const u16 *p = (const u16 *)pixelData;
|
||||
|
||||
// Have alpha values == 0 been seen?
|
||||
uint16x8_t foundAZero = zero;
|
||||
|
||||
for (int y = 0; y < h; ++y) {
|
||||
// Have alpha values > 0 and < 0xFF been seen?
|
||||
uint16x8_t foundFraction = zero;
|
||||
|
||||
for (int i = 0; i < w; i += 8) {
|
||||
const uint16x8_t a = vshrq_n_u16(vld1q_u16(&p[i]), 12);
|
||||
|
||||
const uint16x8_t isZero = vceqq_u16(a, zero);
|
||||
foundAZero = vorrq_u16(foundAZero, isZero);
|
||||
|
||||
// If a = F, isNotFull will be 0 -> foundFraction will be 0.
|
||||
// If a = 0, a & isNotFull will be 0 -> foundFraction will be 0.
|
||||
// In any other case, foundFraction will have some bits set.
|
||||
const uint16x8_t isNotFull = vcltq_u16(a, full);
|
||||
foundFraction = vorrq_u16(foundFraction, vandq_u16(a, isNotFull));
|
||||
}
|
||||
p += stride;
|
||||
|
||||
// We check any early, in case we can skip the rest of the rows.
|
||||
if (VectorIsNonZeroNEON(foundFraction)) {
|
||||
return CHECKALPHA_ANY;
|
||||
}
|
||||
}
|
||||
|
||||
// Now let's sum up the bits.
|
||||
if (VectorIsNonZeroNEON(foundAZero)) {
|
||||
return CHECKALPHA_ZERO;
|
||||
} else {
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
}
|
||||
|
||||
CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h) {
|
||||
const u16 *p = (const u16 *)pixelData;
|
||||
|
||||
const uint16x8_t mask = vdupq_n_u16(1);
|
||||
uint16x8_t bits = vdupq_n_u16(1);
|
||||
for (int y = 0; y < h; ++y) {
|
||||
for (int i = 0; i < w; i += 8) {
|
||||
const uint16x8_t a = vld1q_u16(&p[i]);
|
||||
|
||||
bits = vandq_u16(bits, a);
|
||||
}
|
||||
|
||||
uint16x8_t result = veorq_u16(bits, mask);
|
||||
if (VectorIsNonZeroNEON(result)) {
|
||||
return CHECKALPHA_ZERO;
|
||||
}
|
||||
|
||||
p += stride;
|
||||
}
|
||||
|
||||
return CHECKALPHA_FULL;
|
||||
}
|
||||
|
@ -20,3 +20,7 @@
|
||||
u32 QuickTexHashNEON(const void *checkp, u32 size);
|
||||
void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
|
||||
u32 ReliableHash32NEON(const void *input, size_t len, u32 seed);
|
||||
|
||||
CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h);
|
||||
CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h);
|
||||
CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h);
|
||||
|
Loading…
Reference in New Issue
Block a user