diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp index 21d4a76e0d..4d66ecd1a7 100644 --- a/GPU/Common/TextureDecoder.cpp +++ b/GPU/Common/TextureDecoder.cpp @@ -64,10 +64,33 @@ static u32 QuickTexHashSSE2(const void *checkp, u32 size) { } static u32 QuickTexHashBasic(const void *checkp, u32 size) { -#ifdef __GNUC__ +#if defined(ARM) && defined(__GNUC__) __builtin_prefetch(checkp, 0, 0); -#endif + u32 check; + asm volatile ( + // Let's change size to the end address. + "add %1, %1, %2\n" + "mov r6, #0\n" + + // If we have zero sized input, we'll return garbage. Oh well, shouldn't happen. + "QuickTexHashBasic_next:\n" + "ldmia %2!, {r2-r5}\n" + "add r6, r6, r2\n" + "eor r6, r6, r3\n" + "cmp %2, %1\n" + "add r6, r6, r4\n" + "eor r6, r6, r5\n" + "blo QuickTexHashBasic_next\n" + + "QuickTexHashBasic_done:\n" + "mov %0, r6\n" + + : "=r"(check) + : "r"(size), "r"(checkp) + : "r2", "r3", "r4", "r5", "r6" + ); +#else u32 check = 0; const u32 size_u32 = size / 4; const u32 *p = (const u32 *)checkp; @@ -77,6 +100,7 @@ static u32 QuickTexHashBasic(const void *checkp, u32 size) { check += p[i + 2]; check ^= p[i + 3]; } +#endif return check; } diff --git a/GPU/Common/TextureDecoderNEON.cpp b/GPU/Common/TextureDecoderNEON.cpp index 3c249de082..fc7750a657 100644 --- a/GPU/Common/TextureDecoderNEON.cpp +++ b/GPU/Common/TextureDecoderNEON.cpp @@ -29,6 +29,7 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) { __builtin_prefetch(checkp, 0, 0); if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) { +#if 0 uint32x4_t cursor = vdupq_n_u32(0); uint32x4_t cursor2 = vld1q_u32((const u32 *)QuickTexHashInitial); uint32x4_t update = vdupq_n_u32(0x24552455U); @@ -46,6 +47,61 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) { cursor = vaddq_u32(cursor, cursor2); check = vgetq_lane_u32(cursor, 0) + vgetq_lane_u32(cursor, 1) + vgetq_lane_u32(cursor, 2) + vgetq_lane_u32(cursor, 3); +#else + // d0/d1 (q0) - cursor + // d2/d3 (q1) - cursor2 + // d4/d5 (q2) - update + // d6-d13 (q3-q6) - memory transfer + asm volatile ( + // Initialize cursor. + "vmov.i32 q0, #0\n" + + // Initialize cursor2. + "movw r0, 0x0001\n" + "movt r0, 0x0083\n" + "movw r1, 0x4309\n" + "movt r1, 0x4d9b\n" + "vmov d2, r0, r1\n" + "movw r0, 0xb651\n" + "movt r0, 0x4b73\n" + "movw r1, 0x9bd9\n" + "movt r1, 0xc00b\n" + "vmov d2, r0, r1\n" + + // Initialize update. + "movw r0, 0x2455\n" + "movt r0, 0x2455\n" + "mov r1, r0\n" + "vmov d4, r0, r1\n" + "vmov d5, r0, r1\n" + + // This is where we end. + "add r0, %1, %2\n" + + // Okay, do the memory hashing. + "QuickTexHashNEON_next:\n" + "pld [%2, #0xc0]\n" + "vldmia %2!, {d6-d13}\n" + "vmla.i32 q0, q1, q3\n" + "veor.i32 q0, q0, q4\n" + "vmul.i32 q6, q6, q1\n" + "cmp %2, r0\n" + "vadd.i32 q0, q0, q5\n" + "veor.i32 q0, q0, q6\n" + "vadd.i32 q1, q1, q2\n" + "blo QuickTexHashNEON_next\n" + + // Now let's get the result. + "vadd.i32 q0, q0, q1\n" + "vadd.i32 d0, d0, d1\n" + "vmov r0, r1, s0, s1\n" + "add %0, r0, r1\n" + + : "=r"(check) + : "r"(size), "r"(checkp) + : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13" + ); +#endif } else { const u32 size_u32 = size / 4; const u32 *p = (const u32 *)checkp; diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp index ae0244b04d..8edb791a22 100644 --- a/GPU/GLES/TextureCache.cpp +++ b/GPU/GLES/TextureCache.cpp @@ -48,6 +48,14 @@ #define GL_UNPACK_ROW_LENGTH 0x0CF2 #endif +// TODO: This helps when you have plenty of VRAM, sometimes quite a bit. +// But on Android, it sometimes causes out of memory that isn't recovered from. +#if !defined(USING_GLES2) && !defined(_XBOX) +#define USE_SECONDARY_CACHE 1 +#else +#define USE_SECONDARY_CACHE 0 +#endif + extern int g_iNumVideos; TextureCache::TextureCache() : clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL) { @@ -107,6 +115,7 @@ void TextureCache::Decimate() { else ++iter; } +#if USE_SECONDARY_CACHE for (TexCache::iterator iter = secondCache.begin(); iter != secondCache.end(); ) { if (lowMemoryMode_ || iter->second.lastFrame + TEXTURE_KILL_AGE < gpuStats.numFlips) { glDeleteTextures(1, &iter->second.texture); @@ -115,6 +124,7 @@ void TextureCache::Decimate() { else ++iter; } +#endif } void TextureCache::Invalidate(u32 addr, int size, GPUInvalidationType type) { @@ -955,6 +965,7 @@ void TextureCache::SetTexture(bool force) { // Don't give up just yet. Let's try the secondary cache if it's been invalidated before. // If it's failed a bunch of times, then the second cache is just wasting time and VRAM. +#if USE_SECONDARY_CACHE if (entry->numInvalidated > 2 && entry->numInvalidated < 128 && !lowMemoryMode_) { u64 secondKey = fullhash | (u64)cluthash << 32; TexCache::iterator secondIter = secondCache.find(secondKey); @@ -974,6 +985,7 @@ void TextureCache::SetTexture(bool force) { doDelete = false; } } +#endif } }