Merge pull request #4487 from unknownbrackets/perf

Texture cache hashing tweaks, disable second cache on mobile
2024-10-07 10:53:31 +00:00 · 2013-11-09 17:30:40 -08:00 · 2013-11-09 17:30:40 -08:00 · 345a4ccf0e
commit 345a4ccf0e
parent bcd499ff38 fa79d7f13c
3 changed files with 94 additions and 2 deletions
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@ -64,10 +64,33 @@ static u32 QuickTexHashSSE2(const void *checkp, u32 size) {
 }

 static u32 QuickTexHashBasic(const void *checkp, u32 size) {
-#ifdef __GNUC__
+#if defined(ARM) && defined(__GNUC__)
 	__builtin_prefetch(checkp, 0, 0);
-#endif

+	u32 check;
+	asm volatile (
+		// Let's change size to the end address.
+		"add %1, %1, %2\n"
+		"mov r6, #0\n"
+
+		// If we have zero sized input, we'll return garbage.  Oh well, shouldn't happen.
+		"QuickTexHashBasic_next:\n"
+		"ldmia %2!, {r2-r5}\n"
+		"add r6, r6, r2\n"
+		"eor r6, r6, r3\n"
+		"cmp %2, %1\n"
+		"add r6, r6, r4\n"
+		"eor r6, r6, r5\n"
+		"blo QuickTexHashBasic_next\n"
+
+		"QuickTexHashBasic_done:\n"
+		"mov %0, r6\n"
+
+		: "=r"(check)
+		: "r"(size), "r"(checkp)
+		: "r2", "r3", "r4", "r5", "r6"
+	);
+#else
 	u32 check = 0;
 	const u32 size_u32 = size / 4;
 	const u32 *p = (const u32 *)checkp;
@ -77,6 +100,7 @@ static u32 QuickTexHashBasic(const void *checkp, u32 size) {
 		check += p[i + 2];
 		check ^= p[i + 3];
 	}
+#endif

 	return check;
 }
--- a/GPU/Common/TextureDecoderNEON.cpp
+++ b/GPU/Common/TextureDecoderNEON.cpp
@ -29,6 +29,7 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
 	__builtin_prefetch(checkp, 0, 0);

 	if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
+#if 0
 		uint32x4_t cursor = vdupq_n_u32(0);
 		uint32x4_t cursor2 = vld1q_u32((const u32 *)QuickTexHashInitial);
 		uint32x4_t update = vdupq_n_u32(0x24552455U);
@ -46,6 +47,61 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {

 		cursor = vaddq_u32(cursor, cursor2);
 		check = vgetq_lane_u32(cursor, 0) + vgetq_lane_u32(cursor, 1) + vgetq_lane_u32(cursor, 2) + vgetq_lane_u32(cursor, 3);
+#else
+		// d0/d1 (q0) - cursor
+		// d2/d3 (q1) - cursor2
+		// d4/d5 (q2) - update
+		// d6-d13 (q3-q6) - memory transfer
+		asm volatile (
+			// Initialize cursor.
+			"vmov.i32 q0, #0\n"
+
+			// Initialize cursor2.
+			"movw r0, 0x0001\n"
+			"movt r0, 0x0083\n"
+			"movw r1, 0x4309\n"
+			"movt r1, 0x4d9b\n"
+			"vmov d2, r0, r1\n"
+			"movw r0, 0xb651\n"
+			"movt r0, 0x4b73\n"
+			"movw r1, 0x9bd9\n"
+			"movt r1, 0xc00b\n"
+			"vmov d2, r0, r1\n"
+
+			// Initialize update.
+			"movw r0, 0x2455\n"
+			"movt r0, 0x2455\n"
+			"mov r1, r0\n"
+			"vmov d4, r0, r1\n"
+			"vmov d5, r0, r1\n"
+
+			// This is where we end.
+			"add r0, %1, %2\n"
+
+			// Okay, do the memory hashing.
+			"QuickTexHashNEON_next:\n"
+			"pld [%2, #0xc0]\n"
+			"vldmia %2!, {d6-d13}\n"
+			"vmla.i32 q0, q1, q3\n"
+			"veor.i32 q0, q0, q4\n"
+			"vmul.i32 q6, q6, q1\n"
+			"cmp %2, r0\n"
+			"vadd.i32 q0, q0, q5\n"
+			"veor.i32 q0, q0, q6\n"
+			"vadd.i32 q1, q1, q2\n"
+			"blo QuickTexHashNEON_next\n"
+
+			// Now let's get the result.
+			"vadd.i32 q0, q0, q1\n"
+			"vadd.i32 d0, d0, d1\n"
+			"vmov r0, r1, s0, s1\n"
+			"add %0, r0, r1\n"
+
+			: "=r"(check)
+			: "r"(size), "r"(checkp)
+			: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13"
+		);
+#endif
 	} else {
 		const u32 size_u32 = size / 4;
 		const u32 *p = (const u32 *)checkp;
--- a/GPU/GLES/TextureCache.cpp
+++ b/GPU/GLES/TextureCache.cpp
@ -48,6 +48,14 @@
 #define GL_UNPACK_ROW_LENGTH 0x0CF2
 #endif

+// TODO: This helps when you have plenty of VRAM, sometimes quite a bit.
+// But on Android, it sometimes causes out of memory that isn't recovered from.
+#if !defined(USING_GLES2) && !defined(_XBOX)
+#define USE_SECONDARY_CACHE 1
+#else
+#define USE_SECONDARY_CACHE 0
+#endif
+
 extern int g_iNumVideos;

 TextureCache::TextureCache() : clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL) {
@ -107,6 +115,7 @@ void TextureCache::Decimate() {
 		else
 			++iter;
 	}
+#if USE_SECONDARY_CACHE
 	for (TexCache::iterator iter = secondCache.begin(); iter != secondCache.end(); ) {
 		if (lowMemoryMode_ || iter->second.lastFrame + TEXTURE_KILL_AGE < gpuStats.numFlips) {
 			glDeleteTextures(1, &iter->second.texture);
@ -115,6 +124,7 @@ void TextureCache::Decimate() {
 		else
 			++iter;
 	}
+#endif
 }

 void TextureCache::Invalidate(u32 addr, int size, GPUInvalidationType type) {
@ -955,6 +965,7 @@ void TextureCache::SetTexture(bool force) {

 				// Don't give up just yet.  Let's try the secondary cache if it's been invalidated before.
 				// If it's failed a bunch of times, then the second cache is just wasting time and VRAM.
+#if USE_SECONDARY_CACHE
 				if (entry->numInvalidated > 2 && entry->numInvalidated < 128 && !lowMemoryMode_) {
 					u64 secondKey = fullhash | (u64)cluthash << 32;
 					TexCache::iterator secondIter = secondCache.find(secondKey);
@ -974,6 +985,7 @@ void TextureCache::SetTexture(bool force) {
 						doDelete = false;
 					}
 				}
+#endif
 			}
 		}