Mirror of https://github.com/hrydgard/ppsspp.git
Merge pull request #4487 from unknownbrackets/perf

Texture cache hashing tweaks, disable second cache on mobile

Commit 345a4ccf0e
@@ -64,10 +64,33 @@ static u32 QuickTexHashSSE2(const void *checkp, u32 size) {
 }
 
 static u32 QuickTexHashBasic(const void *checkp, u32 size) {
-#ifdef __GNUC__
+#if defined(ARM) && defined(__GNUC__)
 	__builtin_prefetch(checkp, 0, 0);
-#endif
 
+	u32 check;
+	asm volatile (
+		// Let's change size to the end address.
+		"add %1, %1, %2\n"
+		"mov r6, #0\n"
+
+		// If we have zero sized input, we'll return garbage. Oh well, shouldn't happen.
+		"QuickTexHashBasic_next:\n"
+		"ldmia %2!, {r2-r5}\n"
+		"add r6, r6, r2\n"
+		"eor r6, r6, r3\n"
+		"cmp %2, %1\n"
+		"add r6, r6, r4\n"
+		"eor r6, r6, r5\n"
+		"blo QuickTexHashBasic_next\n"
+
+		"QuickTexHashBasic_done:\n"
+		"mov %0, r6\n"
+
+		: "=r"(check)
+		: "r"(size), "r"(checkp)
+		: "r2", "r3", "r4", "r5", "r6"
+	);
+#else
 	u32 check = 0;
 	const u32 size_u32 = size / 4;
 	const u32 *p = (const u32 *)checkp;
@@ -77,6 +100,7 @@ static u32 QuickTexHashBasic(const void *checkp, u32 size) {
 		check += p[i + 2];
 		check ^= p[i + 3];
 	}
+#endif
 
 	return check;
 }
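For readers who don't read ARM assembly: the inline asm above is a drop-in replacement for the portable #else loop that follows it. A minimal standalone sketch of the same hash, with comments mapping each statement back to the instructions, could look like this (the function name and the use of uint32_t in place of PPSSPP's u32 are mine; like the assembly, it assumes a non-zero size that is a multiple of 16 bytes):

#include <stdint.h>

// Sketch of what the ARM asm above computes; not the project's actual code.
static uint32_t QuickTexHashBasicReference(const void *checkp, uint32_t size) {
	uint32_t check = 0;                                     // "mov r6, #0"
	const uint32_t *p = (const uint32_t *)checkp;
	const uint32_t *end = (const uint32_t *)((const char *)checkp + size);  // "add %1, %1, %2"

	// Like the assembly, this runs at least once, so a zero-sized input is not handled.
	do {
		uint32_t a = p[0], b = p[1], c = p[2], d = p[3];    // "ldmia %2!, {r2-r5}"
		p += 4;
		check += a;                                         // "add r6, r6, r2"
		check ^= b;                                         // "eor r6, r6, r3"
		check += c;                                         // "add r6, r6, r4"
		check ^= d;                                         // "eor r6, r6, r5"
	} while (p < end);                                      // "cmp %2, %1" / "blo"

	return check;                                           // "mov %0, r6"
}

The point of the assembly version is simply to keep this inner loop tight: one ldmia plus two adds and two eors per 16 bytes.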
@@ -29,6 +29,7 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
 	__builtin_prefetch(checkp, 0, 0);
 
 	if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
+#if 0
 		uint32x4_t cursor = vdupq_n_u32(0);
 		uint32x4_t cursor2 = vld1q_u32((const u32 *)QuickTexHashInitial);
 		uint32x4_t update = vdupq_n_u32(0x24552455U);
@@ -46,6 +47,61 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
 
 		cursor = vaddq_u32(cursor, cursor2);
 		check = vgetq_lane_u32(cursor, 0) + vgetq_lane_u32(cursor, 1) + vgetq_lane_u32(cursor, 2) + vgetq_lane_u32(cursor, 3);
+#else
+		// d0/d1 (q0) - cursor
+		// d2/d3 (q1) - cursor2
+		// d4/d5 (q2) - update
+		// d6-d13 (q3-q6) - memory transfer
+		asm volatile (
+			// Initialize cursor.
+			"vmov.i32 q0, #0\n"
+
+			// Initialize cursor2.
+			"movw r0, 0x0001\n"
+			"movt r0, 0x0083\n"
+			"movw r1, 0x4309\n"
+			"movt r1, 0x4d9b\n"
+			"vmov d2, r0, r1\n"
+			"movw r0, 0xb651\n"
+			"movt r0, 0x4b73\n"
+			"movw r1, 0x9bd9\n"
+			"movt r1, 0xc00b\n"
+			"vmov d3, r0, r1\n"
+
+			// Initialize update.
+			"movw r0, 0x2455\n"
+			"movt r0, 0x2455\n"
+			"mov r1, r0\n"
+			"vmov d4, r0, r1\n"
+			"vmov d5, r0, r1\n"
+
+			// This is where we end.
+			"add r0, %1, %2\n"
+
+			// Okay, do the memory hashing.
+			"QuickTexHashNEON_next:\n"
+			"pld [%2, #0xc0]\n"
+			"vldmia %2!, {d6-d13}\n"
+			"vmla.i32 q0, q1, q3\n"
+			"veor.i32 q0, q0, q4\n"
+			"vmul.i32 q6, q6, q1\n"
+			"cmp %2, r0\n"
+			"vadd.i32 q0, q0, q5\n"
+			"veor.i32 q0, q0, q6\n"
+			"vadd.i32 q1, q1, q2\n"
+			"blo QuickTexHashNEON_next\n"
+
+			// Now let's get the result.
+			"vadd.i32 q0, q0, q1\n"
+			"vadd.i32 d0, d0, d1\n"
+			"vmov r0, r1, s0, s1\n"
+			"add %0, r0, r1\n"
+
+			: "=r"(check)
+			: "r"(size), "r"(checkp)
+			: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13"
+		);
+#endif
 	} else {
 		const u32 size_u32 = size / 4;
 		const u32 *p = (const u32 *)checkp;
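The register-allocation comments describe what the assembly does, but the disabled #if 0 branch only shows its setup and final reduction. A rough NEON-intrinsics rendering of the whole loop is sketched below; the initial-value array is read off the movw/movt pairs above and assumed to match QuickTexHashInitial, and the function name is mine. Like the assembly path, it requires checkp to be 16-byte aligned and size to be a non-zero multiple of 64 bytes:

#include <arm_neon.h>
#include <stdint.h>

// Intrinsics sketch of the NEON loop above; not the project's actual code.
static uint32_t QuickTexHashNEONSketch(const void *checkp, uint32_t size) {
	// Assumed to be the QuickTexHashInitial values, taken from the movw/movt pairs.
	static const uint32_t initial[4] = { 0x00830001, 0x4d9b4309, 0x4b73b651, 0xc00b9bd9 };

	uint32x4_t cursor = vdupq_n_u32(0);                   // q0
	uint32x4_t cursor2 = vld1q_u32(initial);              // q1
	const uint32x4_t update = vdupq_n_u32(0x24552455U);   // q2

	const uint32_t *p = (const uint32_t *)checkp;
	const uint32_t *end = p + size / 4;
	do {
		// One iteration consumes 64 bytes, like "vldmia %2!, {d6-d13}".
		uint32x4_t b0 = vld1q_u32(p + 0);                 // q3
		uint32x4_t b1 = vld1q_u32(p + 4);                 // q4
		uint32x4_t b2 = vld1q_u32(p + 8);                 // q5
		uint32x4_t b3 = vld1q_u32(p + 12);                // q6
		p += 16;

		cursor = vmlaq_u32(cursor, cursor2, b0);          // "vmla.i32 q0, q1, q3"
		cursor = veorq_u32(cursor, b1);                   // "veor.i32 q0, q0, q4"
		cursor = vaddq_u32(cursor, b2);                   // "vadd.i32 q0, q0, q5"
		cursor = veorq_u32(cursor, vmulq_u32(b3, cursor2));  // "vmul.i32 q6, q6, q1" + "veor.i32 q0, q0, q6"
		cursor2 = vaddq_u32(cursor2, update);             // "vadd.i32 q1, q1, q2"
	} while (p < end);

	// Final reduction, matching the #if 0 branch: fold cursor2 in, then sum the lanes.
	cursor = vaddq_u32(cursor, cursor2);
	return vgetq_lane_u32(cursor, 0) + vgetq_lane_u32(cursor, 1) +
	       vgetq_lane_u32(cursor, 2) + vgetq_lane_u32(cursor, 3);
}

The assembly interleaves the vmul with the cmp, presumably for scheduling, but the arithmetic per 64-byte block is the same.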
@@ -48,6 +48,14 @@
 #define GL_UNPACK_ROW_LENGTH 0x0CF2
 #endif
 
+// TODO: This helps when you have plenty of VRAM, sometimes quite a bit.
+// But on Android, it sometimes causes out of memory that isn't recovered from.
+#if !defined(USING_GLES2) && !defined(_XBOX)
+#define USE_SECONDARY_CACHE 1
+#else
+#define USE_SECONDARY_CACHE 0
+#endif
+
 extern int g_iNumVideos;
 
 TextureCache::TextureCache() : clearCacheNextFrame_(false), lowMemoryMode_(false), clutBuf_(NULL) {
@@ -107,6 +115,7 @@ void TextureCache::Decimate() {
 		else
 			++iter;
 	}
+#if USE_SECONDARY_CACHE
 	for (TexCache::iterator iter = secondCache.begin(); iter != secondCache.end(); ) {
 		if (lowMemoryMode_ || iter->second.lastFrame + TEXTURE_KILL_AGE < gpuStats.numFlips) {
 			glDeleteTextures(1, &iter->second.texture);
@@ -115,6 +124,7 @@ void TextureCache::Decimate() {
 		else
 			++iter;
 	}
+#endif
 }
 
 void TextureCache::Invalidate(u32 addr, int size, GPUInvalidationType type) {
@@ -955,6 +965,7 @@ void TextureCache::SetTexture(bool force) {
 
 			// Don't give up just yet. Let's try the secondary cache if it's been invalidated before.
 			// If it's failed a bunch of times, then the second cache is just wasting time and VRAM.
+#if USE_SECONDARY_CACHE
 			if (entry->numInvalidated > 2 && entry->numInvalidated < 128 && !lowMemoryMode_) {
 				u64 secondKey = fullhash | (u64)cluthash << 32;
 				TexCache::iterator secondIter = secondCache.find(secondKey);
@@ -974,6 +985,7 @@ void TextureCache::SetTexture(bool force) {
 					doDelete = false;
 				}
 			}
+#endif
 		}
 	}
 
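The secondary-cache gating added above is spread across Decimate() and SetTexture(). Distilled into one hypothetical helper (the name is mine; the thresholds are the ones in the diff), the policy reads roughly as:

#ifndef USE_SECONDARY_CACHE
#define USE_SECONDARY_CACHE 0  // assumed fallback so this sketch stands alone
#endif

// Hypothetical distillation of when a rehashed texture consults the second cache.
static bool ShouldTrySecondCache(int numInvalidated, bool lowMemoryMode) {
#if USE_SECONDARY_CACHE
	// Invalidated often enough that rehashing keeps failing, but not so often
	// that the second cache is just wasting time and VRAM.
	return numInvalidated > 2 && numInvalidated < 128 && !lowMemoryMode;
#else
	// On GLES2 (mobile) and Xbox the secondary cache is compiled out entirely,
	// since it can trigger out-of-memory situations that aren't recovered from.
	(void)numInvalidated;
	(void)lowMemoryMode;
	return false;
#endif
}

Compiling the cache out with a macro, rather than testing a runtime flag, also removes the secondCache walk from Decimate() on those platforms.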