mirror of
https://github.com/libretro/ppsspp.git
synced 2024-11-25 01:00:01 +00:00
Merge pull request #7029 from unknownbrackets/texhash
Attempt to make the texture hash match(able) - needs testing
This commit is contained in:
commit
a4c8bb03a2
@ -39,7 +39,7 @@ u32 QuickTexHashSSE2(const void *checkp, u32 size) {
|
||||
const __m128i *p = (const __m128i *)checkp;
|
||||
for (u32 i = 0; i < size / 16; i += 4) {
|
||||
__m128i chunk = _mm_mullo_epi16(_mm_load_si128(&p[i]), cursor2);
|
||||
cursor = _mm_add_epi32(cursor, chunk);
|
||||
cursor = _mm_add_epi16(cursor, chunk);
|
||||
cursor = _mm_xor_si128(cursor, _mm_load_si128(&p[i + 1]));
|
||||
cursor = _mm_add_epi32(cursor, _mm_load_si128(&p[i + 2]));
|
||||
chunk = _mm_mullo_epi16(_mm_load_si128(&p[i + 3]), cursor2);
|
||||
@ -63,6 +63,57 @@ u32 QuickTexHashSSE2(const void *checkp, u32 size) {
|
||||
}
|
||||
#endif
|
||||
|
||||
// Scalar (no SIMD intrinsics) implementation of the quick texture hash.
//
//   checkp - start of the bytes to hash.
//   size   - number of bytes to hash.
//
// Returns a 32-bit hash of the data.
//
// Two paths:
//  * checkp 16-byte aligned and size a multiple of 64: the data is consumed in
//    64-byte steps as four 128-bit "vectors", each handled lane by lane.  The
//    sequence of operations (16-bit multiply, 16-bit add, xor, 32-bit add,
//    16-bit multiply, xor, then a 16-bit add to the multiplier) mirrors the
//    vector loop in QuickTexHashSSE2.
//  * otherwise: a simple add/xor fold over pairs of 32-bit words.  Any trailing
//    bytes beyond a multiple of 8 are not read.
//
// NOTE(review): the 16-bit/32-bit lane overlay goes through a union, so the
// result depends on host byte order; presumably it only matches the SIMD
// versions on little-endian hosts - confirm before relying on it elsewhere.
u32 QuickTexHashNonSSE(const void *checkp, u32 size) {
	u32 check = 0;

	if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
		// Initial per-lane 16-bit multipliers.
		static const u16 cursor2_initial[8] = {0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U};
		// Added to every 16-bit multiplier lane after each 64-byte step.
		static const u16 update = 0x2455U;

		// One 128-bit value, viewable as 4 x 32-bit or 8 x 16-bit lanes.
		union u32x4_u16x8 {
			u32 x32[4];
			u16 x16[8];
		};

		u32x4_u16x8 cursor = {0, 0, 0, 0};
		u32x4_u16x8 cursor2;
		for (u32 j = 0; j < 8; ++j) {
			cursor2.x16[j] = cursor2_initial[j];
		}

		const u32x4_u16x8 *p = (const u32x4_u16x8 *)checkp;
		// size is a multiple of 64 here, so size / 16 is a multiple of 4.
		for (u32 i = 0; i < size / 16; i += 4) {
			for (u32 j = 0; j < 8; ++j) {
				// Multiply as u32: u16 operands would otherwise be promoted to
				// signed int, and products above INT_MAX (e.g. 0xc00b * 0xffff)
				// are undefined behaviour.  Only the low 16 bits are kept.
				const u16 temp = (u16)((u32)p[i + 0].x16[j] * (u32)cursor2.x16[j]);
				cursor.x16[j] += temp;
			}
			for (u32 j = 0; j < 4; ++j) {
				cursor.x32[j] ^= p[i + 1].x32[j];
				cursor.x32[j] += p[i + 2].x32[j];
			}
			for (u32 j = 0; j < 8; ++j) {
				// Same unsigned multiply as above to avoid signed overflow.
				const u16 temp = (u16)((u32)p[i + 3].x16[j] * (u32)cursor2.x16[j]);
				cursor.x16[j] ^= temp;
			}
			for (u32 j = 0; j < 8; ++j) {
				cursor2.x16[j] += update;
			}
		}

		// Fold the final multiplier state in as 32-bit lanes, then sum the lanes.
		for (u32 j = 0; j < 4; ++j) {
			cursor.x32[j] += cursor2.x32[j];
		}
		check = cursor.x32[0] + cursor.x32[1] + cursor.x32[2] + cursor.x32[3];
	} else {
		// Fallback: alternate add and xor over consecutive 32-bit words,
		// two words (8 bytes) per iteration.
		const u32 *p = (const u32 *)checkp;
		for (u32 i = 0; i < size / 8; ++i) {
			check += *p++;
			check ^= *p++;
		}
	}

	return check;
}
|
||||
|
||||
static u32 QuickTexHashBasic(const void *checkp, u32 size) {
|
||||
#if defined(ARM) && defined(__GNUC__)
|
||||
__builtin_prefetch(checkp, 0, 0);
|
||||
|
@ -22,7 +22,7 @@
|
||||
#error Should not be compiled on non-ARM.
|
||||
#endif
|
||||
|
||||
static const u16 MEMORY_ALIGNED16(QuickTexHashInitial[8]) = {0x0001U, 0x0083U, 0x4309U, 0x4d9bU, 0xb651U, 0x4b73U, 0x9bd9U, 0xc00bU};
|
||||
static const u16 MEMORY_ALIGNED16(QuickTexHashInitial[8]) = {0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U};
|
||||
|
||||
u32 QuickTexHashNEON(const void *checkp, u32 size) {
|
||||
u32 check = 0;
|
||||
@ -31,21 +31,21 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
|
||||
if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
|
||||
#ifdef IOS
|
||||
uint32x4_t cursor = vdupq_n_u32(0);
|
||||
uint32x4_t cursor2 = vld1q_u32((const u32 *)QuickTexHashInitial);
|
||||
uint32x4_t update = vdupq_n_u32(0x24552455U);
|
||||
uint16x8_t cursor2 = vld1q_u16(QuickTexHashInitial);
|
||||
uint16x8_t update = vdupq_n_u16(0x2455U);
|
||||
|
||||
const u32 *p = (const u32 *)checkp;
|
||||
for (u32 i = 0; i < size / 16; i += 4) {
|
||||
cursor = vmlaq_u32(cursor, vld1q_u32(&p[4 * 0]), cursor2);
|
||||
cursor = vreinterpretq_u32_u16(vmlaq_u16(vreinterpretq_u16_u32(cursor), vreinterpretq_u16_u32(vld1q_u32(&p[4 * 0])), cursor2));
|
||||
cursor = veorq_u32(cursor, vld1q_u32(&p[4 * 1]));
|
||||
cursor = vaddq_u32(cursor, vld1q_u32(&p[4 * 2]));
|
||||
cursor = veorq_u32(cursor, vmulq_u32(vld1q_u32(&p[4 * 3]), cursor2));
|
||||
cursor2 = vaddq_u32(cursor2, update);
|
||||
cursor = veorq_u32(cursor, vreinterpretq_u32_u16(vmulq_u16(vreinterpretq_u16_u32(vld1q_u32(&p[4 * 3])), cursor2)));
|
||||
cursor2 = vaddq_u16(cursor2, update);
|
||||
|
||||
p += 4 * 4;
|
||||
}
|
||||
|
||||
cursor = vaddq_u32(cursor, cursor2);
|
||||
cursor = vaddq_u32(cursor, vreinterpretq_u32_u16(cursor2));
|
||||
check = vgetq_lane_u32(cursor, 0) + vgetq_lane_u32(cursor, 1) + vgetq_lane_u32(cursor, 2) + vgetq_lane_u32(cursor, 3);
|
||||
#else
|
||||
// TODO: Why does this crash on iOS, but only certain devices?
|
||||
@ -60,15 +60,15 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
|
||||
"vmov.i32 q0, #0\n"
|
||||
|
||||
// Initialize cursor2.
|
||||
"movw r0, 0x0001\n"
|
||||
"movt r0, 0x0083\n"
|
||||
"movw r1, 0x4309\n"
|
||||
"movt r1, 0x4d9b\n"
|
||||
"movw r0, 0xc00b\n"
|
||||
"movt r0, 0x9bd9\n"
|
||||
"movw r1, 0x4b73\n"
|
||||
"movt r1, 0xb651\n"
|
||||
"vmov d2, r0, r1\n"
|
||||
"movw r0, 0xb651\n"
|
||||
"movt r0, 0x4b73\n"
|
||||
"movw r1, 0x9bd9\n"
|
||||
"movt r1, 0xc00b\n"
|
||||
"movw r0, 0x4d9b\n"
|
||||
"movt r0, 0x4309\n"
|
||||
"movw r1, 0x0083\n"
|
||||
"movt r1, 0x0001\n"
|
||||
"vmov d3, r0, r1\n"
|
||||
|
||||
// Initialize update.
|
||||
@ -82,12 +82,12 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
|
||||
"QuickTexHashNEON_next:\n"
|
||||
"pld [%2, #0xc0]\n"
|
||||
"vldmia %2!, {d16-d23}\n"
|
||||
"vmla.i32 q0, q1, q8\n"
|
||||
"vmul.i32 q11, q11, q1\n"
|
||||
"vmla.i16 q0, q1, q8\n"
|
||||
"vmul.i16 q11, q11, q1\n"
|
||||
"veor.i32 q0, q0, q9\n"
|
||||
"cmp %2, r0\n"
|
||||
"vadd.i32 q0, q0, q10\n"
|
||||
"vadd.i32 q1, q1, q2\n"
|
||||
"vadd.i16 q1, q1, q2\n"
|
||||
"veor.i32 q0, q0, q11\n"
|
||||
"blo QuickTexHashNEON_next\n"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user