Rewrite QuickTexHashNEON directly in asm.

It seems gcc could not optimize the NEON intrinsics version down to this hand-written assembly.
This commit is contained in:
Unknown W. Brackets 2013-11-09 12:54:55 -08:00
parent 3f57f1f447
commit 2dfa2379f4

View File

@ -29,6 +29,7 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
__builtin_prefetch(checkp, 0, 0);
if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
#if 0
uint32x4_t cursor = vdupq_n_u32(0);
uint32x4_t cursor2 = vld1q_u32((const u32 *)QuickTexHashInitial);
uint32x4_t update = vdupq_n_u32(0x24552455U);
@ -46,6 +47,61 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
cursor = vaddq_u32(cursor, cursor2);
check = vgetq_lane_u32(cursor, 0) + vgetq_lane_u32(cursor, 1) + vgetq_lane_u32(cursor, 2) + vgetq_lane_u32(cursor, 3);
#else
// d0/d1 (q0) - cursor
// d2/d3 (q1) - cursor2
// d4/d5 (q2) - update
// d6-d13 (q3-q6) - memory transfer
asm volatile (
// Initialize cursor.
"vmov.i32 q0, #0\n"
// Initialize cursor2.
"movw r0, 0x0001\n"
"movt r0, 0x0083\n"
"movw r1, 0x4309\n"
"movt r1, 0x4d9b\n"
"vmov d2, r0, r1\n"
"movw r0, 0xb651\n"
"movt r0, 0x4b73\n"
"movw r1, 0x9bd9\n"
"movt r1, 0xc00b\n"
"vmov d2, r0, r1\n"
// Initialize update.
"movw r0, 0x2455\n"
"movt r0, 0x2455\n"
"mov r1, r0\n"
"vmov d4, r0, r1\n"
"vmov d5, r0, r1\n"
// This is where we end.
"add r0, %1, %2\n"
// Okay, do the memory hashing.
"QuickTexHashNEON_next:\n"
"pld [%2, #0xc0]\n"
"vldmia %2!, {d6-d13}\n"
"vmla.i32 q0, q1, q3\n"
"veor.i32 q0, q0, q4\n"
"vmul.i32 q6, q6, q1\n"
"cmp %2, r0\n"
"vadd.i32 q0, q0, q5\n"
"veor.i32 q0, q0, q6\n"
"vadd.i32 q1, q1, q2\n"
"blo QuickTexHashNEON_next\n"
// Now let's get the result.
"vadd.i32 q0, q0, q1\n"
"vadd.i32 d0, d0, d1\n"
"vmov r0, r1, s0, s1\n"
"add %0, r0, r1\n"
: "=r"(check)
: "r"(size), "r"(checkp)
: "r0", "r1"
);
#endif
} else {
const u32 size_u32 = size / 4;
const u32 *p = (const u32 *)checkp;