From 510ef2c738ba8a3405b87a014461340578ce538a Mon Sep 17 00:00:00 2001 From: Tiny Tiger Date: Sat, 6 Aug 2016 23:56:17 +0200 Subject: [PATCH] More intense debugging. --- mupen64plus-rsp-cxd4/rsp.c | 5 +- mupen64plus-rsp-cxd4/vu/vmadn.h | 14 ++++ .../arch/x86_64/rsp/vmuln.h | 82 +++++++++++++++++++ mupen64plus-rsp-paraLLEl/rsp.cpp | 9 +- 4 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 mupen64plus-rsp-paraLLEl/arch/x86_64/rsp/vmuln.h diff --git a/mupen64plus-rsp-cxd4/rsp.c b/mupen64plus-rsp-cxd4/rsp.c index 50ab9001..b80dfbb2 100644 --- a/mupen64plus-rsp-cxd4/rsp.c +++ b/mupen64plus-rsp-cxd4/rsp.c @@ -93,7 +93,6 @@ static INLINE unsigned SPECIAL(uint32_t inst, uint32_t PC) { uint64_t hash = hash_imem((const uint8_t*)VR, sizeof(VR)); fprintf(stderr, "JR (PC: %u): 0, %llu\n", temp_PC & 0xfff, hash); - fprintf(stderr, " DMEM HASH: 0x%016llx\n", hash_imem(RSP.DMEM, 0x1000)); } return 1; @@ -486,6 +485,10 @@ EX: const int e = (inst >> 21) & 0xF; /* rs & 0xF */ COP2_C2[opcode](vd, vs, vt, e); + { + uint64_t hash = hash_imem((const uint8_t*)VR, sizeof(VR)); + fprintf(stderr, "CP2 (PC: %u): 0, %llu\n", opcode, hash); + } } else if (run_task_opcode(inst, inst >> 26)) { diff --git a/mupen64plus-rsp-cxd4/vu/vmadn.h b/mupen64plus-rsp-cxd4/vu/vmadn.h index 103ee75c..2cfb2ba0 100644 --- a/mupen64plus-rsp-cxd4/vu/vmadn.h +++ b/mupen64plus-rsp-cxd4/vu/vmadn.h @@ -15,6 +15,18 @@ INLINE static void do_madn(short* VD, short* VS, short* VT) { + unsigned i; + for (i = 0; i < 8; i++) + fprintf(stderr, "ACC LO[%u] = %d\n", i, VACC_L[i]); + for (i = 0; i < 8; i++) + fprintf(stderr, "ACC MD[%u] = %d\n", i, VACC_M[i]); + for (i = 0; i < 8; i++) + fprintf(stderr, "ACC HI[%u] = %d\n", i, VACC_H[i]); + for (i = 0; i < 8; i++) + fprintf(stderr, "VS[%u] = %d\n", i, VS[i]); + for (i = 0; i < 8; i++) + fprintf(stderr, "VT[%u] = %d\n", i, VT[i]); + #ifdef ARCH_MIN_SSE2 __m128i acc_hi, acc_md, acc_lo; __m128i prod_hi, prod_lo; @@ -76,6 +88,8 @@ INLINE static void do_madn(short* 
VD, short* VS, short* VT) vs = _mm_xor_si128(vs, acc_md); /* Stupid unsigned-clamp-ish adjustment. */ _mm_storeu_si128((__m128i *)VD, vs); + for (i = 0; i < 8; i++) + fprintf(stderr, "VD[%u] = %d\n", i, VD[i]); #else uint32_t addend[N]; register int i; diff --git a/mupen64plus-rsp-paraLLEl/arch/x86_64/rsp/vmuln.h b/mupen64plus-rsp-paraLLEl/arch/x86_64/rsp/vmuln.h new file mode 100644 index 00000000..8bc00a01 --- /dev/null +++ b/mupen64plus-rsp-paraLLEl/arch/x86_64/rsp/vmuln.h @@ -0,0 +1,82 @@ +// +// arch/x86_64/rsp/vmuln.h +// +// This file is subject to the terms and conditions defined in +// 'LICENSE', which is part of this source code package. +// + +template +static inline __m128i rsp_vmadn_vmudn(__m128i vs, __m128i vt, + __m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) { + __m128i lo, hi, sign, overflow_mask; + + if (VMADN) + { + for (unsigned i = 0; i < 8; i++) + fprintf(stderr, "ACC LO[%u] = %d\n", i, reinterpret_cast(acc_lo)[i]); + for (unsigned i = 0; i < 8; i++) + fprintf(stderr, "ACC MD[%u] = %d\n", i, reinterpret_cast(acc_md)[i]); + for (unsigned i = 0; i < 8; i++) + fprintf(stderr, "ACC HI[%u] = %d\n", i, reinterpret_cast(acc_hi)[i]); + for (unsigned i = 0; i < 8; i++) + fprintf(stderr, "VS[%u] = %d\n", i, reinterpret_cast(&vs)[i]); + for (unsigned i = 0; i < 8; i++) + fprintf(stderr, "VT[%u] = %d\n", i, reinterpret_cast(&vt)[i]); + } + + lo = _mm_mullo_epi16(vs, vt); + hi = _mm_mulhi_epu16(vs, vt); + + // What we really want to do is unsigned vs * signed vt. + // However, we have no instruction that does this. + // + // There's a trick to "fix" an unsigned product, though: + // If vt was negative, take the upper 16-bits of the product + // and subtract vs. + sign = _mm_srai_epi16(vt, 15); + vs = _mm_and_si128(vs, sign); + hi = _mm_sub_epi16(hi, vs); + + // VMADN + if (VMADN) { + // Tricky part: start accumulating everything. + // Get/keep the carry as we'll add it in later. 
+ overflow_mask = _mm_adds_epu16(*acc_lo, lo); + *acc_lo = _mm_add_epi16(*acc_lo, lo); + + overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask); + overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero); + + // This is REALLY clever. Since the product results from + // two 16-bit components, one positive and one negative, + // we don't have to worry about carrying the 1 (we can + // only borrow) past 32-bits. So we can just add it here. + hi = _mm_sub_epi16(hi, overflow_mask); + + // Check for overflow of the upper sum. + overflow_mask = _mm_adds_epu16(*acc_md, hi); + *acc_md = _mm_add_epi16(*acc_md, hi); + + overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask); + overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero); + + // Finish up the accumulation of the... accumulator. + *acc_hi = _mm_add_epi16(*acc_hi, _mm_srai_epi16(hi, 15)); + *acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask); + //return rsp_uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero); + auto ret = rsp_uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero); + for (unsigned i = 0; i < 8; i++) + fprintf(stderr, "VD[%u] = %d\n", i, reinterpret_cast(&ret)[i]); + return ret; + } + + // VMUDN + else { + *acc_lo = lo; + *acc_md = hi; + *acc_hi = _mm_srai_epi16(hi, 15); + + return lo; + } +} + diff --git a/mupen64plus-rsp-paraLLEl/rsp.cpp b/mupen64plus-rsp-paraLLEl/rsp.cpp index 76efb8f7..1fe45639 100644 --- a/mupen64plus-rsp-paraLLEl/rsp.cpp +++ b/mupen64plus-rsp-paraLLEl/rsp.cpp @@ -18,7 +18,8 @@ void RSP_DEBUG(RSP::CPUState *rsp, const char *tag, unsigned pc, unsigned value) { uint64_t hash = hash_imem((const uint8_t*)rsp->cp2.regs, sizeof(rsp->cp2.regs)); fprintf(stderr, "%s (PC: %u): %u, %llu\n", tag, pc, value, hash); - fprintf(stderr, " DMEM HASH: 0x%016llx\n", hash_imem((const uint8_t*)rsp->dmem, 0x1000)); + if (value) + fprintf(stderr, " DMEM HASH: 0x%016llx\n", hash_imem((const uint8_t*)rsp->dmem, 0x1000)); } #endif } @@ -413,6 +414,10 @@ Func CPU::jit_region(uint64_t hash, unsigned pc, unsigned count) 
DISASM("RSP_RESERVED v%u, v%u, v%u[%u]\n", vd, vs, vt, e); //fprintf(stderr, "Unimplemented COP2 op %u.\n", op); } + +#ifdef INTENSE_DEBUG + APPEND("RSP_DEBUG(STATE, \"CP2\", %u, 0);\n", op); +#endif } else { @@ -475,7 +480,9 @@ Func CPU::jit_region(uint64_t hash, unsigned pc, unsigned count) set_pc_indirect(rs); pipe_pending_return = true; DISASM("jr %s\n", NAME(rs)); +#ifdef INTENSE_DEBUG APPEND("RSP_DEBUG(STATE, \"JR\", pipe_branch_delay * 4, 0);\n"); +#endif break; case 015: // BREAK