From 3b5c71170cdc0c4ce924d8fcbd8f5b84c3f8908d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 1 Jun 2024 18:54:22 +0200 Subject: [PATCH] IRInterpreter: Various SIMD optimization. Move out the reverse-bits implementation --- Common/BitSet.h | 17 ++++++++++++++- Core/MIPS/IR/IRInterpreter.cpp | 40 ++++++++++++++-------------------- Core/MIPS/IR/IRInterpreter.h | 37 +++++++++++++++++-------------- 3 files changed, 53 insertions(+), 41 deletions(-) diff --git a/Common/BitSet.h b/Common/BitSet.h index 28c7597048..e76ddcce63 100644 --- a/Common/BitSet.h +++ b/Common/BitSet.h @@ -8,7 +8,21 @@ #include #include "CommonTypes.h" -// Helper functions: +// TODO: ARM has an intrinsic for the RBIT instruction in some compilers, __rbit. +inline u32 ReverseBits32(u32 v) { + // http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel + // swap odd and even bits + v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); + // swap consecutive pairs + v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); + // swap nibbles ... + v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); + // swap bytes + v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); + // swap 2-byte long pairs + v = (v >> 16) | (v << 16); + return v; +} #ifdef _WIN32 #include @@ -35,6 +49,7 @@ inline int LeastSignificantSetBit(u64 val) _BitScanForward64(&index, val); return (int)index; } + #endif #else inline int CountSetBits(u32 val) { return __builtin_popcount(val); } diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index a45d50455e..1ba4621f35 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -2,6 +2,7 @@ #include #include "ppsspp_config.h" +#include "Common/BitSet.h" #include "Common/BitScan.h" #include "Common/Common.h" #include "Common/Data/Convert/SmallDataConvert.h" @@ -88,25 +89,6 @@ u32 IRRunMemCheck(u32 pc, u32 addr) { return coreState != CORE_RUNNING ? 1 : 0; } -template -u32 RunValidateAddress(u32 pc, u32 addr, u32 isWrite) { - const auto toss = [&](MemoryExceptionType t) { - Core_MemoryException(addr, alignment, pc, t); - return coreState != CORE_RUNNING ? 1 : 0; - }; - - if (!Memory::IsValidRange(addr, alignment)) { - MemoryExceptionType t = isWrite == 1 ? MemoryExceptionType::WRITE_WORD : MemoryExceptionType::READ_WORD; - if constexpr (alignment > 4) - t = isWrite ? MemoryExceptionType::WRITE_BLOCK : MemoryExceptionType::READ_BLOCK; - return toss(t); - } - if constexpr (alignment > 1) - if ((addr & (alignment - 1)) != 0) - return toss(MemoryExceptionType::ALIGNMENT); - return 0; -} - // We cannot use NEON on ARM32 here until we make it a hard dependency. We can, however, on ARM64. u32 IRInterpret(MIPSState *mips, const IRInst *inst) { while (true) { @@ -344,6 +326,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) { { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_div_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2]))); +#elif PPSSPP_ARCH(ARM64_NEON) + vst1q_f32(&mips->f[inst->dest], vdivq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2]))); #else for (int i = 0; i < 4; i++) mips->f[inst->dest + i] = mips->f[inst->src1 + i] / mips->f[inst->src2 + i]; @@ -355,6 +339,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) { { #if defined(_M_SSE) _mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_set1_ps(mips->f[inst->src2]))); +#elif PPSSPP_ARCH(ARM_NEON) + vst1q_f32(&mips->f[inst->dest], vmulq_lane_f32(vld1q_f32(&mips->f[inst->src1]), vdup_n_f32(mips->f[inst->src2]), 0)); #else const float factor = mips->f[inst->src2]; for (int i = 0; i < 4; i++) @@ -410,6 +396,11 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) { src = _mm_unpacklo_epi8(src, _mm_setzero_si128()); src = _mm_unpacklo_epi16(src, _mm_setzero_si128()); _mm_store_si128((__m128i *)&mips->fi[inst->dest], _mm_slli_epi32(src, 24)); +#elif PPSSPP_ARCH(ARM_NEON) && 0 // Untested + const uint8x8_t value = (uint8x8_t)vdup_n_u32(mips->fi[inst->src1]); + const uint16x8_t value16 = vmovl_u8(value); + const uint32x4_t value32 = vshll_n_u16(vget_low_u16(value16), 24); + vst1q_u32(&mips->fi[inst->dest], value32); #else mips->fi[inst->dest] = (mips->fi[inst->src1] << 24); mips->fi[inst->dest + 1] = (mips->fi[inst->src1] << 16) & 0xFF000000; @@ -498,8 +489,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) { case IROp::FCmpVfpuBit: { - int op = inst->dest & 0xF; - int bit = inst->dest >> 4; + const int op = inst->dest & 0xF; + const int bit = inst->dest >> 4; int result = 0; switch (op) { case VC_EQ: result = mips->f[inst->src1] == mips->f[inst->src2]; break; @@ -531,8 +522,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) { case IROp::FCmpVfpuAggregate: { - u32 mask = inst->dest; - u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC]; + const u32 mask = inst->dest; + const u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC]; int anyBit = (cc & mask) ? 0x10 : 0x00; int allBit = (cc & mask) == mask ? 0x20 : 0x00; mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | anyBit | allBit; @@ -734,13 +725,14 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) { case IROp::BSwap16: { u32 x = mips->r[inst->src1]; + // Don't think we can beat this with intrinsics. mips->r[inst->dest] = ((x & 0xFF00FF00) >> 8) | ((x & 0x00FF00FF) << 8); break; } case IROp::BSwap32: { u32 x = mips->r[inst->src1]; - mips->r[inst->dest] = ((x & 0xFF000000) >> 24) | ((x & 0x00FF0000) >> 8) | ((x & 0x0000FF00) << 8) | ((x & 0x000000FF) << 24); + mips->r[inst->dest] = swap32(x); break; } diff --git a/Core/MIPS/IR/IRInterpreter.h b/Core/MIPS/IR/IRInterpreter.h index d381183586..6f02ec9482 100644 --- a/Core/MIPS/IR/IRInterpreter.h +++ b/Core/MIPS/IR/IRInterpreter.h @@ -1,26 +1,31 @@ #pragma once #include "Common/CommonTypes.h" +#include "Core/Core.h" +#include "Core/MemMap.h" class MIPSState; struct IRInst; -inline static u32 ReverseBits32(u32 v) { - // http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel - // swap odd and even bits - v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); - // swap consecutive pairs - v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); - // swap nibbles ... - v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); - // swap bytes - v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); - // swap 2-byte long pairs - v = ( v >> 16 ) | ( v << 16); - return v; -} - u32 IRRunBreakpoint(u32 pc); u32 IRRunMemCheck(u32 pc, u32 addr); - u32 IRInterpret(MIPSState *ms, const IRInst *inst); + +template +u32 RunValidateAddress(u32 pc, u32 addr, u32 isWrite) { + const auto toss = [&](MemoryExceptionType t) { + Core_MemoryException(addr, alignment, pc, t); + return coreState != CORE_RUNNING ? 1 : 0; + }; + + if (!Memory::IsValidRange(addr, alignment)) { + MemoryExceptionType t = isWrite == 1 ? MemoryExceptionType::WRITE_WORD : MemoryExceptionType::READ_WORD; + if constexpr (alignment > 4) + t = isWrite ? MemoryExceptionType::WRITE_BLOCK : MemoryExceptionType::READ_BLOCK; + return toss(t); + } + if constexpr (alignment > 1) + if ((addr & (alignment - 1)) != 0) + return toss(MemoryExceptionType::ALIGNMENT); + return 0; +}