ppsspp/Core/HLE/ReplaceTables.cpp

465 lines
14 KiB
C++

// Copyright (c) 2013- PPSSPP Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include <map>
#include "base/basictypes.h"
#include "base/logging.h"
#include "Core/MIPS/JitCommon/JitCommon.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/HLE/ReplaceTables.h"
#include "Core/HLE/FunctionWrappers.h"
#include "GPU/Math3D.h"
#if defined(_M_IX86) || defined(_M_X64)
#include <emmintrin.h>
#endif
// I think these have to be pretty accurate as these are libc replacements,
// but we can probably get away with approximating the VFPU vsin/vcos and vrot
// pretty roughly.
// Replacement for sinf(): takes the first float argument and hands its sine
// back to the guest as the float return value.
static int Replace_sinf() {
	const float angle = PARAMF(0);
	const float result = sinf(angle);
	RETURNF(result);
	// Cycle cost reported for the call; this is only a guess.
	return 80;
}
// Replacement for cosf(): takes the first float argument and hands its cosine
// back to the guest as the float return value.
static int Replace_cosf() {
	const float angle = PARAMF(0);
	const float result = cosf(angle);
	RETURNF(result);
	// Cycle cost reported for the call; this is only a guess.
	return 80;
}
// Replacement for tanf(): takes the first float argument and hands its tangent
// back to the guest as the float return value.
static int Replace_tanf() {
	const float angle = PARAMF(0);
	const float result = tanf(angle);
	RETURNF(result);
	// Cycle cost reported for the call; this is only a guess.
	return 80;
}
// Replacement for acosf(): takes the first float argument and hands its arc
// cosine back to the guest as the float return value.
static int Replace_acosf() {
	const float value = PARAMF(0);
	const float result = acosf(value);
	RETURNF(result);
	// Cycle cost reported for the call; this is only a guess.
	return 80;
}
// Replacement for asinf(): takes the first float argument and hands its arc
// sine back to the guest as the float return value.
static int Replace_asinf() {
	const float value = PARAMF(0);
	const float result = asinf(value);
	RETURNF(result);
	// Cycle cost reported for the call; this is only a guess.
	return 80;
}
// Replacement for atanf(): takes the first float argument and hands its arc
// tangent back to the guest as the float return value.
static int Replace_atanf() {
	const float value = PARAMF(0);
	const float result = atanf(value);
	RETURNF(result);
	// Cycle cost reported for the call; this is only a guess.
	return 80;
}
// Replacement for sqrtf(): takes the first float argument and hands its square
// root back to the guest as the float return value.
static int Replace_sqrtf() {
	const float value = PARAMF(0);
	const float result = sqrtf(value);
	RETURNF(result);
	// Cycle cost reported for the call; this is only a guess.
	return 80;
}
// Replacement for atan2f(): the first float argument is passed as y and the
// second as x, and the resulting angle becomes the guest's float return value.
static int Replace_atan2f() {
	const float y = PARAMF(0);
	const float x = PARAMF(1);
	const float result = atan2f(y, x);
	RETURNF(result);
	// Cycle cost reported for the call; this is only a guess.
	return 120;
}
// Replacement for floorf(): rounds the first float argument down and hands the
// result back to the guest as the float return value.
static int Replace_floorf() {
	const float value = PARAMF(0);
	const float result = floorf(value);
	RETURNF(result);
	// Cycle cost reported for the call; this is only a guess.
	return 30;
}
// Replacement for ceilf(): rounds the first float argument up and hands the
// result back to the guest as the float return value.
static int Replace_ceilf() {
	const float value = PARAMF(0);
	const float result = ceilf(value);
	RETURNF(result);
	// Cycle cost reported for the call; this is only a guess.
	return 30;
}
// Should probably do JIT versions of this, possibly ones that only delegate
// large copies to a C function.
static int Replace_memcpy() {
u32 destPtr = PARAM(0);
u32 srcPtr = PARAM(1);
u32 bytes = PARAM(2);
if (bytes != 0) {
u8 *dst = Memory::GetPointerUnchecked(destPtr);
u8 *src = Memory::GetPointerUnchecked(srcPtr);
memmove(dst, src, bytes);
}
RETURN(destPtr);
return 10 + bytes / 4; // approximation
}
static int Replace_memcpy16() {
u32 destPtr = PARAM(0);
u32 srcPtr = PARAM(1);
u32 bytes = PARAM(2) * 16;
if (bytes != 0) {
u8 *dst = Memory::GetPointerUnchecked(destPtr);
u8 *src = Memory::GetPointerUnchecked(srcPtr);
memmove(dst, src, bytes);
}
RETURN(destPtr);
return 10 + bytes / 4; // approximation
}
static int Replace_memmove() {
u32 destPtr = PARAM(0);
u32 srcPtr = PARAM(1);
u32 bytes = PARAM(2);
if (bytes != 0) {
u8 *dst = Memory::GetPointerUnchecked(destPtr);
u8 *src = Memory::GetPointerUnchecked(srcPtr);
memmove(dst, src, bytes);
}
RETURN(destPtr);
return 10 + bytes / 4; // approximation
}
static int Replace_memset() {
u32 destPtr = PARAM(0);
u8 *dst = Memory::GetPointerUnchecked(destPtr);
u8 value = PARAM(1);
u32 bytes = PARAM(2);
memset(dst, value, bytes);
RETURN(destPtr);
return 10 + bytes / 4; // approximation
}
static int Replace_strlen() {
u32 srcPtr = PARAM(0);
const char *src = (const char *)Memory::GetPointerUnchecked(srcPtr);
u32 len = (u32)strlen(src);
RETURN(len);
return 4 + len; // approximation
}
static int Replace_strcpy() {
u32 destPtr = PARAM(0);
char *dst = (char *)Memory::GetPointerUnchecked(destPtr);
const char *src = (const char *)Memory::GetPointerUnchecked(PARAM(1));
strcpy(dst, src);
RETURN(destPtr);
return 10; // approximation
}
static int Replace_strncpy() {
u32 destPtr = PARAM(0);
char *dst = (char *)Memory::GetPointerUnchecked(destPtr);
const char *src = (const char *)Memory::GetPointerUnchecked(PARAM(1));
u32 bytes = PARAM(2);
strncpy(dst, src, bytes);
RETURN(destPtr);
return 10; // approximation
}
// Replacement for strcmp(a, b): the host strcmp() result becomes the guest's
// return value.
static int Replace_strcmp() {
	const char *lhs = reinterpret_cast<const char *>(Memory::GetPointerUnchecked(PARAM(0)));
	const char *rhs = reinterpret_cast<const char *>(Memory::GetPointerUnchecked(PARAM(1)));
	const int order = strcmp(lhs, rhs);
	RETURN(order);
	// Approximate, fixed cycle cost.
	return 10;
}
static int Replace_strncmp() {
const char *a = (const char *)Memory::GetPointerUnchecked(PARAM(0));
const char *b = (const char *)Memory::GetPointerUnchecked(PARAM(1));
u32 bytes = PARAM(2);
RETURN(strncmp(a, b, bytes));
return 10 + bytes / 4; // approximation
}
// Replacement for a guest matrix multiply routine.
//   a0 = output matrix address, a1 = first matrix, a2 = second matrix
// Note that the operands are handed to Matrix4ByMatrix4 in swapped order (a2, then a1).
static int Replace_vmmul_q_transp() {
	float *result = reinterpret_cast<float *>(Memory::GetPointerUnchecked(PARAM(0)));
	const float *first = reinterpret_cast<const float *>(Memory::GetPointerUnchecked(PARAM(1)));
	const float *second = reinterpret_cast<const float *>(Memory::GetPointerUnchecked(PARAM(2)));

	// TODO: Actually use an optimized matrix multiply here...
	Matrix4ByMatrix4(result, second, first);
	return 16;
}
// Replacement for a guest routine that appends a matrix to a display list
// (presumably from the GTA games, going by the name - TODO confirm).
// a0 = pointer to a word holding the destination address; that word is advanced by 0x30 on exit
// a1 = matrix command value; shifted into the top 8 bits of every word written
// a2 = source address of the matrix data, read as 32-bit words
// Writes 12 words: source words 0-2, 4-6, 8-10 and 12-14, each shifted right by
// 8 bits and OR'ed with the command byte. Every fourth source word is skipped.
static int Replace_gta_dl_write_matrix() {
u32 *ptr = (u32 *)Memory::GetPointerUnchecked(PARAM(0));
// ptr[0] is itself a guest address: the current write position.
u32 *dest = (u32_le *)Memory::GetPointerUnchecked(ptr[0]);
u32 *src = (u32_le *)Memory::GetPointerUnchecked(PARAM(2));
// Command byte goes in bits 24-31; the low 24 bits carry the shifted source data.
u32 matrix = PARAM(1) << 24;
#if defined(_M_IX86) || defined(_M_X64)
__m128i topBytes = _mm_set1_epi32(matrix);
// Load all 16 source words, four at a time.
__m128i m0 = _mm_loadu_si128((const __m128i *)src);
__m128i m1 = _mm_loadu_si128((const __m128i *)(src + 4));
__m128i m2 = _mm_loadu_si128((const __m128i *)(src + 8));
__m128i m3 = _mm_loadu_si128((const __m128i *)(src + 12));
// Drop the low 8 bits of each word and put the command byte on top.
m0 = _mm_or_si128(_mm_srli_epi32(m0, 8), topBytes);
m1 = _mm_or_si128(_mm_srli_epi32(m1, 8), topBytes);
m2 = _mm_or_si128(_mm_srli_epi32(m2, 8), topBytes);
m3 = _mm_or_si128(_mm_srli_epi32(m3, 8), topBytes);
// These three stores overlap by a word, due to the offsets.
// Each store is placed 3 words after the previous one, so the unwanted fourth
// word of a group is overwritten by the first word of the next group.
// The order of these stores therefore matters.
_mm_storeu_si128((__m128i *)dest, m0);
_mm_storeu_si128((__m128i *)(dest + 3), m1);
_mm_storeu_si128((__m128i *)(dest + 6), m2);
// Store the last one in parts to not overwrite forwards (probably mostly risk free though)
// 8 bytes to dest[9..10], then 4 bytes to dest[11]; nothing past dest[11] is touched.
_mm_storel_epi64((__m128i *)(dest + 9), m3);
m3 = _mm_srli_si128(m3, 8);
_mm_store_ss((float *)(dest + 11), _mm_castsi128_ps(m3));
#else
// Bit tricky to SIMD (note the offsets) but should be doable if not perfect
// Scalar version of the same transform; source indices 3, 7, 11 and 15 are skipped.
dest[0] = matrix | (src[0] >> 8);
dest[1] = matrix | (src[1] >> 8);
dest[2] = matrix | (src[2] >> 8);
dest[3] = matrix | (src[4] >> 8);
dest[4] = matrix | (src[5] >> 8);
dest[5] = matrix | (src[6] >> 8);
dest[6] = matrix | (src[8] >> 8);
dest[7] = matrix | (src[9] >> 8);
dest[8] = matrix | (src[10] >> 8);
dest[9] = matrix | (src[12] >> 8);
dest[10] = matrix | (src[13] >> 8);
dest[11] = matrix | (src[14] >> 8);
#endif
// Advance the stored write position past the 12 words (0x30 bytes) just written.
(*ptr) += 0x30;
// The guest sees a return value of 0.
RETURN(0);
// Presumably the estimated cycle cost, like the other replacements in this file.
return 38;
}
// Replacement for a guest routine that appends one matrix to a display list.
//   a0 = pointer to a display list struct; its third word (dlStruct[2]) holds
//        the guest address of the current write position
//   a1 = matrix selector, expected to be 0..3 (see the switch below)
//   a2 = guest address of the source matrix, read as 32-bit words
// Writes one header word followed by `count` data words. Each data word is a
// source word shifted right by 8 bits with a command byte in the top 8 bits.
// dlStruct[2] is then advanced past everything written, and its new value is
// returned to the guest.
//
// TODO: Inline into a few NEON or SSE instructions - especially if a1 is a known immediate!
// Anyway, not sure if worth it. There's not that many matrices written per frame normally.
static int Replace_dl_write_matrix() {
	u32 *dlStruct = (u32 *)Memory::GetPointerUnchecked(PARAM(0));
	u32 *dest = (u32 *)Memory::GetPointerUnchecked(dlStruct[2]);
	u32 *src = (u32 *)Memory::GetPointerUnchecked(PARAM(2));

	// Start from a defined value: a selector outside 0..3 matches no case below,
	// and the words written must not depend on an uninitialized variable.
	u32 matrix = 0;
	int count = 12;
	switch (PARAM(1)) {
	case 3:
		matrix = 0x40000000;  // tex mtx
		break;
	case 2:
		matrix = 0x3A000000;  // presumably the world matrix command - TODO confirm
		break;
	case 1:
		matrix = 0x3C000000;  // presumably the view matrix command - TODO confirm
		break;
	case 0:
		matrix = 0x3E000000;  // presumably the projection matrix command - TODO confirm
		// The only selector that writes all 16 source words.
		count = 16;
		break;
	default:
		// Unexpected selector: keep matrix == 0 and the 12-word layout.
		break;
	}

	// Header word first, then the data words use the next command number.
	*dest++ = matrix;
	matrix += 0x01000000;

	if (count == 16) {
		// Ultra SIMD friendly! These intrinsics generate pretty much perfect code,
		// no point in hand rolling.
#if defined(_M_IX86) || defined(_M_X64)
		__m128i topBytes = _mm_set1_epi32(matrix);
		__m128i m0 = _mm_loadu_si128((const __m128i *)src);
		__m128i m1 = _mm_loadu_si128((const __m128i *)(src + 4));
		__m128i m2 = _mm_loadu_si128((const __m128i *)(src + 8));
		__m128i m3 = _mm_loadu_si128((const __m128i *)(src + 12));
		m0 = _mm_or_si128(_mm_srli_epi32(m0, 8), topBytes);
		m1 = _mm_or_si128(_mm_srli_epi32(m1, 8), topBytes);
		m2 = _mm_or_si128(_mm_srli_epi32(m2, 8), topBytes);
		m3 = _mm_or_si128(_mm_srli_epi32(m3, 8), topBytes);
		_mm_storeu_si128((__m128i *)dest, m0);
		_mm_storeu_si128((__m128i *)(dest + 4), m1);
		_mm_storeu_si128((__m128i *)(dest + 8), m2);
		_mm_storeu_si128((__m128i *)(dest + 12), m3);
#else
#if 0
		//TODO: Finish NEON, make conditional somehow
		uint32x4_t topBytes = vdupq_n_u32(matrix);
		uint32x4_t m0 = vld1q_u32(dataPtr);
		uint32x4_t m1 = vld1q_u32(dataPtr + 4);
		uint32x4_t m2 = vld1q_u32(dataPtr + 8);
		uint32x4_t m3 = vld1q_u32(dataPtr + 12);
		m0 = vorr_u32(vsri_n_u32(m0, 8), topBytes);  // TODO: look into VSRI
		m1 = vorr_u32(vshr_n_u32(m1, 8), topBytes);
		m2 = vorr_u32(vshr_n_u32(m2, 8), topBytes);
		m3 = vorr_u32(vshr_n_u32(m3, 8), topBytes);
		vst1q_u32(dlPtr, m0);
		vst1q_u32(dlPtr + 4, m1);
		vst1q_u32(dlPtr + 8, m2);
		vst1q_u32(dlPtr + 12, m3);
#endif
		for (int i = 0; i < count; i++) {
			dest[i] = matrix | (src[i] >> 8);
		}
#endif
	} else {
		// 12-word layout: source words 3, 7, 11 and 15 are left out.
#if defined(_M_IX86) || defined(_M_X64)
		__m128i topBytes = _mm_set1_epi32(matrix);
		__m128i m0 = _mm_loadu_si128((const __m128i *)src);
		__m128i m1 = _mm_loadu_si128((const __m128i *)(src + 4));
		__m128i m2 = _mm_loadu_si128((const __m128i *)(src + 8));
		__m128i m3 = _mm_loadu_si128((const __m128i *)(src + 12));
		m0 = _mm_or_si128(_mm_srli_epi32(m0, 8), topBytes);
		m1 = _mm_or_si128(_mm_srli_epi32(m1, 8), topBytes);
		m2 = _mm_or_si128(_mm_srli_epi32(m2, 8), topBytes);
		m3 = _mm_or_si128(_mm_srli_epi32(m3, 8), topBytes);
		// These three stores overlap by a word, due to the offsets.
		// The unwanted fourth word of each group is overwritten by the next store,
		// so the order of the stores matters.
		_mm_storeu_si128((__m128i *)dest, m0);
		_mm_storeu_si128((__m128i *)(dest + 3), m1);
		_mm_storeu_si128((__m128i *)(dest + 6), m2);
		// Store the last one in parts to not overwrite forwards (probably mostly risk free though)
		_mm_storel_epi64((__m128i *)(dest + 9), m3);
		m3 = _mm_srli_si128(m3, 8);
		_mm_store_ss((float *)(dest + 11), _mm_castsi128_ps(m3));
#else
		// Bit tricky to SIMD (note the offsets) but should be doable if not perfect
		dest[0] = matrix | (src[0] >> 8);
		dest[1] = matrix | (src[1] >> 8);
		dest[2] = matrix | (src[2] >> 8);
		dest[3] = matrix | (src[4] >> 8);
		dest[4] = matrix | (src[5] >> 8);
		dest[5] = matrix | (src[6] >> 8);
		dest[6] = matrix | (src[8] >> 8);
		dest[7] = matrix | (src[9] >> 8);
		dest[8] = matrix | (src[10] >> 8);
		dest[9] = matrix | (src[12] >> 8);
		dest[10] = matrix | (src[13] >> 8);
		dest[11] = matrix | (src[14] >> 8);
#endif
	}

	// Advance the write position past the header word plus the data words.
	dlStruct[2] += (1 + count) * 4;
	RETURN(dlStruct[2]);
	// Presumably the estimated cycle cost, like the other replacements in this file.
	return 60;
}
// Table of guest functions that can be replaced, looked up by name.
// Can either replace with C functions or functions emitted in Asm/ArmAsm.
// The four fields of each entry appear to be: function name, C replacement,
// JIT replacement and flags (TODO confirm against ReplacementTableEntry).
// An entry's position in this array is the index that GetReplacementFuncIndex()
// returns and that WriteReplaceInstruction() encodes into the patched instruction.
static const ReplacementTableEntry entries[] = {
// TODO: I think some games can be helped quite a bit by implementing the
// double-precision soft-float routines: __adddf3, __subdf3 and so on. These
// should of course be implemented JIT style, inline.
{ "sinf", &Replace_sinf, 0, 0},
{ "cosf", &Replace_cosf, 0, 0},
{ "tanf", &Replace_tanf, 0, 0},
/* These two collide (same hash) and thus can't be replaced :/
{ "asinf", &Replace_asinf, 0, 0},
{ "acosf", &Replace_acosf, 0, 0},
*/
{ "atanf", &Replace_atanf, 0, 0},
{ "sqrtf", &Replace_sqrtf, 0, 0},
{ "atan2f", &Replace_atan2f, 0, 0},
{ "floorf", &Replace_floorf, 0, 0},
{ "ceilf", &Replace_ceilf, 0, 0},
{ "memcpy", &Replace_memcpy, 0, 0},
{ "memcpy16", &Replace_memcpy16, 0, 0},
{ "memmove", &Replace_memmove, 0, 0},
{ "memset", &Replace_memset, 0, 0},
{ "strlen", &Replace_strlen, 0, 0},
{ "strcpy", &Replace_strcpy, 0, 0},
{ "strncpy", &Replace_strncpy, 0, 0},
{ "strcmp", &Replace_strcmp, 0, 0},
{ "strncmp", &Replace_strncmp, 0, 0},
// The only entry with no C replacement: it supplies a JIT function and a flag instead.
{ "fabsf", 0, &MIPSComp::Jit::Replace_fabsf, REPFLAG_ALLOWINLINE},
{ "dl_write_matrix", &Replace_dl_write_matrix, 0, 0}, // &MIPSComp::Jit::Replace_dl_write_matrix, 0},
// Shares the same replacement function as dl_write_matrix.
{ "dl_write_matrix_2", &Replace_dl_write_matrix, 0, 0},
{ "gta_dl_write_matrix", &Replace_gta_dl_write_matrix, 0, 0},
// dl_write_matrix_3 doesn't take the dl as a parameter, it accesses a global instead. Need to extract the address of the global from the code when replacing...
// Haven't investigated write_matrix_4 and 5 but I think they are similar to 1 and 2.
// { "vmmul_q_transp", &Replace_vmmul_q_transp, 0, 0},
// Empty final entry. It has no name, so GetReplacementFuncIndex() skips it,
// but it is still counted by GetNumReplacementFuncs().
{}
};
// Original instruction words overwritten by WriteReplaceInstruction(), keyed by
// the address they were read from. GetReplacedOpAt() reads them back, and
// Replacement_Shutdown() empties the map.
static std::map<u32, u32> replacedInstructions;
// Initialization hook for the replacement system. Currently a no-op.
void Replacement_Init() {
}
// Forgets every saved original instruction. Guest memory itself is not touched here.
void Replacement_Shutdown() {
replacedInstructions.clear();
}
// TODO: Do something on load state?
// Returns the number of slots in the replacement table. The count covers the
// whole array, including the empty final entry.
int GetNumReplacementFuncs() {
	const int slotCount = static_cast<int>(ARRAY_SIZE(entries));
	return slotCount;
}
// Finds the replacement table index for a function identified by hash and size.
// The hash is first resolved to a name via MIPSAnalyst::LookupHash(), then the
// table is searched for an entry with exactly that name.
// Returns the index of the first match, or -1 if the hash is unknown or no
// entry carries that name.
int GetReplacementFuncIndex(u64 hash, int funcSize) {
	const char *funcName = MIPSAnalyst::LookupHash(hash, funcSize);
	if (funcName == nullptr)
		return -1;

	// TODO: Build a lookup and keep it around
	for (int idx = 0; idx < ARRAY_SIZE(entries); ++idx) {
		const char *candidate = entries[idx].name;
		// Entries without a name (such as the empty final one) can never match.
		if (candidate != nullptr && strcmp(funcName, candidate) == 0)
			return idx;
	}
	return -1;
}
// Returns a pointer to table entry i. The index is not range-checked, so the
// caller must pass a value inside the table.
const ReplacementTableEntry *GetReplacementFunc(int i) {
	const ReplacementTableEntry *entry = entries + i;
	return entry;
}
// Patches the instruction at `address` with a "call replacement" pseudo
// instruction if the function identified by hash/size has a table entry.
// The overwritten instruction word is saved in replacedInstructions first.
// Does nothing when no entry matches, or when the address already holds a
// replacement or run-block pseudo instruction.
void WriteReplaceInstruction(u32 address, u64 hash, int size) {
	const int index = GetReplacementFuncIndex(hash, size);
	if (index < 0)
		return;

	const u32 prevInstr = Memory::Read_U32(address);
	if (MIPS_IS_REPLACEMENT(prevInstr)) {
		// Already replaced; saving again would record the pseudo instruction.
		return;
	}
	if (MIPS_IS_RUNBLOCK(prevInstr)) {
		// Likely already both replaced and jitted. Ignore.
		return;
	}

	replacedInstructions[address] = prevInstr;
	// u64 is not guaranteed to be unsigned long long, so cast explicitly to
	// match the %llx conversion.
	INFO_LOG(HLE, "Replaced %s at %08x with hash %016llx", entries[index].name, address, (unsigned long long)hash);
	// The table index is carried in the low bits of the pseudo instruction.
	Memory::Write_U32(MIPS_EMUHACK_CALL_REPLACEMENT | index, address);
}
// Retrieves the original instruction that was overwritten at `address`.
// Returns true and stores the saved word in *op only when the address currently
// holds a replacement pseudo instruction and a saved original exists for it.
// Otherwise returns false and leaves *op untouched.
bool GetReplacedOpAt(u32 address, u32 *op) {
	const u32 current = Memory::Read_Opcode_JIT(address).encoding;
	if (MIPS_IS_REPLACEMENT(current)) {
		const auto saved = replacedInstructions.find(address);
		if (saved != replacedInstructions.end()) {
			*op = saved->second;
			return true;
		}
	}
	return false;
}