mirror of
https://github.com/libretro/ppsspp.git
synced 2025-01-10 18:40:42 +00:00
465 lines
14 KiB
C++
465 lines
14 KiB
C++
// Copyright (c) 2013- PPSSPP Project.
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation, version 2.0 or later versions.
|
|
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License 2.0 for more details.
|
|
|
|
// A copy of the GPL 2.0 should have been included with the program.
|
|
// If not, see http://www.gnu.org/licenses/
|
|
|
|
// Official git repository and contact information can be found at
|
|
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
|
|
|
|
#include <map>
|
|
|
|
#include "base/basictypes.h"
|
|
#include "base/logging.h"
|
|
#include "Core/MIPS/JitCommon/JitCommon.h"
|
|
#include "Core/MIPS/MIPSAnalyst.h"
|
|
#include "Core/HLE/ReplaceTables.h"
|
|
#include "Core/HLE/FunctionWrappers.h"
|
|
|
|
#include "GPU/Math3D.h"
|
|
|
|
#if defined(_M_IX86) || defined(_M_X64)
|
|
#include <emmintrin.h>
|
|
#endif
|
|
|
|
// I think these have to be pretty accurate as these are libc replacements,
|
|
// but we can probably get away with approximating the VFPU vsin/vcos and vrot
|
|
// pretty roughly.
|
|
static int Replace_sinf() {
|
|
float f = PARAMF(0);
|
|
RETURNF(sinf(f));
|
|
return 80; // guess number of cycles
|
|
}
|
|
|
|
static int Replace_cosf() {
|
|
float f = PARAMF(0);
|
|
RETURNF(cosf(f));
|
|
return 80; // guess number of cycles
|
|
}
|
|
|
|
static int Replace_tanf() {
|
|
float f = PARAMF(0);
|
|
RETURNF(tanf(f));
|
|
return 80; // guess number of cycles
|
|
}
|
|
|
|
static int Replace_acosf() {
|
|
float f = PARAMF(0);
|
|
RETURNF(acosf(f));
|
|
return 80; // guess number of cycles
|
|
}
|
|
|
|
static int Replace_asinf() {
|
|
float f = PARAMF(0);
|
|
RETURNF(asinf(f));
|
|
return 80; // guess number of cycles
|
|
}
|
|
|
|
static int Replace_atanf() {
|
|
float f = PARAMF(0);
|
|
RETURNF(atanf(f));
|
|
return 80; // guess number of cycles
|
|
}
|
|
|
|
static int Replace_sqrtf() {
|
|
float f = PARAMF(0);
|
|
RETURNF(sqrtf(f));
|
|
return 80; // guess number of cycles
|
|
}
|
|
|
|
static int Replace_atan2f() {
|
|
float f1 = PARAMF(0);
|
|
float f2 = PARAMF(1);
|
|
RETURNF(atan2f(f1, f2));
|
|
return 120; // guess number of cycles
|
|
}
|
|
|
|
static int Replace_floorf() {
|
|
float f1 = PARAMF(0);
|
|
RETURNF(floorf(f1));
|
|
return 30; // guess number of cycles
|
|
}
|
|
|
|
static int Replace_ceilf() {
|
|
float f1 = PARAMF(0);
|
|
RETURNF(ceilf(f1));
|
|
return 30; // guess number of cycles
|
|
}
|
|
|
|
// Should probably do JIT versions of this, possibly ones that only delegate
|
|
// large copies to a C function.
|
|
static int Replace_memcpy() {
|
|
u32 destPtr = PARAM(0);
|
|
u32 srcPtr = PARAM(1);
|
|
u32 bytes = PARAM(2);
|
|
if (bytes != 0) {
|
|
u8 *dst = Memory::GetPointerUnchecked(destPtr);
|
|
u8 *src = Memory::GetPointerUnchecked(srcPtr);
|
|
memmove(dst, src, bytes);
|
|
}
|
|
RETURN(destPtr);
|
|
return 10 + bytes / 4; // approximation
|
|
}
|
|
|
|
static int Replace_memcpy16() {
|
|
u32 destPtr = PARAM(0);
|
|
u32 srcPtr = PARAM(1);
|
|
u32 bytes = PARAM(2) * 16;
|
|
if (bytes != 0) {
|
|
u8 *dst = Memory::GetPointerUnchecked(destPtr);
|
|
u8 *src = Memory::GetPointerUnchecked(srcPtr);
|
|
memmove(dst, src, bytes);
|
|
}
|
|
RETURN(destPtr);
|
|
return 10 + bytes / 4; // approximation
|
|
}
|
|
|
|
static int Replace_memmove() {
|
|
u32 destPtr = PARAM(0);
|
|
u32 srcPtr = PARAM(1);
|
|
u32 bytes = PARAM(2);
|
|
if (bytes != 0) {
|
|
u8 *dst = Memory::GetPointerUnchecked(destPtr);
|
|
u8 *src = Memory::GetPointerUnchecked(srcPtr);
|
|
memmove(dst, src, bytes);
|
|
}
|
|
RETURN(destPtr);
|
|
return 10 + bytes / 4; // approximation
|
|
}
|
|
|
|
static int Replace_memset() {
|
|
u32 destPtr = PARAM(0);
|
|
u8 *dst = Memory::GetPointerUnchecked(destPtr);
|
|
u8 value = PARAM(1);
|
|
u32 bytes = PARAM(2);
|
|
memset(dst, value, bytes);
|
|
RETURN(destPtr);
|
|
return 10 + bytes / 4; // approximation
|
|
}
|
|
|
|
static int Replace_strlen() {
|
|
u32 srcPtr = PARAM(0);
|
|
const char *src = (const char *)Memory::GetPointerUnchecked(srcPtr);
|
|
u32 len = (u32)strlen(src);
|
|
RETURN(len);
|
|
return 4 + len; // approximation
|
|
}
|
|
|
|
static int Replace_strcpy() {
|
|
u32 destPtr = PARAM(0);
|
|
char *dst = (char *)Memory::GetPointerUnchecked(destPtr);
|
|
const char *src = (const char *)Memory::GetPointerUnchecked(PARAM(1));
|
|
strcpy(dst, src);
|
|
RETURN(destPtr);
|
|
return 10; // approximation
|
|
}
|
|
|
|
static int Replace_strncpy() {
|
|
u32 destPtr = PARAM(0);
|
|
char *dst = (char *)Memory::GetPointerUnchecked(destPtr);
|
|
const char *src = (const char *)Memory::GetPointerUnchecked(PARAM(1));
|
|
u32 bytes = PARAM(2);
|
|
strncpy(dst, src, bytes);
|
|
RETURN(destPtr);
|
|
return 10; // approximation
|
|
}
|
|
|
|
static int Replace_strcmp() {
|
|
const char *a = (const char *)Memory::GetPointerUnchecked(PARAM(0));
|
|
const char *b = (const char *)Memory::GetPointerUnchecked(PARAM(1));
|
|
RETURN(strcmp(a, b));
|
|
return 10; // approximation
|
|
}
|
|
|
|
static int Replace_strncmp() {
|
|
const char *a = (const char *)Memory::GetPointerUnchecked(PARAM(0));
|
|
const char *b = (const char *)Memory::GetPointerUnchecked(PARAM(1));
|
|
u32 bytes = PARAM(2);
|
|
RETURN(strncmp(a, b, bytes));
|
|
return 10 + bytes / 4; // approximation
|
|
}
|
|
|
|
static int Replace_vmmul_q_transp() {
|
|
float *out = (float *)Memory::GetPointerUnchecked(PARAM(0));
|
|
const float *a = (const float *)Memory::GetPointerUnchecked(PARAM(1));
|
|
const float *b = (const float *)Memory::GetPointerUnchecked(PARAM(2));
|
|
|
|
// TODO: Actually use an optimized matrix multiply here...
|
|
Matrix4ByMatrix4(out, b, a);
|
|
return 16;
|
|
}
|
|
|
|
// a0 = pointer to destination address
|
|
// a1 = matrix
|
|
// a2 = source address
|
|
static int Replace_gta_dl_write_matrix() {
|
|
u32 *ptr = (u32 *)Memory::GetPointerUnchecked(PARAM(0));
|
|
u32 *dest = (u32_le *)Memory::GetPointerUnchecked(ptr[0]);
|
|
u32 *src = (u32_le *)Memory::GetPointerUnchecked(PARAM(2));
|
|
u32 matrix = PARAM(1) << 24;
|
|
|
|
#if defined(_M_IX86) || defined(_M_X64)
|
|
__m128i topBytes = _mm_set1_epi32(matrix);
|
|
__m128i m0 = _mm_loadu_si128((const __m128i *)src);
|
|
__m128i m1 = _mm_loadu_si128((const __m128i *)(src + 4));
|
|
__m128i m2 = _mm_loadu_si128((const __m128i *)(src + 8));
|
|
__m128i m3 = _mm_loadu_si128((const __m128i *)(src + 12));
|
|
m0 = _mm_or_si128(_mm_srli_epi32(m0, 8), topBytes);
|
|
m1 = _mm_or_si128(_mm_srli_epi32(m1, 8), topBytes);
|
|
m2 = _mm_or_si128(_mm_srli_epi32(m2, 8), topBytes);
|
|
m3 = _mm_or_si128(_mm_srli_epi32(m3, 8), topBytes);
|
|
// These three stores overlap by a word, due to the offsets.
|
|
_mm_storeu_si128((__m128i *)dest, m0);
|
|
_mm_storeu_si128((__m128i *)(dest + 3), m1);
|
|
_mm_storeu_si128((__m128i *)(dest + 6), m2);
|
|
// Store the last one in parts to not overwrite forwards (probably mostly risk free though)
|
|
_mm_storel_epi64((__m128i *)(dest + 9), m3);
|
|
m3 = _mm_srli_si128(m3, 8);
|
|
_mm_store_ss((float *)(dest + 11), _mm_castsi128_ps(m3));
|
|
#else
|
|
// Bit tricky to SIMD (note the offsets) but should be doable if not perfect
|
|
dest[0] = matrix | (src[0] >> 8);
|
|
dest[1] = matrix | (src[1] >> 8);
|
|
dest[2] = matrix | (src[2] >> 8);
|
|
dest[3] = matrix | (src[4] >> 8);
|
|
dest[4] = matrix | (src[5] >> 8);
|
|
dest[5] = matrix | (src[6] >> 8);
|
|
dest[6] = matrix | (src[8] >> 8);
|
|
dest[7] = matrix | (src[9] >> 8);
|
|
dest[8] = matrix | (src[10] >> 8);
|
|
dest[9] = matrix | (src[12] >> 8);
|
|
dest[10] = matrix | (src[13] >> 8);
|
|
dest[11] = matrix | (src[14] >> 8);
|
|
#endif
|
|
|
|
(*ptr) += 0x30;
|
|
RETURN(0);
|
|
return 38;
|
|
}
|
|
|
|
|
|
// TODO: Inline into a few NEON or SSE instructions - especially if a1 is a known immediate!
|
|
// Anyway, not sure if worth it. There's not that many matrices written per frame normally.
|
|
static int Replace_dl_write_matrix() {
|
|
u32 *dlStruct = (u32 *)Memory::GetPointerUnchecked(PARAM(0));
|
|
u32 *dest = (u32 *)Memory::GetPointerUnchecked(dlStruct[2]);
|
|
u32 *src = (u32 *)Memory::GetPointerUnchecked(PARAM(2));
|
|
|
|
u32 matrix;
|
|
int count = 12;
|
|
switch (PARAM(1)) {
|
|
case 3:
|
|
matrix = 0x40000000; // tex mtx
|
|
break;
|
|
case 2:
|
|
matrix = 0x3A000000;
|
|
break;
|
|
case 1:
|
|
matrix = 0x3C000000;
|
|
break;
|
|
case 0:
|
|
matrix = 0x3E000000;
|
|
count = 16;
|
|
break;
|
|
}
|
|
|
|
*dest++ = matrix;
|
|
matrix += 0x01000000;
|
|
|
|
if (count == 16) {
|
|
// Ultra SIMD friendly! These intrinsics generate pretty much perfect code,
|
|
// no point in hand rolling.
|
|
#if defined(_M_IX86) || defined(_M_X64)
|
|
__m128i topBytes = _mm_set1_epi32(matrix);
|
|
__m128i m0 = _mm_loadu_si128((const __m128i *)src);
|
|
__m128i m1 = _mm_loadu_si128((const __m128i *)(src + 4));
|
|
__m128i m2 = _mm_loadu_si128((const __m128i *)(src + 8));
|
|
__m128i m3 = _mm_loadu_si128((const __m128i *)(src + 12));
|
|
m0 = _mm_or_si128(_mm_srli_epi32(m0, 8), topBytes);
|
|
m1 = _mm_or_si128(_mm_srli_epi32(m1, 8), topBytes);
|
|
m2 = _mm_or_si128(_mm_srli_epi32(m2, 8), topBytes);
|
|
m3 = _mm_or_si128(_mm_srli_epi32(m3, 8), topBytes);
|
|
_mm_storeu_si128((__m128i *)dest, m0);
|
|
_mm_storeu_si128((__m128i *)(dest + 4), m1);
|
|
_mm_storeu_si128((__m128i *)(dest + 8), m2);
|
|
_mm_storeu_si128((__m128i *)(dest + 12), m3);
|
|
#else
|
|
#if 0
|
|
//TODO: Finish NEON, make conditional somehow
|
|
uint32x4_t topBytes = vdupq_n_u32(matrix);
|
|
uint32x4_t m0 = vld1q_u32(dataPtr);
|
|
uint32x4_t m1 = vld1q_u32(dataPtr + 4);
|
|
uint32x4_t m2 = vld1q_u32(dataPtr + 8);
|
|
uint32x4_t m3 = vld1q_u32(dataPtr + 12);
|
|
m0 = vorr_u32(vsri_n_u32(m0, 8), topBytes); // TODO: look into VSRI
|
|
m1 = vorr_u32(vshr_n_u32(m1, 8), topBytes);
|
|
m2 = vorr_u32(vshr_n_u32(m2, 8), topBytes);
|
|
m3 = vorr_u32(vshr_n_u32(m3, 8), topBytes);
|
|
vst1q_u32(dlPtr, m0);
|
|
vst1q_u32(dlPtr + 4, m1);
|
|
vst1q_u32(dlPtr + 8, m2);
|
|
vst1q_u32(dlPtr + 12, m3);
|
|
#endif
|
|
for (int i = 0; i < count; i++) {
|
|
dest[i] = matrix | (src[i] >> 8);
|
|
}
|
|
#endif
|
|
} else {
|
|
#if defined(_M_IX86) || defined(_M_X64)
|
|
__m128i topBytes = _mm_set1_epi32(matrix);
|
|
__m128i m0 = _mm_loadu_si128((const __m128i *)src);
|
|
__m128i m1 = _mm_loadu_si128((const __m128i *)(src + 4));
|
|
__m128i m2 = _mm_loadu_si128((const __m128i *)(src + 8));
|
|
__m128i m3 = _mm_loadu_si128((const __m128i *)(src + 12));
|
|
m0 = _mm_or_si128(_mm_srli_epi32(m0, 8), topBytes);
|
|
m1 = _mm_or_si128(_mm_srli_epi32(m1, 8), topBytes);
|
|
m2 = _mm_or_si128(_mm_srli_epi32(m2, 8), topBytes);
|
|
m3 = _mm_or_si128(_mm_srli_epi32(m3, 8), topBytes);
|
|
// These three stores overlap by a word, due to the offsets.
|
|
_mm_storeu_si128((__m128i *)dest, m0);
|
|
_mm_storeu_si128((__m128i *)(dest + 3), m1);
|
|
_mm_storeu_si128((__m128i *)(dest + 6), m2);
|
|
// Store the last one in parts to not overwrite forwards (probably mostly risk free though)
|
|
_mm_storel_epi64((__m128i *)(dest + 9), m3);
|
|
m3 = _mm_srli_si128(m3, 8);
|
|
_mm_store_ss((float *)(dest + 11), _mm_castsi128_ps(m3));
|
|
#else
|
|
// Bit tricky to SIMD (note the offsets) but should be doable if not perfect
|
|
dest[0] = matrix | (src[0] >> 8);
|
|
dest[1] = matrix | (src[1] >> 8);
|
|
dest[2] = matrix | (src[2] >> 8);
|
|
dest[3] = matrix | (src[4] >> 8);
|
|
dest[4] = matrix | (src[5] >> 8);
|
|
dest[5] = matrix | (src[6] >> 8);
|
|
dest[6] = matrix | (src[8] >> 8);
|
|
dest[7] = matrix | (src[9] >> 8);
|
|
dest[8] = matrix | (src[10] >> 8);
|
|
dest[9] = matrix | (src[12] >> 8);
|
|
dest[10] = matrix | (src[13] >> 8);
|
|
dest[11] = matrix | (src[14] >> 8);
|
|
#endif
|
|
}
|
|
|
|
dlStruct[2] += (1 + count) * 4;
|
|
RETURN(dlStruct[2]);
|
|
return 60;
|
|
}
|
|
|
|
// Can either replace with C functions or functions emitted in Asm/ArmAsm.
|
|
static const ReplacementTableEntry entries[] = {
|
|
// TODO: I think some games can be helped quite a bit by implementing the
|
|
// double-precision soft-float routines: __adddf3, __subdf3 and so on. These
|
|
// should of course be implemented JIT style, inline.
|
|
|
|
{ "sinf", &Replace_sinf, 0, 0},
|
|
{ "cosf", &Replace_cosf, 0, 0},
|
|
|
|
{ "tanf", &Replace_tanf, 0, 0},
|
|
|
|
/* These two collide (same hash) and thus can't be replaced :/
|
|
{ "asinf", &Replace_asinf, 0, 0},
|
|
{ "acosf", &Replace_acosf, 0, 0},
|
|
*/
|
|
|
|
{ "atanf", &Replace_atanf, 0, 0},
|
|
{ "sqrtf", &Replace_sqrtf, 0, 0},
|
|
{ "atan2f", &Replace_atan2f, 0, 0},
|
|
{ "floorf", &Replace_floorf, 0, 0},
|
|
{ "ceilf", &Replace_ceilf, 0, 0},
|
|
|
|
{ "memcpy", &Replace_memcpy, 0, 0},
|
|
{ "memcpy16", &Replace_memcpy16, 0, 0},
|
|
{ "memmove", &Replace_memmove, 0, 0},
|
|
{ "memset", &Replace_memset, 0, 0},
|
|
{ "strlen", &Replace_strlen, 0, 0},
|
|
{ "strcpy", &Replace_strcpy, 0, 0},
|
|
{ "strncpy", &Replace_strncpy, 0, 0},
|
|
{ "strcmp", &Replace_strcmp, 0, 0},
|
|
{ "strncmp", &Replace_strncmp, 0, 0},
|
|
|
|
{ "fabsf", 0, &MIPSComp::Jit::Replace_fabsf, REPFLAG_ALLOWINLINE},
|
|
{ "dl_write_matrix", &Replace_dl_write_matrix, 0, 0}, // &MIPSComp::Jit::Replace_dl_write_matrix, 0},
|
|
{ "dl_write_matrix_2", &Replace_dl_write_matrix, 0, 0},
|
|
{ "gta_dl_write_matrix", &Replace_gta_dl_write_matrix, 0, 0},
|
|
// dl_write_matrix_3 doesn't take the dl as a parameter, it accesses a global instead. Need to extract the address of the global from the code when replacing...
|
|
// Haven't investigated write_matrix_4 and 5 but I think they are similar to 1 and 2.
|
|
|
|
// { "vmmul_q_transp", &Replace_vmmul_q_transp, 0, 0},
|
|
{}
|
|
};
|
|
|
|
// Maps a guest address to the instruction word that was there before it was
// overwritten with a replacement call, so the original can be looked up later.
static std::map<u32, u32> replacedInstructions;
|
|
|
|
// Start-up hook for the replacement system. Nothing needs setting up at present.
void Replacement_Init() {
	// Intentionally empty.
}
|
|
|
|
// Shutdown hook for the replacement system: forgets every recorded original
// instruction. Guest memory itself is not touched here.
void Replacement_Shutdown() {
	replacedInstructions.clear();
}
|
|
|
|
// TODO: Do something on load state?
|
|
|
|
int GetNumReplacementFuncs() {
|
|
return ARRAY_SIZE(entries);
|
|
}
|
|
|
|
// Finds the replacement table row for a function identified by hash and size.
// The hash is first resolved to a name via MIPSAnalyst::LookupHash, and the name
// is then matched against the table.
// Returns the row index, or -1 if the hash is unknown or no row has that name.
int GetReplacementFuncIndex(u64 hash, int funcSize) {
	const char *funcName = MIPSAnalyst::LookupHash(hash, funcSize);
	if (funcName == nullptr)
		return -1;

	// TODO: Build a lookup and keep it around
	for (int i = 0; i < ARRAY_SIZE(entries); ++i) {
		const char *candidate = entries[i].name;
		// Rows without a name (such as the terminating one) can never match.
		if (candidate != nullptr && strcmp(funcName, candidate) == 0)
			return i;
	}
	return -1;
}
|
|
|
|
// Returns a pointer to row i of the replacement table.
// The index is not range-checked; callers must pass a valid row number.
const ReplacementTableEntry *GetReplacementFunc(int i) {
	return entries + i;
}
|
|
|
|
// Patches the instruction at `address` with a replacement call if the function
// identified by hash/size has a row in the replacement table.
// The overwritten instruction is remembered so it can be retrieved later.
// Nothing happens if there is no matching row, or if the instruction at the
// address is already a replacement call or a JIT run-block marker.
void WriteReplaceInstruction(u32 address, u64 hash, int size) {
	const int index = GetReplacementFuncIndex(hash, size);
	if (index < 0)
		return;

	const u32 existing = Memory::Read_U32(address);
	if (MIPS_IS_REPLACEMENT(existing))
		return;
	// Likely already both replaced and jitted. Ignore.
	if (MIPS_IS_RUNBLOCK(existing))
		return;

	replacedInstructions[address] = existing;
	INFO_LOG(HLE, "Replaced %s at %08x with hash %016llx", entries[index].name, address, hash);
	// The table index is embedded in the low bits of the replacement opcode.
	Memory::Write_U32(MIPS_EMUHACK_CALL_REPLACEMENT | index, address);
}
|
|
|
|
// Retrieves the original instruction that was overwritten by a replacement call.
// Returns true and stores it in *op when the instruction at `address` is a
// replacement call with a recorded original; otherwise returns false and
// leaves *op untouched.
bool GetReplacedOpAt(u32 address, u32 *op) {
	const u32 current = Memory::Read_Opcode_JIT(address).encoding;
	if (!MIPS_IS_REPLACEMENT(current))
		return false;

	const auto found = replacedInstructions.find(address);
	if (found == replacedInstructions.end())
		return false;

	*op = found->second;
	return true;
}
|