// Copyright (c) 2013- PPSSPP Project. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 2.0 or later versions. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License 2.0 for more details. // A copy of the GPL 2.0 should have been included with the program. // If not, see http://www.gnu.org/licenses/ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #include #include "base/basictypes.h" #include "base/logging.h" #include "Core/MIPS/JitCommon/JitCommon.h" #include "Core/MIPS/MIPSAnalyst.h" #include "Core/HLE/ReplaceTables.h" #include "Core/HLE/FunctionWrappers.h" #include "GPU/Math3D.h" #if defined(_M_IX86) || defined(_M_X64) #include #endif // I think these have to be pretty accurate as these are libc replacements, // but we can probably get away with approximating the VFPU vsin/vcos and vrot // pretty roughly. static int Replace_sinf() { float f = PARAMF(0); RETURNF(sinf(f)); return 80; // guess number of cycles } static int Replace_cosf() { float f = PARAMF(0); RETURNF(cosf(f)); return 80; // guess number of cycles } static int Replace_tanf() { float f = PARAMF(0); RETURNF(tanf(f)); return 80; // guess number of cycles } static int Replace_acosf() { float f = PARAMF(0); RETURNF(acosf(f)); return 80; // guess number of cycles } static int Replace_asinf() { float f = PARAMF(0); RETURNF(asinf(f)); return 80; // guess number of cycles } static int Replace_atanf() { float f = PARAMF(0); RETURNF(atanf(f)); return 80; // guess number of cycles } static int Replace_sqrtf() { float f = PARAMF(0); RETURNF(sqrtf(f)); return 80; // guess number of cycles } static int Replace_atan2f() { float f1 = PARAMF(0); float f2 = PARAMF(1); RETURNF(atan2f(f1, f2)); return 120; // guess number of cycles } static int Replace_floorf() { float f1 = PARAMF(0); RETURNF(floorf(f1)); return 30; // guess number of cycles } static int Replace_ceilf() { float f1 = PARAMF(0); RETURNF(ceilf(f1)); return 30; // guess number of cycles } // Should probably do JIT versions of this, possibly ones that only delegate // large copies to a C function. static int Replace_memcpy() { u32 destPtr = PARAM(0); u32 srcPtr = PARAM(1); u32 bytes = PARAM(2); if (bytes != 0) { u8 *dst = Memory::GetPointerUnchecked(destPtr); u8 *src = Memory::GetPointerUnchecked(srcPtr); memmove(dst, src, bytes); } RETURN(destPtr); return 10 + bytes / 4; // approximation } static int Replace_memcpy16() { u32 destPtr = PARAM(0); u32 srcPtr = PARAM(1); u32 bytes = PARAM(2) * 16; if (bytes != 0) { u8 *dst = Memory::GetPointerUnchecked(destPtr); u8 *src = Memory::GetPointerUnchecked(srcPtr); memmove(dst, src, bytes); } RETURN(destPtr); return 10 + bytes / 4; // approximation } static int Replace_memmove() { u32 destPtr = PARAM(0); u32 srcPtr = PARAM(1); u32 bytes = PARAM(2); if (bytes != 0) { u8 *dst = Memory::GetPointerUnchecked(destPtr); u8 *src = Memory::GetPointerUnchecked(srcPtr); memmove(dst, src, bytes); } RETURN(destPtr); return 10 + bytes / 4; // approximation } static int Replace_memset() { u32 destPtr = PARAM(0); u8 *dst = Memory::GetPointerUnchecked(destPtr); u8 value = PARAM(1); u32 bytes = PARAM(2); memset(dst, value, bytes); RETURN(destPtr); return 10 + bytes / 4; // approximation } static int Replace_strlen() { u32 srcPtr = PARAM(0); const char *src = (const char *)Memory::GetPointerUnchecked(srcPtr); u32 len = (u32)strlen(src); RETURN(len); return 4 + len; // approximation } static int Replace_strcpy() { u32 destPtr = PARAM(0); char *dst = (char *)Memory::GetPointerUnchecked(destPtr); const char *src = (const char *)Memory::GetPointerUnchecked(PARAM(1)); strcpy(dst, src); RETURN(destPtr); return 10; // approximation } static int Replace_strncpy() { u32 destPtr = PARAM(0); char *dst = (char *)Memory::GetPointerUnchecked(destPtr); const char *src = (const char *)Memory::GetPointerUnchecked(PARAM(1)); u32 bytes = PARAM(2); strncpy(dst, src, bytes); RETURN(destPtr); return 10; // approximation } static int Replace_strcmp() { const char *a = (const char *)Memory::GetPointerUnchecked(PARAM(0)); const char *b = (const char *)Memory::GetPointerUnchecked(PARAM(1)); RETURN(strcmp(a, b)); return 10; // approximation } static int Replace_strncmp() { const char *a = (const char *)Memory::GetPointerUnchecked(PARAM(0)); const char *b = (const char *)Memory::GetPointerUnchecked(PARAM(1)); u32 bytes = PARAM(2); RETURN(strncmp(a, b, bytes)); return 10 + bytes / 4; // approximation } static int Replace_vmmul_q_transp() { float *out = (float *)Memory::GetPointerUnchecked(PARAM(0)); const float *a = (const float *)Memory::GetPointerUnchecked(PARAM(1)); const float *b = (const float *)Memory::GetPointerUnchecked(PARAM(2)); // TODO: Actually use an optimized matrix multiply here... Matrix4ByMatrix4(out, b, a); return 16; } // a0 = pointer to destination address // a1 = matrix // a2 = source address static int Replace_gta_dl_write_matrix() { u32 *ptr = (u32 *)Memory::GetPointerUnchecked(PARAM(0)); u32 *dest = (u32_le *)Memory::GetPointerUnchecked(ptr[0]); u32 *src = (u32_le *)Memory::GetPointerUnchecked(PARAM(2)); u32 matrix = PARAM(1) << 24; #if defined(_M_IX86) || defined(_M_X64) __m128i topBytes = _mm_set1_epi32(matrix); __m128i m0 = _mm_loadu_si128((const __m128i *)src); __m128i m1 = _mm_loadu_si128((const __m128i *)(src + 4)); __m128i m2 = _mm_loadu_si128((const __m128i *)(src + 8)); __m128i m3 = _mm_loadu_si128((const __m128i *)(src + 12)); m0 = _mm_or_si128(_mm_srli_epi32(m0, 8), topBytes); m1 = _mm_or_si128(_mm_srli_epi32(m1, 8), topBytes); m2 = _mm_or_si128(_mm_srli_epi32(m2, 8), topBytes); m3 = _mm_or_si128(_mm_srli_epi32(m3, 8), topBytes); // These three stores overlap by a word, due to the offsets. _mm_storeu_si128((__m128i *)dest, m0); _mm_storeu_si128((__m128i *)(dest + 3), m1); _mm_storeu_si128((__m128i *)(dest + 6), m2); // Store the last one in parts to not overwrite forwards (probably mostly risk free though) _mm_storel_epi64((__m128i *)(dest + 9), m3); m3 = _mm_srli_si128(m3, 8); _mm_store_ss((float *)(dest + 11), _mm_castsi128_ps(m3)); #else // Bit tricky to SIMD (note the offsets) but should be doable if not perfect dest[0] = matrix | (src[0] >> 8); dest[1] = matrix | (src[1] >> 8); dest[2] = matrix | (src[2] >> 8); dest[3] = matrix | (src[4] >> 8); dest[4] = matrix | (src[5] >> 8); dest[5] = matrix | (src[6] >> 8); dest[6] = matrix | (src[8] >> 8); dest[7] = matrix | (src[9] >> 8); dest[8] = matrix | (src[10] >> 8); dest[9] = matrix | (src[12] >> 8); dest[10] = matrix | (src[13] >> 8); dest[11] = matrix | (src[14] >> 8); #endif (*ptr) += 0x30; RETURN(0); return 38; } // TODO: Inline into a few NEON or SSE instructions - especially if a1 is a known immediate! // Anyway, not sure if worth it. There's not that many matrices written per frame normally. static int Replace_dl_write_matrix() { u32 *dlStruct = (u32 *)Memory::GetPointerUnchecked(PARAM(0)); u32 *dest = (u32 *)Memory::GetPointerUnchecked(dlStruct[2]); u32 *src = (u32 *)Memory::GetPointerUnchecked(PARAM(2)); u32 matrix; int count = 12; switch (PARAM(1)) { case 3: matrix = 0x40000000; // tex mtx break; case 2: matrix = 0x3A000000; break; case 1: matrix = 0x3C000000; break; case 0: matrix = 0x3E000000; count = 16; break; } *dest++ = matrix; matrix += 0x01000000; if (count == 16) { // Ultra SIMD friendly! These intrinsics generate pretty much perfect code, // no point in hand rolling. #if defined(_M_IX86) || defined(_M_X64) __m128i topBytes = _mm_set1_epi32(matrix); __m128i m0 = _mm_loadu_si128((const __m128i *)src); __m128i m1 = _mm_loadu_si128((const __m128i *)(src + 4)); __m128i m2 = _mm_loadu_si128((const __m128i *)(src + 8)); __m128i m3 = _mm_loadu_si128((const __m128i *)(src + 12)); m0 = _mm_or_si128(_mm_srli_epi32(m0, 8), topBytes); m1 = _mm_or_si128(_mm_srli_epi32(m1, 8), topBytes); m2 = _mm_or_si128(_mm_srli_epi32(m2, 8), topBytes); m3 = _mm_or_si128(_mm_srli_epi32(m3, 8), topBytes); _mm_storeu_si128((__m128i *)dest, m0); _mm_storeu_si128((__m128i *)(dest + 4), m1); _mm_storeu_si128((__m128i *)(dest + 8), m2); _mm_storeu_si128((__m128i *)(dest + 12), m3); #else #if 0 //TODO: Finish NEON, make conditional somehow uint32x4_t topBytes = vdupq_n_u32(matrix); uint32x4_t m0 = vld1q_u32(dataPtr); uint32x4_t m1 = vld1q_u32(dataPtr + 4); uint32x4_t m2 = vld1q_u32(dataPtr + 8); uint32x4_t m3 = vld1q_u32(dataPtr + 12); m0 = vorr_u32(vsri_n_u32(m0, 8), topBytes); // TODO: look into VSRI m1 = vorr_u32(vshr_n_u32(m1, 8), topBytes); m2 = vorr_u32(vshr_n_u32(m2, 8), topBytes); m3 = vorr_u32(vshr_n_u32(m3, 8), topBytes); vst1q_u32(dlPtr, m0); vst1q_u32(dlPtr + 4, m1); vst1q_u32(dlPtr + 8, m2); vst1q_u32(dlPtr + 12, m3); #endif for (int i = 0; i < count; i++) { dest[i] = matrix | (src[i] >> 8); } #endif } else { #if defined(_M_IX86) || defined(_M_X64) __m128i topBytes = _mm_set1_epi32(matrix); __m128i m0 = _mm_loadu_si128((const __m128i *)src); __m128i m1 = _mm_loadu_si128((const __m128i *)(src + 4)); __m128i m2 = _mm_loadu_si128((const __m128i *)(src + 8)); __m128i m3 = _mm_loadu_si128((const __m128i *)(src + 12)); m0 = _mm_or_si128(_mm_srli_epi32(m0, 8), topBytes); m1 = _mm_or_si128(_mm_srli_epi32(m1, 8), topBytes); m2 = _mm_or_si128(_mm_srli_epi32(m2, 8), topBytes); m3 = _mm_or_si128(_mm_srli_epi32(m3, 8), topBytes); // These three stores overlap by a word, due to the offsets. _mm_storeu_si128((__m128i *)dest, m0); _mm_storeu_si128((__m128i *)(dest + 3), m1); _mm_storeu_si128((__m128i *)(dest + 6), m2); // Store the last one in parts to not overwrite forwards (probably mostly risk free though) _mm_storel_epi64((__m128i *)(dest + 9), m3); m3 = _mm_srli_si128(m3, 8); _mm_store_ss((float *)(dest + 11), _mm_castsi128_ps(m3)); #else // Bit tricky to SIMD (note the offsets) but should be doable if not perfect dest[0] = matrix | (src[0] >> 8); dest[1] = matrix | (src[1] >> 8); dest[2] = matrix | (src[2] >> 8); dest[3] = matrix | (src[4] >> 8); dest[4] = matrix | (src[5] >> 8); dest[5] = matrix | (src[6] >> 8); dest[6] = matrix | (src[8] >> 8); dest[7] = matrix | (src[9] >> 8); dest[8] = matrix | (src[10] >> 8); dest[9] = matrix | (src[12] >> 8); dest[10] = matrix | (src[13] >> 8); dest[11] = matrix | (src[14] >> 8); #endif } dlStruct[2] += (1 + count) * 4; RETURN(dlStruct[2]); return 60; } // Can either replace with C functions or functions emitted in Asm/ArmAsm. static const ReplacementTableEntry entries[] = { // TODO: I think some games can be helped quite a bit by implementing the // double-precision soft-float routines: __adddf3, __subdf3 and so on. These // should of course be implemented JIT style, inline. { "sinf", &Replace_sinf, 0, 0}, { "cosf", &Replace_cosf, 0, 0}, { "tanf", &Replace_tanf, 0, 0}, /* These two collide (same hash) and thus can't be replaced :/ { "asinf", &Replace_asinf, 0, 0}, { "acosf", &Replace_acosf, 0, 0}, */ { "atanf", &Replace_atanf, 0, 0}, { "sqrtf", &Replace_sqrtf, 0, 0}, { "atan2f", &Replace_atan2f, 0, 0}, { "floorf", &Replace_floorf, 0, 0}, { "ceilf", &Replace_ceilf, 0, 0}, { "memcpy", &Replace_memcpy, 0, 0}, { "memcpy16", &Replace_memcpy16, 0, 0}, { "memmove", &Replace_memmove, 0, 0}, { "memset", &Replace_memset, 0, 0}, { "strlen", &Replace_strlen, 0, 0}, { "strcpy", &Replace_strcpy, 0, 0}, { "strncpy", &Replace_strncpy, 0, 0}, { "strcmp", &Replace_strcmp, 0, 0}, { "strncmp", &Replace_strncmp, 0, 0}, { "fabsf", 0, &MIPSComp::Jit::Replace_fabsf, REPFLAG_ALLOWINLINE}, { "dl_write_matrix", &Replace_dl_write_matrix, 0, 0}, // &MIPSComp::Jit::Replace_dl_write_matrix, 0}, { "dl_write_matrix_2", &Replace_dl_write_matrix, 0, 0}, { "gta_dl_write_matrix", &Replace_gta_dl_write_matrix, 0, 0}, // dl_write_matrix_3 doesn't take the dl as a parameter, it accesses a global instead. Need to extract the address of the global from the code when replacing... // Haven't investigated write_matrix_4 and 5 but I think they are similar to 1 and 2. // { "vmmul_q_transp", &Replace_vmmul_q_transp, 0, 0}, {} }; static std::map replacedInstructions; void Replacement_Init() { } void Replacement_Shutdown() { replacedInstructions.clear(); } // TODO: Do something on load state? int GetNumReplacementFuncs() { return ARRAY_SIZE(entries); } int GetReplacementFuncIndex(u64 hash, int funcSize) { const char *name = MIPSAnalyst::LookupHash(hash, funcSize); if (!name) { return -1; } // TODO: Build a lookup and keep it around for (int i = 0; i < ARRAY_SIZE(entries); i++) { if (!entries[i].name) continue; if (!strcmp(name, entries[i].name)) { return i; } } return -1; } const ReplacementTableEntry *GetReplacementFunc(int i) { return &entries[i]; } void WriteReplaceInstruction(u32 address, u64 hash, int size) { int index = GetReplacementFuncIndex(hash, size); if (index >= 0) { u32 prevInstr = Memory::Read_U32(address); if (MIPS_IS_REPLACEMENT(prevInstr)) { return; } if (MIPS_IS_RUNBLOCK(prevInstr)) { // Likely already both replaced and jitted. Ignore. return; } replacedInstructions[address] = prevInstr; INFO_LOG(HLE, "Replaced %s at %08x with hash %016llx", entries[index].name, address, hash); Memory::Write_U32(MIPS_EMUHACK_CALL_REPLACEMENT | (int)index, address); } } bool GetReplacedOpAt(u32 address, u32 *op) { u32 instr = Memory::Read_Opcode_JIT(address).encoding; if (MIPS_IS_REPLACEMENT(instr)) { auto iter = replacedInstructions.find(address); if (iter != replacedInstructions.end()) { *op = iter->second; return true; } else { return false; } } return false; }