// Mirror of https://github.com/hrydgard/ppsspp.git (synced 2024-12-04 03:32:29 +00:00); 417 lines, 12 KiB, C++.
// Copyright (c) 2013- PPSSPP Project.
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation, version 2.0 or later versions.
|
|
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License 2.0 for more details.
|
|
|
|
// A copy of the GPL 2.0 should have been included with the program.
|
|
// If not, see http://www.gnu.org/licenses/
|
|
|
|
// Official git repository and contact information can be found at
|
|
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
|
|
|
|
// NEON VFPU
|
|
// This is where we will create an alternate implementation of the VFPU emulation
|
|
// that uses NEON Q registers to cache pairs/tris/quads, and so on.
|
|
// Will require major extensions to the reg cache and other things.
|
|
|
|
// ARM NEON can only do pairs and quads, not tris and scalars.
|
|
// We can do scalars, though, for many operations if all the operands
|
|
// are below Q8 (D16, S32) using regular VFP instructions but really not sure
|
|
// if it's worth it.
|
|
|
|
#include <cmath>
|
|
|
|
#include "base/logging.h"
|
|
#include "math/math_util.h"
|
|
|
|
#include "Common/CPUDetect.h"
|
|
#include "Core/MemMap.h"
|
|
#include "Core/MIPS/MIPS.h"
|
|
#include "Core/MIPS/MIPSAnalyst.h"
|
|
#include "Core/MIPS/MIPSCodeUtils.h"
|
|
#include "Core/MIPS/MIPSVFPUUtils.h"
|
|
#include "Core/Config.h"
|
|
#include "Core/Reporting.h"
|
|
|
|
#include "Core/MIPS/ARM/ArmJit.h"
|
|
#include "Core/MIPS/ARM/ArmRegCache.h"
|
|
#include "Core/MIPS/ARM/ArmCompVFPUNEONUtil.h"
|
|
|
|
// TODO: Somehow #ifdef away on ARMv5eabi, without breaking the linker.
|
|
|
|
// Instruction field shorthands. Every macro expands against a variable named
// `op`, which must be in scope wherever the macro is used.

// Register fields, delegated to the MIPS_GET_* helpers.
#define _RS MIPS_GET_RS(op)
#define _RT MIPS_GET_RT(op)
#define _RD MIPS_GET_RD(op)
#define _FS MIPS_GET_FS(op)
#define _FT MIPS_GET_FT(op)
#define _FD MIPS_GET_FD(op)
#define _SA MIPS_GET_SA(op)

// Five-bit fields taken from bits 6..10 and bits 11..15 respectively.
#define _POS  ((op >> 6) & 0x1F)
#define _SIZE ((op >> 11) & 0x1F)

// Low 16 bits converted to signed short, and the low 26 bits as-is.
#define _IMM16 (signed short)(op & 0xFFFF)
#define _IMM26 (op & 0x03FFFFFF)
|
|
|
|
namespace MIPSComp {
|
|
|
|
using namespace ArmGen;
|
|
using namespace ArmJitConstants;
|
|
|
|
// Scalar float constants with internal linkage.
static const float zero = 0.0f;
static const float one = 1.0f;
static const float minus_one = -1.0f;
|
|
|
|
// On NEON, we map triples to Q registers and singles to D registers.
|
|
// Sometimes, as when doing dot products, it matters what's in that unused reg. This zeroes it.
|
|
void ArmJit::NEONMaskToSize(ARMReg vs, VectorSize sz) {
|
|
// TODO
|
|
}
|
|
|
|
// Maps the VFPU source operand `mipsReg` to a NEON register and emits code that
// applies an S/T prefix to it.
//
// Prefix bits, as decoded below (per lane i):
//   bits 2i..2i+1 : source lane selector (swizzle); 0xE4 is the identity order
//   bit  8 + i    : abs flag (for constant lanes: high bit of the constant index)
//   bit 12 + i    : replace the lane with a constant from constantArray
//   bit 16 + i    : negate flag
//
// Returns the mapped register itself when `prefix` is exactly 0xE4 (nothing to
// apply). Otherwise returns a temp register from fpr.QAllocTemp holding the
// prefixed value. The slow fallbacks emitted here use Q0/S0..S3 and R0 as
// scratch.
//
// Only a subset of swizzles is implemented: identity, broadcast of one lane,
// and swapping lanes 0/1. Anything else is logged and falls back to an
// unpermuted copy.
ARMReg ArmJit::NEONMapPrefixST(int mipsReg, VectorSize sz, u32 prefix, int mapFlags) {
	// Indexed by (abs bit << 2) | swizzle bits of a lane flagged as constant.
	static const float constantArray[8] = { 0.f, 1.f, 2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f };
	static const float constantArrayNegated[8] = { -0.f, -1.f, -2.f, -0.5f, -3.f, -1.f / 3.f, -0.25f, -1.f / 6.f };

	// Applying prefixes in SIMD fashion will actually be a lot easier than the old style.
	if (prefix == 0xE4) {
		return fpr.QMapReg(mipsReg, sz, mapFlags);
	}

	int n = GetNumVectorElements(sz);

	int regnum[4] = { -1, -1, -1, -1 };
	int abs[4] = { 0 };
	int negate[4] = { 0 };
	int constants[4] = { 0 };
	int constNum[4] = { 0 };

	int full_mask = (1 << n) - 1;

	int abs_mask = (prefix >> 8) & full_mask;
	int negate_mask = (prefix >> 16) & full_mask;
	int constants_mask = (prefix >> 12) & full_mask;

	// Decode prefix to keep the rest readable
	int permuteMask = 0;
	for (int i = 0; i < n; i++) {
		permuteMask |= 3 << (i * 2);
		regnum[i] = (prefix >> (i * 2)) & 3;
		abs[i] = (prefix >> (8 + i)) & 1;
		negate[i] = (prefix >> (16 + i)) & 1;
		constants[i] = (prefix >> (12 + i)) & 1;

		if (constants[i]) {
			// For constant lanes the abs bit is part of the constant index, not an abs request.
			constNum[i] = regnum[i] + (abs[i] << 2);
			abs[i] = 0;
		}
	}
	abs_mask &= ~constants_mask;

	// Only the swizzle bits of the lanes actually in use are compared.
	bool anyPermute = (prefix & permuteMask) != (0xE4 & permuteMask);

	if (constants_mask == full_mask) {
		// It's all constants! Don't even bother mapping the input register,
		// just allocate a temp one.
		// If a single, this can sometimes be done cheaper. But meh.
		ARMReg ar = fpr.QAllocTemp(sz);
		for (int i = 0; i < n; i++) {
			bool pairLoaded = false;
			if ((i & 1) == 0) {
				// i is even and < n <= 4, so i + 1 <= 3 stays inside the arrays.
				// A whole-D immediate writes lane i and lane i + 1 with one value, so
				// both lanes must want the same constant AND the same sign. When
				// lane i + 1 is outside the vector its sign does not matter.
				bool sameConstant = constNum[i] == constNum[i + 1];
				bool sameSign = (i + 1 >= n) || negate[i] == negate[i + 1];
				if (sameConstant && sameSign && constNum[i] <= 1) {
					// Replace two loads with a single immediate when easily possible.
					// Only constants 0 and 1 are handled here.
					// TODO: There are a few more that are doable.
					ARMReg dest = i & 2 ? D_1(ar) : D_0(ar);
					float c = constantArray[constNum[i]];
					VMOV_immf(dest, negate[i] ? -c : c);
					pairLoaded = true;
				}
			}

			if (pairLoaded) {
				// Lane i + 1 has been filled as well.
				i++;
				continue;
			}

			MOVP2R(R0, (negate[i] ? constantArrayNegated : constantArray) + constNum[i]);
			VLD1_lane(F_32, ar, R0, i, true);
		}
		return ar;
	}

	// 1. Permute.
	// 2. Abs
	// 3. Negate
	// If any constants:
	// 4. Replace values with (already negated) constants

	ARMReg inputAR = fpr.QMapReg(mipsReg, sz, mapFlags);
	ARMReg ar = fpr.QAllocTemp(sz);

	if (!anyPermute) {
		// No permutations!
		VMOV(ar, inputAR);
	} else {
		// A broadcast requires EVERY lane to select the same source lane as lane 0.
		// Single-lane vectors keep taking the generic path below, as before.
		bool allSame = n > 1;
		for (int i = 1; i < n; i++) {
			if (regnum[0] != regnum[i])
				allSame = false;
		}

		if (allSame) {
			// Easy, someone is duplicating one value onto all the reg parts.
			// If this is happening and QMapReg must load, we can combine these two actions
			// into a VLD1_lane. TODO
			VDUP(F_32, ar, inputAR, regnum[0]);
		} else {
			// Do some special cases
			if (regnum[0] == 1 && regnum[1] == 0) {
				INFO_LOG(HLE, "PREFIXST: Bottom swap!");
				// NOTE(review): on a Q-sized register VREV64 with 32-bit elements also
				// swaps lanes 2/3, while only regnum[0..1] are normalized here - confirm
				// that swizzles like {1,0,2,3} on triples/quads are handled as intended.
				VREV64(I_32, ar, inputAR);
				regnum[0] = 0;
				regnum[1] = 1;
			}

			// TODO: Make a generic fallback using another temp register

			bool match = true;
			for (int i = 0; i < n; i++) {
				if (regnum[i] != i)
					match = false;
			}

			// TODO: Cannot do this permutation yet!
			if (!match) {
				ERROR_LOG(HLE, "PREFIXST: Unsupported permute! %i %i %i %i / %i", regnum[0], regnum[1], regnum[2], regnum[3], n);
				// Fall back to the unpermuted value so `ar` is at least initialized.
				VMOV(ar, inputAR);
			}
		}
	}

	// ABS
	// Two methods: If all lanes are "absoluted", it's easy.
	if (abs_mask == full_mask) {
		// TODO: elide the above VMOV (in !anyPermute) when possible
		VABS(F_32, ar, ar);
	} else if (abs_mask != 0) {
		// Partial ABS!
		if (abs_mask == 3) {
			// Exactly lanes 0 and 1: operate on the low D half only.
			VABS(F_32, D_0(ar), D_0(ar));
		} else {
			// Horrifying fallback: Mov to Q0, abs, move back.
			// TODO: Optimize for lower quads where we don't need to move.
			VMOV(MatchSize(Q0, ar), ar);
			for (int i = 0; i < n; i++) {
				if (abs_mask & (1 << i)) {
					VABS((ARMReg)(S0 + i), (ARMReg)(S0 + i));
				}
			}
			VMOV(ar, MatchSize(Q0, ar));
			INFO_LOG(HLE, "PREFIXST: Partial ABS %i/%i! Slow fallback generated.", abs_mask, full_mask);
		}
	}

	// NEGATE, same structure as ABS. Constant lanes may get negated here too;
	// they are overwritten with the correctly signed constant afterwards.
	if (negate_mask == full_mask) {
		// TODO: elide the above VMOV when possible
		VNEG(F_32, ar, ar);
	} else if (negate_mask != 0) {
		// Partial negate!
		if (negate_mask == 3) {
			VNEG(F_32, D_0(ar), D_0(ar));
		} else {
			// Horrifying fallback: Mov to Q0, negate, move back.
			// TODO: Optimize for lower quads where we don't need to move.
			VMOV(MatchSize(Q0, ar), ar);
			for (int i = 0; i < n; i++) {
				if (negate_mask & (1 << i)) {
					VNEG((ARMReg)(S0 + i), (ARMReg)(S0 + i));
				}
			}
			VMOV(ar, MatchSize(Q0, ar));
			INFO_LOG(HLE, "PREFIXST: Partial Negate %i/%i! Slow fallback generated.", negate_mask, full_mask);
		}
	}

	// Insert constants where requested, and check negate!
	for (int i = 0; i < n; i++) {
		if (constants[i]) {
			MOVP2R(R0, (negate[i] ? constantArrayNegated : constantArray) + constNum[i]);
			VLD1_lane(F_32, ar, R0, i, true);
		}
	}

	return ar;
}
|
|
|
|
// Maps the destination operand `vreg`, taking the D prefix write mask in
// js.prefixD into account.
//
// When every lane of the vector is writable, rd and backingRd are the same
// mapped register. Otherwise rd is a temp register and backingRd is the real
// mapping of `vreg`, mapped with MAP_NOINIT cleared so its current contents
// are available.
ArmJit::DestARMReg ArmJit::NEONMapPrefixD(int vreg, VectorSize sz, int mapFlags) {
	const int laneCount = GetNumVectorElements(sz);
	const int allLanes = (1 << laneCount) - 1;
	// Inverted relative to the prefix bits, so that a set bit means "write this lane".
	const int writeMask = (~(js.prefixD >> 8)) & 0xF;

	DestARMReg result;
	result.sz = sz;

	const bool writesEveryLane = (writeMask & allLanes) == allLanes;
	if (writesEveryLane) {
		// No write mask to apply, so the operation can target the register directly.
		result.rd = fpr.QMapReg(vreg, sz, mapFlags);
		result.backingRd = result.rd;
		return result;
	}

	// Some lanes are masked off: compute into a temp and keep the real register separate.
	ELOG("PREFIXD: Write mask allocated! %i/%i", writeMask, allLanes);
	result.rd = fpr.QAllocTemp(sz);
	// Force initialization of the backing reg.
	result.backingRd = fpr.QMapReg(vreg, sz, mapFlags & ~MAP_NOINIT);
	return result;
}
|
|
|
|
// Emits code applying the D prefix in js.prefixD to a destination obtained
// from NEONMapPrefixD.
//
// Saturation: two bits per lane at js.prefixD >> (2 * lane). Value 1 clamps to
// [0, 1] and value 3 clamps to [-1, 1]. Only whole-vector saturation is
// implemented; partial masks are logged and the clamp is applied to all lanes.
//
// Write mask: when dest.rd differs from dest.backingRd the result sits in a
// temp and has to be merged into the backing register. Only the "lanes 0 and 1"
// mask is implemented; other masks are logged and the whole temp is copied.
//
// Uses D0/D1 or Q0 as scratch for the clamp bounds.
void ArmJit::NEONApplyPrefixD(DestARMReg dest) {
	// Apply clamps to dest.rd
	int n = GetNumVectorElements(dest.sz);
	int full_mask = (1 << n) - 1;

	int sat1_mask = 0;
	int sat3_mask = 0;
	for (int i = 0; i < n; i++) {
		int sat = (js.prefixD >> (i * 2)) & 3;
		if (sat == 1)
			sat1_mask |= 1 << i;
		if (sat == 3)
			sat3_mask |= 1 << i;
	}

	if (sat1_mask && sat3_mask) {
		// Why would anyone do this?
		ELOG("PREFIXD: Can't have both sat[0-1] and sat[-1-1] at the same time yet");
	}

	if (sat1_mask) {
		if (sat1_mask != full_mask) {
			ELOG("PREFIXD: Can't have partial sat1 mask yet (%i vs %i)", sat1_mask, full_mask);
		}
		// Clamp to [0, 1].
		if (IsD(dest.rd)) {
			VMOV_immf(D0, 0.0);
			VMOV_immf(D1, 1.0);
			VMAX(F_32, dest.rd, dest.rd, D0);
			VMIN(F_32, dest.rd, dest.rd, D1);
		} else {
			VMOV_immf(Q0, 1.0);
			VMIN(F_32, dest.rd, dest.rd, Q0);
			VMOV_immf(Q0, 0.0);
			VMAX(F_32, dest.rd, dest.rd, Q0);
		}
	}

	// Skipped when the [0, 1] clamp already covered every lane.
	if (sat3_mask && sat1_mask != full_mask) {
		if (sat3_mask != full_mask) {
			ELOG("PREFIXD: Can't have partial sat3 mask yet (%i vs %i)", sat3_mask, full_mask);
		}
		// Clamp to [-1, 1]. The lower bound must be -1.0 on both register sizes.
		if (IsD(dest.rd)) {
			VMOV_immf(D0, -1.0);
			VMOV_immf(D1, 1.0);
			VMAX(F_32, dest.rd, dest.rd, D0);
			VMIN(F_32, dest.rd, dest.rd, D1);
		} else {
			VMOV_immf(Q0, 1.0);
			VMIN(F_32, dest.rd, dest.rd, Q0);
			VMOV_immf(Q0, -1.0);
			VMAX(F_32, dest.rd, dest.rd, Q0);
		}
	}

	// Check for actual mask operation (unrelated to the "masks" above).
	if (dest.backingRd != dest.rd) {
		// This means that we need to apply the write mask, from rd to backingRd.
		// What a pain. We can at least shortcut easy cases like half the register.
		// And we can generate the masks easily with some of the crazy vector imm modes. (bits2bytes for example).
		// So no need to load them from RAM.
		int writeMask = (~(js.prefixD >> 8)) & 0xF;

		// Bits for lanes beyond the vector size are irrelevant, so ignore them,
		// the same way NEONMapPrefixD does when deciding to allocate the temp.
		if ((writeMask & full_mask) == 3) {
			ILOG("Doing writemask = 3");
			// Lanes 0 and 1 are writable: move the low half of the result INTO the
			// backing register (destination first, as with every other VMOV here).
			VMOV(D_0(dest.backingRd), D_0(dest.rd));
		} else {
			// TODO
			ELOG("PREFIXD: Arbitrary write masks not supported (%i / %i)", writeMask, full_mask);
			VMOV(dest.backingRd, dest.rd);
		}
	}
}
|
|
|
|
// Maps the two source operands (VS, VT) and the destination operand (VD) of
// `op` for an instruction that reads two vectors and writes one.
//
// dsize/ssize/tsize are the vector sizes of VD, VS and VT respectively.
// With applyPrefixes set, the sources go through NEONMapPrefixS/T and the
// destination through NEONMapPrefixD; otherwise all three are mapped directly
// and vd.backingRd equals vd.rd.
//
// regs.overlap is set when VD overlaps either source. In that case the
// destination is mapped without MAP_NOINIT so its current contents are kept.
ArmJit::MappedRegs ArmJit::NEONMapDirtyInIn(MIPSOpcode op, VectorSize dsize, VectorSize ssize, VectorSize tsize, bool applyPrefixes) {
	MappedRegs regs;
	if (applyPrefixes) {
		regs.vs = NEONMapPrefixS(_VS, ssize, 0);
		regs.vt = NEONMapPrefixT(_VT, tsize, 0);
	} else {
		regs.vs = fpr.QMapReg(_VS, ssize, 0);
		// VT has its own size; it is not necessarily the same as VS.
		regs.vt = fpr.QMapReg(_VT, tsize, 0);
	}

	const bool overlapsS = GetVectorOverlap(_VD, dsize, _VS, ssize) > 0;
	const bool overlapsT = GetVectorOverlap(_VD, dsize, _VT, tsize) != 0;
	regs.overlap = overlapsS || overlapsT;

	const int destFlags = MAP_DIRTY | (regs.overlap ? 0 : MAP_NOINIT);
	if (applyPrefixes) {
		regs.vd = NEONMapPrefixD(_VD, dsize, destFlags);
	} else {
		regs.vd.rd = fpr.QMapReg(_VD, dsize, destFlags);
		regs.vd.backingRd = regs.vd.rd;
		regs.vd.sz = dsize;
	}
	return regs;
}
|
|
|
|
// Maps the two source operands (VS, VT) of `op` for an instruction that has no
// vector destination.
//
// ssize/tsize are the vector sizes of VS and VT. With applyPrefixes set the
// sources go through NEONMapPrefixS/T, otherwise they are mapped directly.
// The destination fields of the result are filled with invalid markers and
// overlap is false.
ArmJit::MappedRegs ArmJit::NEONMapInIn(MIPSOpcode op, VectorSize ssize, VectorSize tsize, bool applyPrefixes) {
	MappedRegs regs;
	if (applyPrefixes) {
		regs.vs = NEONMapPrefixS(_VS, ssize, 0);
		regs.vt = NEONMapPrefixT(_VT, tsize, 0);
	} else {
		regs.vs = fpr.QMapReg(_VS, ssize, 0);
		// VT has its own size; it is not necessarily the same as VS.
		regs.vt = fpr.QMapReg(_VT, tsize, 0);
	}

	// There is no destination: leave every destination-related field in a
	// defined state instead of uninitialized.
	regs.overlap = false;
	regs.vd.rd = INVALID_REG;
	regs.vd.backingRd = INVALID_REG;
	regs.vd.sz = V_Invalid;
	return regs;
}
|
|
|
|
// Maps the single source operand (VS) and the destination operand (VD) of `op`.
//
// dsize/ssize are the vector sizes of VD and VS. With applyPrefixes set the
// source goes through NEONMapPrefixS and the destination through
// NEONMapPrefixD; otherwise both are mapped directly and vd.backingRd equals
// vd.rd, matching NEONMapDirtyInIn.
//
// regs.overlap is set when VD overlaps VS. In that case the destination is
// mapped without MAP_NOINIT so its current contents are kept. There is no VT
// operand, so regs.vt is set to INVALID_REG.
ArmJit::MappedRegs ArmJit::NEONMapDirtyIn(MIPSOpcode op, VectorSize dsize, VectorSize ssize, bool applyPrefixes) {
	MappedRegs regs;
	regs.vt = INVALID_REG;

	if (applyPrefixes) {
		regs.vs = NEONMapPrefixS(_VS, ssize, 0);
	} else {
		regs.vs = fpr.QMapReg(_VS, ssize, 0);
	}

	regs.overlap = GetVectorOverlap(_VD, dsize, _VS, ssize) > 0;

	const int destFlags = MAP_DIRTY | (regs.overlap ? 0 : MAP_NOINIT);
	if (applyPrefixes) {
		regs.vd = NEONMapPrefixD(_VD, dsize, destFlags);
	} else {
		regs.vd.rd = fpr.QMapReg(_VD, dsize, destFlags);
		regs.vd.backingRd = regs.vd.rd;
		regs.vd.sz = dsize;
	}
	return regs;
}
|
|
|
|
// Requires quad registers.
|
|
void ArmJit::NEONTranspose4x4(ARMReg cols[4]) {
|
|
// 0123 _\ 0426
|
|
// 4567 / 1537
|
|
VTRN(F_32, cols[0], cols[1]);
|
|
|
|
// 89ab _\ 8cae
|
|
// cdef / 9dbf
|
|
VTRN(F_32, cols[2], cols[3]);
|
|
|
|
// 04[26] 048c
|
|
// 15 37 -> 1537
|
|
// [8c]ae 26ae
|
|
// 9d bf 9dbf
|
|
VSWP(D_1(cols[0]), D_0(cols[2]));
|
|
|
|
// 04 8c 048c
|
|
// 15[37] -> 159d
|
|
// 26 ae 26ae
|
|
// [9d]bf 37bf
|
|
VSWP(D_1(cols[1]), D_0(cols[3]));
|
|
}
|
|
|
|
} // namespace MIPSComp
|