Mirror of https://github.com/hrydgard/ppsspp.git

ARM emitter: Implement VMLA and VMUL by scalar, VLD1/VST1 multiple

parent 97cfbd1a5f
commit b64f44c3fc
@@ -18,6 +18,8 @@
#include "ArmEmitter.h"
#include "CPUDetect.h"

#include "base/logging.h"

#include <assert.h>
#include <stdarg.h>
#include <stddef.h>
@@ -1043,21 +1045,6 @@ void ARMXEmitter::LDMBitmask(ARMReg dest, bool Add, bool Before, bool WriteBack,

#undef VA_TO_REGLIST

ARMReg SubBase(ARMReg Reg)
{
    if (Reg >= S0)
    {
        if (Reg >= D0)
        {
            if (Reg >= Q0)
                return (ARMReg)((Reg - Q0) * 2); // Always gets encoded as a double register
            return (ARMReg)(Reg - D0);
        }
        return (ARMReg)(Reg - S0);
    }
    return Reg;
}

// NEON Specific
void ARMXEmitter::VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
@@ -1181,15 +1168,38 @@ u32 EncodeVm(ARMReg Vm)
    ARMReg Reg = SubBase(Vm);

    if (quad_reg)
        return ((Reg & 0x10) << 2) | (Reg & 0xF);
        return ((Reg & 0x10) << 1) | (Reg & 0xF);
    else {
        if (double_reg)
            return ((Reg & 0x10) << 2) | (Reg & 0xF);
            return ((Reg & 0x10) << 1) | (Reg & 0xF);
        else
            return ((Reg & 0x1) << 5) | (Reg >> 1);
    }
}

ARMReg SubBase(ARMReg Reg)
{
    if (Reg >= S0)
    {
        if (Reg >= D0)
        {
            if (Reg >= Q0)
                return (ARMReg)((Reg - Q0) * 2); // Always gets encoded as a double register
            return (ARMReg)(Reg - D0);
        }
        return (ARMReg)(Reg - S0);
    }
    return Reg;
}

ARMReg DScalar(ARMReg dreg, int subScalar) {
    int dr = (int)(SubBase(dreg)) & 0xF;
    int scalar = ((subScalar << 4) | dr);
    ARMReg ret = (ARMReg)(D0 + scalar);
    // ILOG("Scalar: %i D0: %i AR: %i", scalar, (int)D0, (int)ret);
    return ret;
}
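// Illustrative note: with the packing above, DScalar(D3, 1) yields D0 + ((1 << 4) | 3),
// i.e. "D3[1]", and DScalar(D15, 1) is the highest scalar reachable this way, matching
// the D0-D15 limit mentioned in ArmEmitter.h.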

void ARMXEmitter::WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
    bool quad_reg = Vd >= Q0;
@@ -1928,6 +1938,41 @@ void ARMXEmitter::VMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
    Write32((0xF2 << 24) | (1 << 23) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | \
        (0xC0 << 4) | ((Size & I_POLYNOMIAL) ? 1 << 9 : 0) | EncodeVm(Vm));
}
void ARMXEmitter::VMLA_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
    _dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
    _dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");

    bool register_quad = Vd >= Q0;

    // No idea if the Non-Q case here works. Not really that interested.
    if (Size & F_32)
        Write32((0xF2 << 24) | (register_quad << 24) | (1 << 23) | (2 << 20) | EncodeVn(Vn) | EncodeVd(Vd) | (0x14 << 4) | EncodeVm(Vm));
    else
        _dbg_assert_msg_(JIT, false, "VMLA_scalar only supports float atm");
    //else
    //    Write32((0xF2 << 24) | (1 << 23) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | (0x90 << 4) | (1 << 6) | EncodeVm(Vm));
    // Unsigned support missing
}
void ARMXEmitter::VMUL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
    _dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
    _dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");

    bool register_quad = Vd >= Q0;

    int VmEnc = EncodeVm(Vm);
    // No idea if the Non-Q case here works. Not really that interested.
    if (Size & F_32) // Q flag
        Write32((0xF2 << 24) | (register_quad << 24) | (1 << 23) | (2 << 20) | EncodeVn(Vn) | EncodeVd(Vd) | (0x94 << 4) | VmEnc);
    else
        _dbg_assert_msg_(JIT, false, "VMUL_scalar only supports float atm");

    // Write32((0xF2 << 24) | ((Size & I_POLYNOMIAL) ? (1 << 24) : 0) | (1 << 23) | (encodedSize(Size) << 20) |
    //    EncodeVn(Vn) | EncodeVd(Vd) | (0x84 << 4) | (register_quad << 6) | EncodeVm(Vm));
    // Unsigned support missing
}

void ARMXEmitter::VNEG(u32 Size, ARMReg Vd, ARMReg Vm)
{
    _dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
@@ -2303,9 +2348,32 @@ void ARMXEmitter::VZIP(u32 Size, ARMReg Vd, ARMReg Vm)
    Write32((0xF3 << 24) | (0xB << 20) | (encodedSize(Size) << 18) | (1 << 17) | EncodeVd(Vd) | \
        (0x18 << 4) | (register_quad << 6) | EncodeVm(Vm));
}
void ARMXEmitter::VLD1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)

static int RegCountToType(int nRegs, NEONAlignment align) {
    switch (nRegs) {
    case 1:
        _dbg_assert_msg_(JIT, !((int)align & 1), "align & 1 must be == 0");
        return 7;
    case 2:
        _dbg_assert_msg_(JIT, !((int)align & 3), "align & 3 must be == 0");
        return 10;
    case 3:
        _dbg_assert_msg_(JIT, !((int)align & 1), "align & 1 must be == 0");
        return 6;
    case 4:
        return 4;
    default:
        _dbg_assert_msg_(JIT, false, "Invalid number of registers passed to vector load/store");
        return 0;
    }
}

void ARMXEmitter::VLD1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, ARMReg Rm, NEONAlignment align)
{
    u32 spacing = 0x7; // Only support loading to 1 reg
    u32 spacing = RegCountToType(regCount, align); // Encodes the number of registers to transfer (and checks alignment)
    // Gets encoded as a double register
    Vd = SubBase(Vd);

@@ -2313,6 +2381,30 @@ void ARMXEmitter::VLD1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMR
        | ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
        | (align << 4) | Rm);
}

void ARMXEmitter::VST1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, ARMReg Rm, NEONAlignment align)
{
    u32 spacing = RegCountToType(regCount, align); // Encodes the number of registers to transfer (and checks alignment)
    // Gets encoded as a double register
    Vd = SubBase(Vd);

    Write32((0xF4 << 24) | ((Vd & 0x10) << 18) | (Rn << 16)
        | ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
        | (align << 4) | Rm);
}

void ARMXEmitter::VLD1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm) {
    _dbg_assert_msg_(JIT, false, "VLD1_lane not done yet");
    // TODO
}

void ARMXEmitter::VST1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm) {
    _dbg_assert_msg_(JIT, false, "VST1_lane not done yet");
    // TODO
}

void ARMXEmitter::VLD2(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
{
    u32 spacing = 0x8; // Single spaced registers
@@ -2323,16 +2415,6 @@ void ARMXEmitter::VLD2(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMR
        | ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
        | (align << 4) | Rm);
}
void ARMXEmitter::VST1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
{
    u32 spacing = 0x7; // Single spaced registers
    // Gets encoded as a double register
    Vd = SubBase(Vd);

    Write32((0xF4 << 24) | ((Vd & 0x10) << 18) | (Rn << 16)
        | ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
        | (align << 4) | Rm);
}

void ARMXEmitter::VREVX(u32 size, u32 Size, ARMReg Vd, ARMReg Vm)
{

@@ -359,9 +359,14 @@ const u32 I_POLYNOMIAL = (1 << 7); // Only used in VMUL/VMULL
u32 EncodeVd(ARMReg Vd);
u32 EncodeVn(ARMReg Vn);
u32 EncodeVm(ARMReg Vm);

// Subtracts the base from the register to give us the real one
ARMReg SubBase(ARMReg Reg);
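// For example: SubBase(S3) == 3, SubBase(D7) == 7, and SubBase(Q2) == 4, since a quad
// is always referred to by the index of its lower double register.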

// See A.7.1 in the ARMv7-A
// VMUL F32 scalars can only be up to D15[0], D15[1] - higher scalars cannot be individually addressed
ARMReg DScalar(ARMReg dreg, int subScalar);

enum NEONAlignment {
    ALIGN_NONE = 0,
    ALIGN_64 = 1,
@@ -644,12 +649,38 @@ public:
    void VHSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

    // Three registers
    void VMLA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLS(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMUL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQRDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

    // Two registers and a scalar
    // These two are super useful for matrix multiplication
    void VMUL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMLA_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
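    // Illustrative sketch (register choices are assumptions, not part of this commit):
    // a 4x4 matrix times vector, with the matrix columns held in Q4-Q7 and the vector
    // in Q0 (D0/D1), can be expressed as one VMUL_scalar plus three VMLA_scalar:
    //   VMUL_scalar(F_32, Q1, Q4, DScalar(D0, 0));  // result  = col0 * x
    //   VMLA_scalar(F_32, Q1, Q5, DScalar(D0, 1));  // result += col1 * y
    //   VMLA_scalar(F_32, Q1, Q6, DScalar(D1, 0));  // result += col2 * z
    //   VMLA_scalar(F_32, Q1, Q7, DScalar(D1, 1));  // result += col3 * w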

    // TODO:
    /*
    void VMLS_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLAL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMLSL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VMULL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMLAL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMLSL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMULH_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMULL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQRDMULH_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    */

    void VNEG(u32 Size, ARMReg Vd, ARMReg Vm);
    void VORN(ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VORR(ARMReg Vd, ARMReg Vn, ARMReg Vm);
@@ -660,12 +691,7 @@ public:
    void VPMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQABS(u32 Size, ARMReg Vd, ARMReg Vm);
    void VQADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQDMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQNEG(u32 Size, ARMReg Vd, ARMReg Vm);
    void VQRDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQRSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
    void VQSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
@@ -692,10 +718,28 @@ public:
    void VREV32(u32 Size, ARMReg Vd, ARMReg Vm);
    void VREV16(u32 Size, ARMReg Vd, ARMReg Vm);

    void VLD1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
    // Notes:
    // Rm == _PC is interpreted as no offset, otherwise, effective address is sum of Rn and Rm
    // Rm == R13 is interpreted as VLD1, .... [Rn]!

    // Load/store multiple registers full of elements (a register is a D register)
    void VLD1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, ARMReg Rm = _PC, NEONAlignment align = ALIGN_NONE);
void VST1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, ARMReg Rm = _PC, NEONAlignment align = ALIGN_NONE);
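    // Hedged usage sketch based on the notes above (register choices are illustrative only):
    //   VLD1(F_32, D0, R0, 2);        // load D0,D1 (= Q0) with 4 floats from [R0]
    //   VLD1(F_32, D0, R0, 4);        // load D0..D3 in one go
    //   VST1(F_32, D0, R1, 2, R13);   // store D0,D1 to [R1]! (write-back form per the note)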

    // Load/store single lanes of D registers
    // TODO
    void VLD1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = _PC);
    void VST1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = _PC);

    // TODO: Make quad-oriented wrappers for the above.

    // Deinterleave two loads... or something. TODO
    void VLD2(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);

    void VST1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);

    void VMRS_APSR();
    void VMRS(ARMReg Rt);

@@ -223,7 +223,7 @@ public:
        } else {
            // Read standard icon
            size_t sz;
            INFO_LOG(LOADER, "Loading unknown.png because a PBP was missing an icon");
            DEBUG_LOG(LOADER, "Loading unknown.png because a PBP was missing an icon");
            uint8_t *contents = VFSReadFile("unknown.png", &sz);
            if (contents) {
                lock_guard lock(info_->lock);
@@ -253,7 +253,7 @@ public:
        // Read standard icon
        size_t sz;
        uint8_t *contents = VFSReadFile("unknown.png", &sz);
        INFO_LOG(LOADER, "Loading unknown.png because there was an ELF");
        DEBUG_LOG(LOADER, "Loading unknown.png because there was an ELF");
        if (contents) {
            lock_guard lock(info_->lock);
            info_->iconTextureData = std::string((const char *)contents, sz);

@@ -22,11 +22,37 @@ TestCode::TestCode()

static float abc[256] = {1.0f, 2.0f, 0.0f};

static float a[4] = {1.0f, 2.0f, 3.0f, 4.5f};
static float b[4] = {1.0f, 1.0f, 1.0f, 0.5f};
static float c[4] = {0.0f, 0.0f, 0.0f, 0.0f};

void TestCode::Generate()
{
    testCodePtr = this->GetCodePtr();
    // Sonic1 commented that R11 is the frame pointer in debug mode, whatever "debug mode" means.
    PUSH(2, R11, _LR);

    // Load the three pointers
    MOVP2R(R0, a);
    MOVP2R(R1, b);
    MOVP2R(R2, c);

    // Load from two, do the operation, write to the third.
    VLD1(F_32, D0, R0, 2); // Load 2 doubles
    VLD1(F_32, D2, R1, 2); // Load another 2 doubles
    // VADD(F_32, Q2, Q0, Q1); // Add them, seeing them as floating point quads
    VMUL_scalar(F_32, Q2, Q0, DScalar(D3, 1)); // Multiply a quad by a scalar (ultra efficient for matrix mul! limitation: Scalar has to come out of D0-D15)
    u32 word = *(u32 *)(GetCodePtr() - 4);
    ILOG("Instruction Word: %08x", word);
    // VMUL(F_32, Q2, Q0, Q1);
    VST1(F_32, D4, R2, 2);

    // This works!

// c will later be logged.
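    // Expected result, worked out from the arrays above: the loads put a into D0/D1 and
    // b into D2/D3, DScalar(D3, 1) picks b[3] = 0.5, so c should come out as {0.5, 1.0, 1.5, 2.25}.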

    /*
    MOVI2R(R11, (u32)&abc[0]);
    MOVI2R(R1, 0x3f800000);
    STR(R11, R1, 4 * (32 + 31));
@@ -35,9 +61,13 @@ void TestCode::Generate()
    VADD(S12, S0, S1);
    VSTR(S0, R11, 4 * (32 + 31));
    VSTR(S12, R11, 4 * (32 + 31));
    */
    //VSTR(S2, R0, 8);
    POP(2, R11, _PC); // Yup, this is how you return.

    FlushLitPool();
    FlushIcache();

    //VLDR(S1, R0, 4);
    //VADD(S2, S0, S1);
    //VSTR(S2, R0, 8);
@@ -61,13 +91,24 @@ void ArmEmitterTest()
{
    // Disabled for now.
    return;

    for (int i = 0; i < 6; i++) {
        ILOG("--------------------------");
    }
    ILOG("--------------------------");
    ILOG("Running ARM emitter test!");
    ILOG("--------------------------");

    TestCode gen;
    gen.ReserveCodeSpace(0x1000);
    const u8 *codeStart = gen.GetCodePtr();
    gen.Generate();
    DisassembleArm(codeStart, gen.GetCodePtr()-codeStart);

    u32 retval = CallPtr(gen.testCodePtr);
    ILOG("ARM emitter test 1 passed if %f == 3.0! retval = %08x", abc[32 + 31], retval);
    // ILOG("ARM emitter test 1 passed if %f == 3.0! retval = %08x", abc[32 + 31], retval);
    ILOG("c: %f %f %f %f", c[0], c[1], c[2], c[3]);
    for (int i = 0; i < 6; i++) {
        ILOG("--------------------------");
    }
    // DisassembleArm(codeStart, gen.GetCodePtr()-codeStart);
}