ARM emitter: Implement VMLA and VMUL by scalar, VLD1/VST1 multiple

Henrik Rydgard 2013-11-23 01:51:35 +01:00
parent 97cfbd1a5f
commit b64f44c3fc
4 changed files with 207 additions and 40 deletions

View File

@ -18,6 +18,8 @@
#include "ArmEmitter.h"
#include "CPUDetect.h"
#include "base/logging.h"
#include <assert.h>
#include <stdarg.h>
#include <stddef.h>
@ -1043,21 +1045,6 @@ void ARMXEmitter::LDMBitmask(ARMReg dest, bool Add, bool Before, bool WriteBack,
#undef VA_TO_REGLIST
// NEON Specific
void ARMXEmitter::VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
@ -1181,15 +1168,38 @@ u32 EncodeVm(ARMReg Vm)
ARMReg Reg = SubBase(Vm);
if (quad_reg)
return ((Reg & 0x10) << 1) | (Reg & 0xF); // was << 2; the M bit lives at instruction bit 5
else {
if (double_reg)
return ((Reg & 0x10) << 1) | (Reg & 0xF); // was << 2
else
return ((Reg & 0x1) << 5) | (Reg >> 1);
}
}
ARMReg SubBase(ARMReg Reg)
{
if (Reg >= S0)
{
if (Reg >= D0)
{
if (Reg >= Q0)
return (ARMReg)((Reg - Q0) * 2); // Always gets encoded as a double register
return (ARMReg)(Reg - D0);
}
return (ARMReg)(Reg - S0);
}
return Reg;
}
ARMReg DScalar(ARMReg dreg, int subScalar) {
int dr = (int)(SubBase(dreg)) & 0xF;
int scalar = ((subScalar << 4) | dr);
ARMReg ret = (ARMReg)(D0 + scalar);
// ILOG("Scalar: %i D0: %i AR: %i", scalar, (int)D0, (int)ret);
return ret;
}
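// For example, the emitter test below multiplies a quad by the scalar D3[1] (the
// second float of D3, i.e. lane 3 of Q1) via:
//   VMUL_scalar(F_32, Q2, Q0, DScalar(D3, 1));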
void ARMXEmitter::WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
bool quad_reg = Vd >= Q0;
@ -1928,6 +1938,41 @@ void ARMXEmitter::VMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
Write32((0xF2 << 24) | (1 << 23) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | \
(0xC0 << 4) | ((Size & I_POLYNOMIAL) ? 1 << 9 : 0) | EncodeVm(Vm));
}
void ARMXEmitter::VMLA_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
// No idea if the Non-Q case here works. Not really that interested.
if (Size & F_32)
Write32((0xF2 << 24) | (register_quad << 24) | (1 << 23) | (2 << 20) | EncodeVn(Vn) | EncodeVd(Vd) | (0x14 << 4) | EncodeVm(Vm));
else
_dbg_assert_msg_(JIT, false, "VMLA_scalar only supports float atm");
//else
// Write32((0xF2 << 24) | (1 << 23) | (encodedSize(Size) << 20) | EncodeVn(Vn) | EncodeVd(Vd) | (0x90 << 4) | (1 << 6) | EncodeVm(Vm));
// Unsigned support missing
}
void ARMXEmitter::VMUL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
_dbg_assert_msg_(JIT, cpu_info.bNEON, "Can't use " __FUNCTION__ " when CPU doesn't support it");
bool register_quad = Vd >= Q0;
int VmEnc = EncodeVm(Vm);
// No idea if the Non-Q case here works. Not really that interested.
if (Size & F_32) // Q flag
Write32((0xF2 << 24) | (register_quad << 24) | (1 << 23) | (2 << 20) | EncodeVn(Vn) | EncodeVd(Vd) | (0x94 << 4) | VmEnc);
else
_dbg_assert_msg_(JIT, false, "VMUL_scalar only supports float atm");
// Write32((0xF2 << 24) | ((Size & I_POLYNOMIAL) ? (1 << 24) : 0) | (1 << 23) | (encodedSize(Size) << 20) |
// EncodeVn(Vn) | EncodeVd(Vd) | (0x84 << 4) | (register_quad << 6) | EncodeVm(Vm));
// Unsigned support missing
}
void ARMXEmitter::VNEG(u32 Size, ARMReg Vd, ARMReg Vm)
{
_dbg_assert_msg_(JIT, Vd >= D0, "Pass invalid register to " __FUNCTION__);
@ -2303,9 +2348,32 @@ void ARMXEmitter::VZIP(u32 Size, ARMReg Vd, ARMReg Vm)
Write32((0xF3 << 24) | (0xB << 20) | (encodedSize(Size) << 18) | (1 << 17) | EncodeVd(Vd) | \
(0x18 << 4) | (register_quad << 6) | EncodeVm(Vm));
}
static int RegCountToType(int nRegs, NEONAlignment align) {
switch (nRegs) {
case 1:
_dbg_assert_msg_(JIT, !((int)align & 1), "align & 1 must be == 0");
return 7;
case 2:
_dbg_assert_msg_(JIT, !((int)align & 3), "align & 3 must be == 0");
return 10;
case 3:
_dbg_assert_msg_(JIT, !((int)align & 1), "align & 1 must be == 0");
return 6;
case 4:
return 2; // Four registers: the 'type' field is 0b0010 (0b0100 is not a defined encoding).
default:
_dbg_assert_msg_(JIT, false, "Invalid number of registers passed to vector load/store");
return 0;
}
}
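// For example, RegCountToType(2, ALIGN_NONE) returns 0b1010, which VLD1/VST1 below
// place in bits 11:8 of the instruction as the NEON "type" field for two registers.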
void ARMXEmitter::VLD1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, ARMReg Rm, NEONAlignment align)
{
u32 spacing = RegCountToType(regCount, align); // Becomes the 'type' field (1-4 registers)
// Gets encoded as a double register
Vd = SubBase(Vd);
@ -2313,6 +2381,30 @@ void ARMXEmitter::VLD1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMR
| ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
| (align << 4) | Rm);
}
void ARMXEmitter::VST1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, ARMReg Rm, NEONAlignment align)
{
u32 spacing = RegCountToType(regCount, align); // Becomes the 'type' field (1-4 registers)
// Gets encoded as a double register
Vd = SubBase(Vd);
Write32((0xF4 << 24) | ((Vd & 0x10) << 18) | (Rn << 16)
| ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
| (align << 4) | Rm);
}
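// Worked example: the emitter test below calls VST1(F_32, D4, R2, 2). Assuming
// encodedSize(F_32) == 2, as elsewhere in this emitter, the expression above
// produces 0xF4024A8F, i.e. VST1.32 {D4,D5}, [R2].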
void ARMXEmitter::VLD1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm) {
_dbg_assert_msg_(JIT, false, "VLD1_lane not done yet");
// TODO
}
void ARMXEmitter::VST1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm) {
_dbg_assert_msg_(JIT, false, "VST1_lane not done yet");
// TODO
}
void ARMXEmitter::VLD2(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
{
u32 spacing = 0x8; // Single spaced registers
@ -2323,16 +2415,6 @@ void ARMXEmitter::VLD2(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMR
| ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
| (align << 4) | Rm);
}
void ARMXEmitter::VST1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
{
u32 spacing = 0x7; // Single spaced registers
// Gets encoded as a double register
Vd = SubBase(Vd);
Write32((0xF4 << 24) | ((Vd & 0x10) << 18) | (Rn << 16)
| ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
| (align << 4) | Rm);
}
void ARMXEmitter::VREVX(u32 size, u32 Size, ARMReg Vd, ARMReg Vm)
{

View File

@ -359,9 +359,14 @@ const u32 I_POLYNOMIAL = (1 << 7); // Only used in VMUL/VMULL
u32 EncodeVd(ARMReg Vd);
u32 EncodeVn(ARMReg Vn);
u32 EncodeVm(ARMReg Vm);
// Subtracts the base from the register to give us the real one
ARMReg SubBase(ARMReg Reg);
// See A.7.1 in the ARMv7-A reference manual.
// VMUL F32 scalars can only go up to D15[0]/D15[1]; higher scalars cannot be individually addressed.
ARMReg DScalar(ARMReg dreg, int subScalar);
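// For example, the four F32 lanes of Q1 are addressed as DScalar(D2, 0), DScalar(D2, 1),
// DScalar(D3, 0) and DScalar(D3, 1); scalar sources above Q7 (D15) are out of range.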
enum NEONAlignment {
ALIGN_NONE = 0,
ALIGN_64 = 1,
@ -644,12 +649,38 @@ public:
void VHSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMAX(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
// Three registers
void VMLA(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMLS(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMUL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQDMLAL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQDMLSL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQDMULL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQRDMULH(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
// Two registers and a scalar
// These two are super useful for matrix multiplication
void VMUL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMLA_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
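// A hypothetical usage sketch (not taken from this commit): a 4x4 matrix times vec4,
// assuming the matrix columns sit in Q4-Q7 and the vector in Q0 (D0/D1):
//   VMUL_scalar(F_32, Q8, Q4, DScalar(D0, 0));  // result  = col0 * v.x
//   VMLA_scalar(F_32, Q8, Q5, DScalar(D0, 1));  // result += col1 * v.y
//   VMLA_scalar(F_32, Q8, Q6, DScalar(D1, 0));  // result += col2 * v.z
//   VMLA_scalar(F_32, Q8, Q7, DScalar(D1, 1));  // result += col3 * v.w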
// TODO:
/*
void VMLS_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMLAL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMLSL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VMULL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQDMLAL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQDMLSL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQDMULH_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQDMULL_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQRDMULH_scalar(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
*/
void VNEG(u32 Size, ARMReg Vd, ARMReg Vm);
void VORN(ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VORR(ARMReg Vd, ARMReg Vn, ARMReg Vm);
@ -660,12 +691,7 @@ public:
void VPMIN(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQABS(u32 Size, ARMReg Vd, ARMReg Vm);
void VQADD(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQNEG(u32 Size, ARMReg Vd, ARMReg Vm);
void VQRSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQSHL(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
void VQSUB(u32 Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
@ -692,10 +718,28 @@ public:
void VREV32(u32 Size, ARMReg Vd, ARMReg Vm);
void VREV16(u32 Size, ARMReg Vd, ARMReg Vm);
void VLD1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
// Notes:
// Rm == _PC means no writeback; any other general-purpose Rm makes Rn post-increment by Rm after the access.
// Rm == R13 is interpreted as writeback by the transfer size: VLD1 ..., [Rn]!
// Load/store multiple registers full of elements (a register is a D register)
void VLD1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, ARMReg Rm = _PC, NEONAlignment align = ALIGN_NONE);
void VST1(u32 Size, ARMReg Vd, ARMReg Rn, int regCount, ARMReg Rm = _PC, NEONAlignment align = ALIGN_NONE);
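// Example usage under the conventions above (register choices are illustrative only):
//   VLD1(F_32, D0, R0, 2);        // load D0,D1 from [R0], R0 unchanged
//   VLD1(F_32, D0, R0, 2, R13);   // load D0,D1 from [R0], then R0 += 16 (writeback)
//   VST1(F_32, D4, R2, 4, R3);    // store D4-D7 to [R2], then R2 += R3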
// Load/store single lanes of D registers
// TODO
void VLD1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = _PC);
void VST1_lane(u32 Size, ARMReg Vd, ARMReg Rn, int lane, ARMReg Rm = _PC);
// TODO: Make quad-oriented wrappers for the above.
// De-interleaving load of element pairs into two registers. TODO: needs more work.
void VLD2(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
void VST1(u32 Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
void VMRS_APSR();
void VMRS(ARMReg Rt);

View File

@ -223,7 +223,7 @@ public:
} else {
// Read standard icon
size_t sz;
DEBUG_LOG(LOADER, "Loading unknown.png because a PBP was missing an icon");
uint8_t *contents = VFSReadFile("unknown.png", &sz);
if (contents) {
lock_guard lock(info_->lock);
@ -253,7 +253,7 @@ public:
// Read standard icon
size_t sz;
uint8_t *contents = VFSReadFile("unknown.png", &sz);
DEBUG_LOG(LOADER, "Loading unknown.png because there was an ELF");
if (contents) {
lock_guard lock(info_->lock);
info_->iconTextureData = std::string((const char *)contents, sz);

View File

@ -22,11 +22,37 @@ TestCode::TestCode()
static float abc[256] = {1.0f, 2.0f, 0.0f};
static float a[4] = {1.0f, 2.0f, 3.0f, 4.5f};
static float b[4] = {1.0f, 1.0f, 1.0f, 0.5f};
static float c[4] = {0.0f, 0.0f, 0.0f, 0.0f};
void TestCode::Generate()
{
testCodePtr = this->GetCodePtr();
// Sonic1 commented that R11 is the frame pointer in debug mode, whatever "debug mode" means.
PUSH(2, R11, _LR);
// Load the three pointers
MOVP2R(R0, a);
MOVP2R(R1, b);
MOVP2R(R2, c);
// Load from two, do the operation, write to the third.
VLD1(F_32, D0, R0, 2); // Load a into D0,D1 (a full quad's worth of floats)
VLD1(F_32, D2, R1, 2); // Load b into D2,D3
// VADD(F_32, Q2, Q0, Q1); // Add them, seeing them as floating point quads
VMUL_scalar(F_32, Q2, Q0, DScalar(D3, 1)); // Multiply a quad by a scalar (ultra efficient for matrix mul! limitation: Scalar has to come out of D0-D15)
u32 word = *(u32 *)(GetCodePtr() - 4);
ILOG("Instruction Word: %08x", word);
// VMUL(F_32, Q2, Q0, Q1);
VST1(F_32, D4, R2, 2);
// This works!
// c will later be logged.
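// With a and b as defined above, the scalar D3[1] is b[3] = 0.5, so the expected
// result is c = {0.5, 1.0, 1.5, 2.25}.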
/*
MOVI2R(R11, (u32)&abc[0]);
MOVI2R(R1, 0x3f800000);
STR(R11, R1, 4 * (32 + 31));
@ -35,9 +61,13 @@ void TestCode::Generate()
VADD(S12, S0, S1);
VSTR(S0, R11, 4 * (32 + 31));
VSTR(S12, R11, 4 * (32 + 31));
*/
//VSTR(S2, R0, 8);
POP(2, R11, _PC); // Yup, this is how you return.
FlushLitPool();
FlushIcache();
//VLDR(S1, R0, 4);
//VADD(S2, S0, S1);
//VSTR(S2, R0, 8);
@ -61,13 +91,24 @@ void ArmEmitterTest()
{
// Disabled for now.
return;
for (int i = 0; i < 6; i++) {
ILOG("--------------------------");
}
ILOG("--------------------------");
ILOG("Running ARM emitter test!");
ILOG("--------------------------");
TestCode gen;
gen.ReserveCodeSpace(0x1000);
const u8 *codeStart = gen.GetCodePtr();
gen.Generate();
DisassembleArm(codeStart, gen.GetCodePtr()-codeStart);
u32 retval = CallPtr(gen.testCodePtr);
ILOG("ARM emitter test 1 passed if %f == 3.0! retval = %08x", abc[32 + 31], retval);
// ILOG("ARM emitter test 1 passed if %f == 3.0! retval = %08x", abc[32 + 31], retval);
ILOG("c: %f %f %f %f", c[0], c[1], c[2], c[3]);
for (int i = 0; i < 6; i++) {
ILOG("--------------------------");
}
// DisassembleArm(codeStart, gen.GetCodePtr()-codeStart);
}