ARM64: Fix FCVTL, use it in vh2f

This commit is contained in:
Henrik Rydgard 2015-03-22 22:42:33 +01:00
parent 8eedcc7fb0
commit acf08eefa8
6 changed files with 115 additions and 37 deletions

View File

@ -2084,14 +2084,13 @@ void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd
(1 << 10) | (Rn << 5) | Rd);
}
void ARM64FloatEmitter::Emit2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
_assert_msg_(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__);
bool quad = IsQuad(Rd);
Rd = DecodeReg(Rd);
Rn = DecodeReg(Rn);
Write32((quad << 30) | (U << 29) | (0x71 << 21) | (size << 22) | \
Write32((Q << 30) | (U << 29) | (0x71 << 21) | (size << 22) | \
(opcode << 12) | (1 << 11) | (Rn << 5) | Rd);
}
@ -2786,27 +2785,27 @@ void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
}
// Vector FABS: two-register-misc with U=0, opcode 0xF.
// The size field is 2 | (size >> 6), i.e. 2 for size 32 and 3 for size 64.
// Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn);
}
// Vector FADD: three-same form with U=0 and opcode 0x1A.
// The size field is size >> 6 (0 for size 32, 1 for size 64).
void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
	const u32 sizeField = size >> 6;
	const u32 opcodeField = 0x1A;
	EmitThreeSame(0, sizeField, opcodeField, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn)
void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn, bool source_upper)
{
Emit2RegMisc(0, size >> 6, 0x17, Rd, Rn);
Emit2RegMisc(source_upper, 0, size >> 6, 0x17, Rd, Rn);
}
// Vector FCVTN (floating-point convert to narrower precision): U=0, opcode 0x16.
// The size field is dest_size >> 5 (0 for dest_size 16, 1 for dest_size 32).
// Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 5, 0x16, Rd, Rn);
}
// Vector FCVTZS: two-register-misc with U=0, opcode 0x1B.
// The size field is 2 | (size >> 6); Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x1B, Rd, Rn);
}
// Vector FCVTZU: same encoding as FCVTZS (opcode 0x1B) but with U=1.
// The size field is 2 | (size >> 6); Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1B, Rd, Rn);
}
void ARM64FloatEmitter::FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
@ -2818,11 +2817,11 @@ void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
}
// Vector FNEG: same opcode as FABS (0xF) but with U=1.
// The size field is 2 | (size >> 6); Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xF, Rd, Rn);
}
// Vector FRSQRTE: two-register-misc with U=1, opcode 0x1D.
// The size field is 2 | (size >> 6); Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x1D, Rd, Rn);
}
void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
@ -2830,7 +2829,7 @@ void ARM64FloatEmitter::FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
}
// Vector NOT: two-register-misc with U=1, size field 0, opcode 5.
// Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::NOT(ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 1, 0, 5, Rd, Rn);
}
void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
@ -2838,28 +2837,28 @@ void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
}
// Vector REV16: two-register-misc with U=0, opcode 1.
// The size field is size >> 4 (0 for size 8, 1 for size 16, 2 for size 32).
// Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn);
}
// Vector REV32: two-register-misc with U=1, opcode 0.
// The size field is size >> 4; Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 1, size >> 4, 0, Rd, Rn);
}
// Vector REV64: two-register-misc with U=0, opcode 0.
// The size field is size >> 4; Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 0, Rd, Rn);
}
// Vector SCVTF (this 3-argument overload): two-register-misc with U=0, opcode 0x1D.
// The size field is size >> 6 (0 for size 32, 1 for size 64).
// Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 0, size >> 6, 0x1D, Rd, Rn);
}
// Vector UCVTF: same encoding as the vector SCVTF (opcode 0x1D) but with U=1.
// The size field is size >> 6; Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 1, size >> 6, 0x1D, Rd, Rn);
}
// Vector XTN (extract narrow): two-register-misc with U=0, opcode 0x12.
// The size field is dest_size >> 4 (0 for dest_size 8, 1 for 16, 2 for 32).
// Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 0, dest_size >> 4, 0x12, Rd, Rn);
}
// Move
@ -3096,7 +3095,7 @@ void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
}
// Vector FCMEQ against zero (two-operand overload).
// Encoded as two-register-misc with U=0, size field 2 | (size >> 6) and
// opcode 0x0D. The previous opcode 0x1D with the same U/size bits is the
// encoding of FRECPE, so the compare was never actually emitted.
// Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0D, Rd, Rn);
}
void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
@ -3104,7 +3103,7 @@ void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
}
// Vector FCMGE against zero (two-operand overload).
// Encoded as two-register-misc with U=1, size field 2 | (size >> 6) and
// opcode 0x0C, i.e. the same opcode as FCMGT (zero) with the U bit set.
// The previous opcode 0x1C with these U/size bits encodes URSQRTE instead.
// Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0x0C, Rd, Rn);
}
void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
@ -3112,15 +3111,15 @@ void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
}
// Vector FCMGT against zero (two-operand overload): U=0, opcode 0x0C.
// The size field is 2 | (size >> 6); Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0x0C, Rd, Rn);
}
// Vector FCMLE against zero: U=1, opcode 0xD.
// The size field is 2 | (size >> 6); Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 1, 2 | (size >> 6), 0xD, Rd, Rn);
}
// Vector FCMLT against zero: U=0, opcode 0xE.
// The size field is 2 | (size >> 6); Q is taken from whether Rd is a quad register.
void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
	Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
}
void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)

View File

@ -796,7 +796,7 @@ public:
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn, bool source_upper = false);
void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn);
@ -889,7 +889,7 @@ private:
void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
void Emit2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn);
void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);

View File

@ -2151,9 +2151,6 @@ namespace MIPSComp
MOVI2F(S0, 1.0f, SCRATCHREG1);
for (int i = 0; i < n; ++i) {
fpr.MapDirtyInV(tempregs[i], sregs[i]);
// Let's do it integer registers for now. NEON later.
// There's gotta be a shorter way, can't find one though that takes
// care of NaNs like the interpreter (ignores them and just operates on the bits).
VSUB(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
}

View File

@ -851,11 +851,93 @@ namespace MIPSComp
}
// JIT compilation of the VFPU vi2f op: converts each element of the source
// vector with SCVTF and, when the scale is not 1, multiplies the result by
// 1 / 2^imm, where imm is the 5-bit field at bits 16..20 of the opcode.
void Arm64Jit::Comp_Vi2f(MIPSOpcode op) {
// NOTE(review): this DISABLE sits directly in front of CONDITIONAL_DISABLE. If
// DISABLE bails out unconditionally (as its use in the HasUnknownPrefix branch
// below suggests), nothing after it runs - confirm whether it is a leftover
// from the earlier stub or intentional.
DISABLE;
CONDITIONAL_DISABLE;
if (js.HasUnknownPrefix()) {
DISABLE;
}
VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);
// Scale factor: 1 / 2^imm.
int imm = (op >> 16) & 0x1f;
const float mult = 1.0f / (float)(1UL << imm);
u8 sregs[4], dregs[4];
GetVectorRegsPrefixS(sregs, sz, _VS);
GetVectorRegsPrefixD(dregs, sz, _VD);
// Write into a temp register whenever IsOverlapSafe reports that writing
// dregs[i] directly is not safe with respect to the source registers.
MIPSReg tempregs[4];
for (int i = 0; i < n; ++i) {
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
tempregs[i] = fpr.GetTempV();
} else {
tempregs[i] = dregs[i];
}
}
// S0 holds the scale factor; it is only loaded when a multiply is needed.
if (mult != 1.0f)
fp.MOVI2F(S0, mult, SCRATCH1);
// TODO: Use the SCVTF with builtin scaling where possible.
for (int i = 0; i < n; i++) {
fpr.MapDirtyInV(tempregs[i], sregs[i]);
fp.SCVTF(fpr.V(tempregs[i]), fpr.V(sregs[i]));
if (mult != 1.0f)
fp.FMUL(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S0);
}
// Copy results that went through a temp into their real destination.
for (int i = 0; i < n; ++i) {
if (dregs[i] != tempregs[i]) {
fpr.MapDirtyInV(dregs[i], tempregs[i]);
fp.FMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
}
}
ApplyPrefixD(dregs, sz);
fpr.ReleaseSpillLocksAndDiscardTemps();
}
// JIT compilation of the VFPU vh2f op: expands packed 16-bit floats into
// 32-bit floats. A single input produces a pair and a pair produces a quad;
// any other input size is handed to DISABLE.
void Arm64Jit::Comp_Vh2f(MIPSOpcode op) {
	// NOTE(review): this DISABLE sits directly in front of CONDITIONAL_DISABLE.
	// If DISABLE bails out unconditionally (as its use in the branches below
	// suggests), nothing after it runs - confirm whether it is a leftover from
	// the earlier stub or intentional.
	DISABLE;
	CONDITIONAL_DISABLE;
	if (js.HasUnknownPrefix()) {
		DISABLE;
	}

	u8 sregs[4], dregs[4];
	VectorSize sz = GetVecSize(op);
	VectorSize outSz;
	switch (sz) {
	case V_Single:
		outSz = V_Pair;
		break;
	case V_Pair:
		outSz = V_Quad;
		break;
	default:
		DISABLE;
	}

	int n = GetNumVectorElements(sz);
	int nOut = n * 2;
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outSz, _VD);

	// Take the single registers and combine them to a D register.
	for (int i = 0; i < n; i++) {
		// NOTE(review): the second argument is a VectorSize, while the other
		// MapRegV call in this function passes MAP_* flags - confirm the
		// intended flags for a read-only source mapping.
		fpr.MapRegV(sregs[i], sz);
		fp.INS(32, Q0, i, fpr.V(sregs[i]), 0);
	}

	// Convert four 16-bit floats in D0 to four 32-bit floats in Q0 (even if we only have two...)
	fp.FCVTL(32, Q0, D0);

	// Split apart again.
	for (int i = 0; i < nOut; i++) {
		fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
		fp.INS(32, fpr.V(dregs[i]), 0, Q0, i);
	}

	// dregs was built for outSz (nOut elements), so the destination prefix must
	// be applied over the output size, not the input size.
	ApplyPrefixD(dregs, outSz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}
void Arm64Jit::Comp_Vf2i(MIPSOpcode op) {
@ -1679,9 +1761,6 @@ namespace MIPSComp
fp.MOVI2F(S0, 1.0f, SCRATCH1);
for (int i = 0; i < n; ++i) {
fpr.MapDirtyInV(tempregs[i], sregs[i]);
// Let's do it integer registers for now. NEON later.
// There's gotta be a shorter way, can't find one though that takes
// care of NaNs like the interpreter (ignores them and just operates on the bits).
fp.FSUB(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
}

View File

@ -428,7 +428,8 @@ static void FPandASIMD1(uint32_t w, uint64_t addr, Instruction *instr) {
break;
case 2:
if (((w >> 17) & 0xf) == 0) {
snprintf(instr->text, sizeof(instr->text), "(asimd two-reg misc %08x)", w);
// Very similar to scalar two-reg misc. can we share code?
snprintf(instr->text, sizeof(instr->text), "(asimd vector two-reg misc %08x)", w);
} else if (((w >> 17) & 0xf) == 1) {
snprintf(instr->text, sizeof(instr->text), "(asimd across lanes %08x)", w);
} else {
@ -463,7 +464,7 @@ static void FPandASIMD1(uint32_t w, uint64_t addr, Instruction *instr) {
}
int index;
if ((size & 1) == 0) {
index = (H << 1) | L;
index = (H << 1) | (int)L;
} else {
index = H;
}

View File

@ -39,10 +39,12 @@ bool TestArm64Emitter() {
//emitter.EXTR(W1, W3, 0, 7);
//RET(CheckLast(emitter, "53033061 extr w1, w3, w7"));
//fp.FCVTL(32, Q6, D25);
//RET(CheckLast(emitter, "4fa29820 fcvtl q6, d25")); // A real disasm says fmla v0.2s, v1.2s, v2.s[1] but I think our way is more readable
fp.FMUL(32, Q0, Q1, Q2, 3);
RET(CheckLast(emitter, "4fa29820 fmul q0, q1, q2.4s[3]")); // A real disasm says fmla v0.2s, v1.2s, v2.s[1] but I think our way is more readable
fp.FMLA(32, D0, D1, D2, 1);
RET(CheckLast(emitter, "1e222c20 fmla d0, d1, d2.2s[1]"));
RET(CheckLast(emitter, "0fa21020 fmla d0, d1, d2.2s[1]"));
fp.FCSEL(S0, S1, S2, CC_CS);
RET(CheckLast(emitter, "1e222c20 fcsel s0, s1, s2, cs"));
float value = 1.0;
@ -52,7 +54,7 @@ bool TestArm64Emitter() {
RET(CheckLast(emitter, "1e2e1007 fmov s7, #1.000000"));
FPImm8FromFloat(-value, &imm8);
fp.FMOV(S7, imm8);
RET(CheckLast(emitter, "0fa21020 fmov s7, #-1.000000"));
RET(CheckLast(emitter, "1e3e1007 fmov s7, #-1.000000"));
fp.FMADD(S1, S2, S3, S4);
RET(CheckLast(emitter, "1f031041 fmadd s1, s2, s3, s4"));
fp.FNMSUB(D1, D2, D3, D4);