Merge pull request #18073 from unknownbrackets/arm64jit-vec4
arm64jit: Implement several other Vec4 IR ops
This commit is contained in:
commit 6c3547d7ae

.github/workflows/build.yml (vendored): 2 lines changed
@@ -229,6 +229,8 @@ jobs:
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@v1.2
+        # Disable ccache on macos for now, it's become buggy for some reason.
+        if: matrix.id != 'macos'
         with:
           key: ${{ matrix.id }}
@@ -2128,6 +2128,13 @@ void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd
 		(1 << 10) | (Rn << 5) | Rd);
 }

+void ARM64FloatEmitter::EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) {
+	Rd = DecodeReg(Rd);
+	Rn = DecodeReg(Rn);
+
+	Write32((1 << 30) | (U << 29) | (0b111100011 << 20) | (size << 22) | (opcode << 12) | (1 << 11) | (Rn << 5) | Rd);
+}
+
 void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
 {
 	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__);
@@ -2906,6 +2913,22 @@ void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn)
 	EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn);
 }

+// Scalar - pairwise
+void ARM64FloatEmitter::FADDP(ARM64Reg Rd, ARM64Reg Rn) {
+	EmitScalarPairwise(1, IsDouble(Rd), 0b01101, Rd, Rn);
+}
+void ARM64FloatEmitter::FMAXP(ARM64Reg Rd, ARM64Reg Rn) {
+	EmitScalarPairwise(1, IsDouble(Rd), 0b01111, Rd, Rn);
+}
+void ARM64FloatEmitter::FMINP(ARM64Reg Rd, ARM64Reg Rn) {
+	EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01111, Rd, Rn);
+}
+void ARM64FloatEmitter::FMAXNMP(ARM64Reg Rd, ARM64Reg Rn) {
+	EmitScalarPairwise(1, IsDouble(Rd), 0b01100, Rd, Rn);
+}
+void ARM64FloatEmitter::FMINNMP(ARM64Reg Rd, ARM64Reg Rn) {
+	EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01100, Rd, Rn);
+}

 // Scalar - 2 Source
 void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
@@ -3023,6 +3046,9 @@ void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
 	EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm);
 }
+void ARM64FloatEmitter::FADDP(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
+	EmitThreeSame(1, size >> 6, 0x1A, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
 	EmitThreeSame(0, size >> 6, 0x1E, Rd, Rn, Rm);
@@ -3290,6 +3316,95 @@ void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
 	EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn);
 }

+void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) {
+	Rd = DecodeReg(Rd);
+	u8 abc = abcdefgh >> 5;
+	u8 defgh = abcdefgh & 0x1F;
+	Write32((Q << 30) | (op << 29) | (0xF << 24) | (abc << 16) | (cmode << 12) | (o2 << 11) | (1 << 10) | (defgh << 5) | Rd);
+}
+
+void ARM64FloatEmitter::FMOV(u8 size, ARM64Reg Rd, u8 imm8) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
+	_assert_msg_(size == 32 || size == 64, "%s: unsupported size", __FUNCTION__);
+	_assert_msg_(IsQuad(Rd) || size == 32, "Use non-SIMD FMOV to load one double imm8");
+	EncodeModImm(IsQuad(Rd), size >> 6, 0b1111, 0, Rd, imm8);
+}
+
+void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift, bool MSL) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
+	_assert_msg_(size == 8 || size == 16 || size == 32 || size == 64, "%s: unsupported size %d", __FUNCTION__, size);
+	_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
+	_assert_msg_(!MSL || (size == 32 && shift > 0 && shift <= 16), "MOVI MSL shift requires size 32, shift must be 8 or 16");
+	_assert_msg_(size != 64 || shift == 0, "MOVI 64-bit imm cannot be shifted");
+
+	u8 cmode = 0;
+	if (size == 8)
+		cmode = 0b1110;
+	else if (size == 16)
+		cmode = 0b1000 | (shift >> 2);
+	else if (MSL)
+		cmode = 0b1100 | (shift >> 3);
+	else if (size == 32)
+		cmode = (shift >> 2);
+	else if (size == 64)
+		cmode = 0b1110;
+	else
+		_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
+
+	EncodeModImm(IsQuad(Rd), size >> 6, cmode, 0, Rd, imm8);
+}
+
+void ARM64FloatEmitter::MVNI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift, bool MSL) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
+	_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
+	_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
+	_assert_msg_(!MSL || (size == 32 && shift > 0 && shift <= 16), "MVNI MSL shift requires size 32, shift must be 8 or 16");
+
+	u8 cmode = 0;
+	if (size == 16)
+		cmode = 0b1000 | (shift >> 2);
+	else if (MSL)
+		cmode = 0b1100 | (shift >> 3);
+	else if (size == 32)
+		cmode = (shift >> 2);
+	else
+		_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
+
+	EncodeModImm(IsQuad(Rd), 1, cmode, 0, Rd, imm8);
+}
+
+void ARM64FloatEmitter::ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
+	_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
+	_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
+
+	u8 cmode = 0;
+	if (size == 16)
+		cmode = 0b1001 | (shift >> 2);
+	else if (size == 32)
+		cmode = 0b0001 | (shift >> 2);
+	else
+		_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
+
+	EncodeModImm(IsQuad(Rd), 0, cmode, 0, Rd, imm8);
+}
+
+void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
+	_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
+	_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
+
+	u8 cmode = 0;
+	if (size == 16)
+		cmode = 0b1001 | (shift >> 2);
+	else if (size == 32)
+		cmode = 0b0001 | (shift >> 2);
+	else
+		_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
+
+	EncodeModImm(IsQuad(Rd), 1, cmode, 0, Rd, imm8);
+}
+
 // One source
 void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn)
 {
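A small hypothetical scalar model (not emitter code) of what these 32-bit modified-immediate forms expand to per element may help when reading the cmode selection above; the helper name and the example values below are illustrative, not part of the emitter.

// Hypothetical helper: the per-element value a 32-bit MOVI/MVNI immediate expands to,
// assuming the usual AdvSIMD semantics. MOVI writes imm8 shifted left by `shift`; with
// MSL the vacated low bits are filled with ones; MVNI writes the bitwise NOT of that value.
#include <cstdint>

static uint32_t ModImmElement32(uint8_t imm8, unsigned shift, bool MSL, bool invert) {
	uint32_t v = uint32_t(imm8) << shift;
	if (MSL)
		v |= (1u << shift) - 1;  // shift ones in from the bottom
	return invert ? ~v : v;
}

// Examples (illustrative only):
//   ModImmElement32(0x80, 24, false, false) == 0x80000000  // e.g. MOVI(32, Rd, 0x80, 24), a sign-bit mask
//   ModImmElement32(0x80, 24, false, true)  == 0x7FFFFFFF  // e.g. MVNI(32, Rd, 0x80, 24)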
@@ -3918,17 +4033,32 @@ void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool
 }

 // TODO: Quite a few values could be generated easily using the MOVI instruction and friends.
-void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) {
+void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
 	// TODO: Make it work with more element sizes
 	// TODO: Optimize - there are shorter solution for many values
-	ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd));
 	int ival;
 	memcpy(&ival, &value, 4);
+	uint8_t imm8;
 	if (ival == 0) { // Make sure to not catch negative zero here
-		EOR(Rd, Rd, Rd);
+		// Prefer MOVI 0, which may have no latency on some CPUs.
+		MOVI(32, Rd, 0);
+		if (negate)
+			FNEG(32, Rd, Rd);
+	} else if (negate && FPImm8FromFloat(-value, &imm8)) {
+		FMOV(32, Rd, imm8);
+	} else if (FPImm8FromFloat(value, &imm8)) {
+		FMOV(32, Rd, imm8);
+		if (negate) {
+			FNEG(32, Rd, Rd);
+		}
 	} else {
-		MOVI2F(s, value, scratch);
-		DUP(32, Rd, Rd, 0);
+		_assert_msg_(scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value);
+		if (negate) {
+			ival ^= 0x80000000;
+		}
+		m_emit->MOVI2R(scratch, ival);
+		DUP(32, Rd, scratch);
 	}
 }
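The `ival == 0` test above is a bit-pattern check, so it deliberately excludes -0.0f, which MOVI #0 alone cannot produce. A minimal standalone sketch (not emitter code) of that distinction:

// +0.0f has all bits clear and can be materialized with MOVI #0, while -0.0f has the
// sign bit set and still needs an FMOV immediate or an FNEG afterwards.
#include <cstdio>
#include <cstring>

static bool IsPositiveZeroBits(float value) {
	int ival;
	memcpy(&ival, &value, 4);
	return ival == 0;
}

int main() {
	printf("%d %d\n", IsPositiveZeroBits(0.0f), IsPositiveZeroBits(-0.0f));  // prints: 1 0
}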
@@ -820,6 +820,13 @@ public:
 	void FSQRT(ARM64Reg Rd, ARM64Reg Rn);
 	void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP

+	// Scalar - pairwise
+	void FADDP(ARM64Reg Rd, ARM64Reg Rn);
+	void FMAXP(ARM64Reg Rd, ARM64Reg Rn);
+	void FMINP(ARM64Reg Rd, ARM64Reg Rn);
+	void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn);
+	void FMINNMP(ARM64Reg Rd, ARM64Reg Rn);
+
 	// Scalar - 2 Source
 	void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -847,6 +854,7 @@ public:
 	void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
 	void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
 	void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void FADDP(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -893,6 +901,14 @@ public:
 	void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
 	void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);

+	// Vector immediates
+	void FMOV(u8 size, ARM64Reg Rd, u8 imm8);
+	// MSL means bits shifted in are 1s. For size=64, each bit of imm8 is expanded to 8 actual bits.
+	void MOVI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0, bool MSL = false);
+	void MVNI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0, bool MSL = false);
+	void ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
+	void BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
+
 	// One source
 	void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);

@@ -958,7 +974,7 @@ public:
 	void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);

 	void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
-	void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
+	void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);

 	// ABI related
 	void ABI_PushRegisters(uint32_t gpr_registers, uint32_t fp_registers);
@@ -973,6 +989,7 @@ private:
 	void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
+	void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
 	void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
 	void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn);
 	void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
@@ -994,6 +1011,7 @@ private:
 	void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, int opcode);
 	void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
 	void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+	void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);

 	void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
 	void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
@@ -40,6 +40,10 @@ namespace MIPSComp {
 using namespace Arm64Gen;
 using namespace Arm64IRJitConstants;

+static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {
+	return r1 < r2 + l2 && r1 + l1 > r2;
+}
+
 void Arm64JitBackend::CompIR_VecArith(IRInst inst) {
 	CONDITIONAL_DISABLE;

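Overlap() is a plain half-open interval intersection over IR register numbers. A quick standalone check (the register numbers below are made up for illustration, and IRReg is replaced by int to keep the sketch self-contained):

#include <cassert>

static bool Overlap(int r1, int l1, int r2, int l2) {
	return r1 < r2 + l2 && r1 + l1 > r2;  // [r1, r1+l1) intersects [r2, r2+l2)
}

int main() {
	assert(Overlap(4, 4, 6, 1));    // a scalar at reg 6 lies inside a Vec4 at regs 4..7
	assert(!Overlap(4, 4, 8, 1));   // a scalar at reg 8 is just past that Vec4
	assert(Overlap(4, 4, 2, 4));    // Vec4s at 2..5 and 4..7 share regs 4 and 5
}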
@@ -65,7 +69,17 @@ void Arm64JitBackend::CompIR_VecArith(IRInst inst) {
 		break;

 	case IROp::Vec4Scale:
-		CompIR_Generic(inst);
+		if (Overlap(inst.dest, 4, inst.src2, 1) || Overlap(inst.src1, 4, inst.src2, 1)) {
+			// ARM64 can handle this, but we have to map specially.
+			regs_.SpillLockFPR(inst.dest, inst.src1);
+			regs_.MapVec4(inst.src1);
+			regs_.MapVec4(inst.src2 & ~3);
+			regs_.MapVec4(inst.dest, MIPSMap::NOINIT);
+			fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2 & ~3), inst.src2 & 3);
+		} else {
+			regs_.Map(inst);
+			fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2), 0);
+		}
 		break;

 	case IROp::Vec4Neg:
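Vec4Scale multiplies a whole vector by one scalar; the indexed FMUL form above does that in a single instruction, and the `& ~3` / `& 3` split locates the scalar's lane inside its 4-aligned quad. An illustrative scalar model (not emitter code), assuming the usual by-element semantics:

#include <array>

using Vec4 = std::array<float, 4>;

// Every lane of n is multiplied by lane `index` of m, as FMUL (by element) does.
static Vec4 fmul_by_element(const Vec4 &n, const Vec4 &m, int index) {
	return { n[0] * m[index], n[1] * m[index], n[2] * m[index], n[3] * m[index] };
}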
@@ -370,7 +384,30 @@ void Arm64JitBackend::CompIR_VecAssign(IRInst inst) {

 	switch (inst.op) {
 	case IROp::Vec4Init:
-		CompIR_Generic(inst);
+		regs_.Map(inst);
+		switch (Vec4Init(inst.src1)) {
+		case Vec4Init::AllZERO:
+			fp_.MOVI(32, regs_.FQ(inst.dest), 0);
+			break;
+
+		case Vec4Init::AllONE:
+		case Vec4Init::AllMinusONE:
+			fp_.MOVI2FDUP(regs_.FQ(inst.dest), 1.0f, INVALID_REG, Vec4Init(inst.src1) == Vec4Init::AllMinusONE);
+			break;
+
+		case Vec4Init::Set_1000:
+		case Vec4Init::Set_0100:
+		case Vec4Init::Set_0010:
+		case Vec4Init::Set_0001:
+			fp_.MOVI(32, regs_.FQ(inst.dest), 0);
+			fp_.MOVI2FDUP(EncodeRegToQuad(SCRATCHF1), 1.0f);
+			fp_.INS(32, regs_.FQ(inst.dest), inst.src1 - (int)Vec4Init::Set_1000, EncodeRegToQuad(SCRATCHF1), inst.src1 - (int)Vec4Init::Set_1000);
+			break;
+
+		default:
+			_assert_msg_(false, "Unexpected Vec4Init value %d", inst.src1);
+			DISABLE;
+		}
 		break;

 	case IROp::Vec4Shuffle:
@@ -392,7 +429,138 @@ void Arm64JitBackend::CompIR_VecAssign(IRInst inst) {
 		break;

 	case IROp::Vec4Blend:
-		CompIR_Generic(inst);
+		regs_.Map(inst);
+		if (inst.src1 == inst.src2) {
+			// Shouldn't really happen, just making sure the below doesn't have to think about it.
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			break;
+		}
+
+		// To reduce overlap cases to consider, let's inverse src1/src2 if dest == src2.
+		// Thus, dest could be src1, but no other overlap is possible.
+		if (inst.dest == inst.src2) {
+			std::swap(inst.src1, inst.src2);
+			inst.constant ^= 0xF;
+		}
+
+		switch (inst.constant & 0xF) {
+		case 0b0000:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			break;
+
+		case 0b0001:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);
+			break;
+
+		case 0b0010:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);
+			break;
+
+		case 0b0011:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(64, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);
+			break;
+
+		case 0b0100:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src2), 2);
+			break;
+
+		case 0b0101:
+			// To get AbCd: REV64 to BADC, then TRN2 xAxC, xbxd.
+			fp_.REV64(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src2));
+			fp_.TRN2(32, regs_.FQ(inst.dest), EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
+			break;
+
+		case 0b0110:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);
+			fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src2), 2);
+			break;
+
+		case 0b0111:
+			if (inst.dest != inst.src1) {
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src1), 3);
+			} else {
+				fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 3, EncodeRegToQuad(SCRATCHF1), 3);
+			}
+			break;
+
+		case 0b1000:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src2), 3);
+			break;
+
+		case 0b1001:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);
+			fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src2), 3);
+			break;
+
+		case 0b1010:
+			// To get aBcD: REV64 to badc, then TRN2 xaxc, xBxD.
+			fp_.REV64(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.TRN2(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+			break;
+
+		case 0b1011:
+			if (inst.dest != inst.src1) {
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src1), 2);
+			} else {
+				fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 2, EncodeRegToQuad(SCRATCHF1), 2);
+			}
+			break;
+
+		case 0b1100:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(64, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);
+			break;
+
+		case 0b1101:
+			if (inst.dest != inst.src1) {
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src1), 1);
+			} else {
+				fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 1, EncodeRegToQuad(SCRATCHF1), 1);
+			}
+			break;
+
+		case 0b1110:
+			if (inst.dest != inst.src1) {
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src1), 0);
+			} else {
+				fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 0, EncodeRegToQuad(SCRATCHF1), 0);
+			}
+			break;
+
+		case 0b1111:
+			if (inst.dest != inst.src2)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+			break;
+		}
 		break;

 	case IROp::Vec4Mov:
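The REV64 + TRN2 sequences used for the 0b0101 and 0b1010 masks are the only non-obvious cases above. An illustrative scalar model (not emitter code), assuming 32-bit lanes, where REV64 swaps lanes within each 64-bit half and TRN2 interleaves the odd lanes of its two inputs:

#include <array>

using Vec4 = std::array<float, 4>;

static Vec4 rev64_32(const Vec4 &v) { return { v[1], v[0], v[3], v[2] }; }                 // REV64, 32-bit lanes
static Vec4 trn2_32(const Vec4 &n, const Vec4 &m) { return { n[1], m[1], n[3], m[3] }; }   // TRN2, 32-bit lanes

// Blend mask 0b0101: take src2 in lanes 0 and 2, src1 in lanes 1 and 3 ("AbCd" in the
// code comment's notation, lowercase = src1, uppercase = src2).
static Vec4 blend_0101(const Vec4 &src1, const Vec4 &src2) {
	Vec4 tmp = rev64_32(src2);   // [B, A, D, C]
	return trn2_32(tmp, src1);   // [A, b, C, d]
}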
@@ -428,7 +596,22 @@ void Arm64JitBackend::CompIR_VecHoriz(IRInst inst) {

 	switch (inst.op) {
 	case IROp::Vec4Dot:
-		CompIR_Generic(inst);
+		if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4)) {
+			// To avoid overlap problems, map a little carefully.
+			regs_.SpillLockFPR(inst.src1, inst.src2);
+			regs_.MapVec4(inst.src1);
+			regs_.MapVec4(inst.src2);
+			// It must overlap, so inst.dest is already mapped.
+			fp_.FMUL(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1), regs_.FQ(inst.src2));
+			fp_.FADDP(32, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));
+			fp_.FADDP(32, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));
+			fp_.INS(32, regs_.FQ(inst.dest & ~3), inst.dest & 3, EncodeRegToQuad(SCRATCHF1), 0);
+		} else {
+			regs_.Map(inst);
+			fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2));
+			fp_.FADDP(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.dest));
+			fp_.FADDP(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.dest));
+		}
 		break;

 	default:
@@ -174,14 +174,18 @@ void RiscVJitBackend::CompIR_VecAssign(IRInst inst) {
 		regs_.Map(inst);
 		for (int i = 0; i < 4; ++i) {
 			int which = (inst.constant >> i) & 1;
-			FMV(32, regs_.F(inst.dest + i), regs_.F((which ? inst.src2 : inst.src1) + i));
+			IRReg srcReg = which ? inst.src2 : inst.src1;
+			if (inst.dest != srcReg)
+				FMV(32, regs_.F(inst.dest + i), regs_.F(srcReg + i));
 		}
 		break;

 	case IROp::Vec4Mov:
-		regs_.Map(inst);
-		for (int i = 0; i < 4; ++i)
-			FMV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
+		if (inst.dest != inst.src1) {
+			regs_.Map(inst);
+			for (int i = 0; i < 4; ++i)
+				FMV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
+		}
 		break;

 	default: