Merge pull request #18073 from unknownbrackets/arm64jit-vec4

arm64jit: Implement several other Vec4 IR ops
Commit 6c3547d7ae by Henrik Rydgård, 2023-09-06 08:55:57 +02:00, committed by GitHub.
5 changed files with 351 additions and 14 deletions


@@ -229,6 +229,8 @@ jobs:
- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
# Disable ccache on macos for now, it's become buggy for some reason.
if: matrix.id != 'macos'
with:
key: ${{ matrix.id }}


@@ -2128,6 +2128,13 @@ void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd
(1 << 10) | (Rn << 5) | Rd);
}
void ARM64FloatEmitter::EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) {
Rd = DecodeReg(Rd);
Rn = DecodeReg(Rn);
Write32((1 << 30) | (U << 29) | (0b111100011 << 20) | (size << 22) | (opcode << 12) | (1 << 11) | (Rn << 5) | Rd);
}
void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__);
@@ -2906,6 +2913,22 @@ void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn)
EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn);
}
// Scalar - pairwise
void ARM64FloatEmitter::FADDP(ARM64Reg Rd, ARM64Reg Rn) {
EmitScalarPairwise(1, IsDouble(Rd), 0b01101, Rd, Rn);
}
void ARM64FloatEmitter::FMAXP(ARM64Reg Rd, ARM64Reg Rn) {
EmitScalarPairwise(1, IsDouble(Rd), 0b01111, Rd, Rn);
}
void ARM64FloatEmitter::FMINP(ARM64Reg Rd, ARM64Reg Rn) {
EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01111, Rd, Rn);
}
void ARM64FloatEmitter::FMAXNMP(ARM64Reg Rd, ARM64Reg Rn) {
EmitScalarPairwise(1, IsDouble(Rd), 0b01100, Rd, Rn);
}
void ARM64FloatEmitter::FMINNMP(ARM64Reg Rd, ARM64Reg Rn) {
EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01100, Rd, Rn);
}
// Scalar - 2 Source
void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
@@ -3023,6 +3046,9 @@ void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FADDP(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
EmitThreeSame(1, size >> 6, 0x1A, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, size >> 6, 0x1E, Rd, Rn, Rm);
@@ -3290,6 +3316,95 @@ void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn);
}
void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) {
Rd = DecodeReg(Rd);
u8 abc = abcdefgh >> 5;
u8 defgh = abcdefgh & 0x1F;
Write32((Q << 30) | (op << 29) | (0xF << 24) | (abc << 16) | (cmode << 12) | (o2 << 11) | (1 << 10) | (defgh << 5) | Rd);
}
void ARM64FloatEmitter::FMOV(u8 size, ARM64Reg Rd, u8 imm8) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
_assert_msg_(size == 32 || size == 64, "%s: unsupported size", __FUNCTION__);
_assert_msg_(IsQuad(Rd) || size == 32, "Use non-SIMD FMOV to load one double imm8");
EncodeModImm(IsQuad(Rd), size >> 6, 0b1111, 0, Rd, imm8);
}
void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift, bool MSL) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
_assert_msg_(size == 8 || size == 16 || size == 32 || size == 64, "%s: unsupported size %d", __FUNCTION__, size);
_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
_assert_msg_(!MSL || (size == 32 && shift > 0 && shift <= 16), "MOVI MSL shift requires size 32, shift must be 8 or 16");
_assert_msg_(size != 64 || shift == 0, "MOVI 64-bit imm cannot be shifted");
u8 cmode = 0;
if (size == 8)
cmode = 0b1110;
else if (size == 16)
cmode = 0b1000 | (shift >> 2);
else if (MSL)
cmode = 0b1100 | (shift >> 3);
else if (size == 32)
cmode = (shift >> 2);
else if (size == 64)
cmode = 0b1110;
else
_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
EncodeModImm(IsQuad(Rd), size >> 6, cmode, 0, Rd, imm8);
}
void ARM64FloatEmitter::MVNI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift, bool MSL) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
_assert_msg_(!MSL || (size == 32 && shift > 0 && shift <= 16), "MVNI MSL shift requires size 32, shift must be 8 or 16");
u8 cmode = 0;
if (size == 16)
cmode = 0b1000 | (shift >> 2);
else if (MSL)
cmode = 0b1100 | (shift >> 3);
else if (size == 32)
cmode = (shift >> 2);
else
_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
EncodeModImm(IsQuad(Rd), 1, cmode, 0, Rd, imm8);
}
void ARM64FloatEmitter::ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
u8 cmode = 0;
if (size == 16)
cmode = 0b1001 | (shift >> 2);
else if (size == 32)
cmode = 0b0001 | (shift >> 2);
else
_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
EncodeModImm(IsQuad(Rd), 0, cmode, 0, Rd, imm8);
}
void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
u8 cmode = 0;
if (size == 16)
cmode = 0b1001 | (shift >> 2);
else if (size == 32)
cmode = 0b0001 | (shift >> 2);
else
_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
EncodeModImm(IsQuad(Rd), 1, cmode, 0, Rd, imm8);
}
// One source
void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn)
{
@@ -3918,17 +4033,32 @@ void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool
}
// TODO: Quite a few values could be generated easily using the MOVI instruction and friends.
void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) {
void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) {
_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
// TODO: Make it work with more element sizes
// TODO: Optimize - there are shorter solutions for many values
ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd));
int ival;
memcpy(&ival, &value, 4);
uint8_t imm8;
if (ival == 0) { // Make sure to not catch negative zero here
EOR(Rd, Rd, Rd);
// Prefer MOVI 0, which may have no latency on some CPUs.
MOVI(32, Rd, 0);
if (negate)
FNEG(32, Rd, Rd);
} else if (negate && FPImm8FromFloat(-value, &imm8)) {
FMOV(32, Rd, imm8);
} else if (FPImm8FromFloat(value, &imm8)) {
FMOV(32, Rd, imm8);
if (negate) {
FNEG(32, Rd, Rd);
}
} else {
MOVI2F(s, value, scratch);
DUP(32, Rd, Rd, 0);
_assert_msg_(scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value);
if (negate) {
ival ^= 0x80000000;
}
m_emit->MOVI2R(scratch, ival);
DUP(32, Rd, scratch);
}
}
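
For reference, a usage sketch of the updated MOVI2FDUP path. This is illustrative only (not part of the commit); it assumes an ARM64FloatEmitter instance named fp, Q0 as the destination, and W0 free as a scratch GPR:

```cpp
// Splat 0.0f across the quad: now emits MOVI v0.4s, #0 and needs no scratch.
fp.MOVI2FDUP(Q0, 0.0f);
// Splat -1.0f via 1.0f with negate=true: -1.0f is FMOV-imm8 encodable, so a single FMOV v0.4s, #-1.0.
fp.MOVI2FDUP(Q0, 1.0f, INVALID_REG, true);
// Splat a value with no imm8 encoding: falls back to MOVI2R into W0 and then DUP v0.4s, w0,
// so a scratch GPR is required here (the old path went through MOVI2F plus a lane DUP).
fp.MOVI2FDUP(Q0, 0.1f, W0);
```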


@@ -820,6 +820,13 @@ public:
void FSQRT(ARM64Reg Rd, ARM64Reg Rn);
void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP
// Scalar - pairwise
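// These reduce a pair of elements from Rn into scalar Rd, e.g. scalar FADDP computes Rn[0] + Rn[1].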
void FADDP(ARM64Reg Rd, ARM64Reg Rn);
void FMAXP(ARM64Reg Rd, ARM64Reg Rn);
void FMINP(ARM64Reg Rd, ARM64Reg Rn);
void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn);
void FMINNMP(ARM64Reg Rd, ARM64Reg Rn);
// Scalar - 2 Source
void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -847,6 +854,7 @@ public:
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FADDP(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -893,6 +901,14 @@ public:
void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
// Vector immediates
void FMOV(u8 size, ARM64Reg Rd, u8 imm8);
// MSL means bits shifted in are 1s. For size=64, each bit of imm8 is expanded to 8 actual bits.
void MOVI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0, bool MSL = false);
void MVNI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0, bool MSL = false);
void ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
void BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
// One source
void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);
@@ -958,7 +974,7 @@ public:
void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
// ABI related
void ABI_PushRegisters(uint32_t gpr_registers, uint32_t fp_registers);
@@ -973,6 +989,7 @@ private:
void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn);
void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
@@ -994,6 +1011,7 @@ private:
void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, int opcode);
void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
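
A brief illustration of the per-lane patterns these modified-immediate helpers encode, following the standard AArch64 modified-immediate expansion (illustrative calls only, assuming an emitter instance fp and destination Q0):

```cpp
fp.MOVI(32, Q0, 0xFF, 8);        // LSL form: each 32-bit lane = 0x0000FF00
fp.MOVI(32, Q0, 0xFF, 8, true);  // MSL form: shifted-in bits are ones, lane = 0x0000FFFF
fp.MOVI(64, Q0, 0b01010101);     // each imm8 bit expands to a byte: lane = 0x00FF00FF00FF00FF
fp.MVNI(32, Q0, 0xFF);           // bitwise inverse of the MOVI pattern: lane = 0xFFFFFF00
fp.ORR(32, Q0, 0xFF, 16);        // OR each lane with 0x00FF0000; BIC clears those bits instead
```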


@@ -40,6 +40,10 @@ namespace MIPSComp {
using namespace Arm64Gen;
using namespace Arm64IRJitConstants;
static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {
return r1 < r2 + l2 && r1 + l1 > r2;
}
void Arm64JitBackend::CompIR_VecArith(IRInst inst) {
CONDITIONAL_DISABLE;
@@ -65,7 +69,17 @@ void Arm64JitBackend::CompIR_VecArith(IRInst inst) {
break;
case IROp::Vec4Scale:
CompIR_Generic(inst);
if (Overlap(inst.dest, 4, inst.src2, 1) || Overlap(inst.src1, 4, inst.src2, 1)) {
// ARM64 can handle this, but we have to map specially.
regs_.SpillLockFPR(inst.dest, inst.src1);
regs_.MapVec4(inst.src1);
regs_.MapVec4(inst.src2 & ~3);
regs_.MapVec4(inst.dest, MIPSMap::NOINIT);
fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2 & ~3), inst.src2 & 3);
} else {
regs_.Map(inst);
fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2), 0);
}
break;
case IROp::Vec4Neg:
@@ -370,7 +384,30 @@ void Arm64JitBackend::CompIR_VecAssign(IRInst inst) {
switch (inst.op) {
case IROp::Vec4Init:
CompIR_Generic(inst);
regs_.Map(inst);
switch (Vec4Init(inst.src1)) {
case Vec4Init::AllZERO:
fp_.MOVI(32, regs_.FQ(inst.dest), 0);
break;
case Vec4Init::AllONE:
case Vec4Init::AllMinusONE:
fp_.MOVI2FDUP(regs_.FQ(inst.dest), 1.0f, INVALID_REG, Vec4Init(inst.src1) == Vec4Init::AllMinusONE);
break;
case Vec4Init::Set_1000:
case Vec4Init::Set_0100:
case Vec4Init::Set_0010:
case Vec4Init::Set_0001:
fp_.MOVI(32, regs_.FQ(inst.dest), 0);
fp_.MOVI2FDUP(EncodeRegToQuad(SCRATCHF1), 1.0f);
fp_.INS(32, regs_.FQ(inst.dest), inst.src1 - (int)Vec4Init::Set_1000, EncodeRegToQuad(SCRATCHF1), inst.src1 - (int)Vec4Init::Set_1000);
break;
default:
_assert_msg_(false, "Unexpected Vec4Init value %d", inst.src1);
DISABLE;
}
break;
case IROp::Vec4Shuffle:
@@ -392,7 +429,138 @@ void Arm64JitBackend::CompIR_VecAssign(IRInst inst) {
break;
case IROp::Vec4Blend:
CompIR_Generic(inst);
regs_.Map(inst);
if (inst.src1 == inst.src2) {
// Shouldn't really happen, just making sure the below doesn't have to think about it.
if (inst.dest != inst.src1)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
break;
}
// To reduce the overlap cases we have to consider, swap src1/src2 (and invert the mask) if dest == src2.
// Thus, dest may equal src1, but no other overlap is possible.
if (inst.dest == inst.src2) {
std::swap(inst.src1, inst.src2);
inst.constant ^= 0xF;
}
switch (inst.constant & 0xF) {
case 0b0000:
if (inst.dest != inst.src1)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
break;
case 0b0001:
if (inst.dest != inst.src1)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);
break;
case 0b0010:
if (inst.dest != inst.src1)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);
break;
case 0b0011:
if (inst.dest != inst.src1)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
fp_.INS(64, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);
break;
case 0b0100:
if (inst.dest != inst.src1)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src2), 2);
break;
case 0b0101:
// To get AbCd: REV64 to BADC, then TRN2 xAxC, xbxd.
fp_.REV64(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src2));
fp_.TRN2(32, regs_.FQ(inst.dest), EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
break;
case 0b0110:
if (inst.dest != inst.src1)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);
fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src2), 2);
break;
case 0b0111:
if (inst.dest != inst.src1) {
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src1), 3);
} else {
fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
fp_.INS(32, regs_.FQ(inst.dest), 3, EncodeRegToQuad(SCRATCHF1), 3);
}
break;
case 0b1000:
if (inst.dest != inst.src1)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src2), 3);
break;
case 0b1001:
if (inst.dest != inst.src1)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);
fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src2), 3);
break;
case 0b1010:
// To get aBcD: REV64 to badc, then TRN2 xaxc, xBxD.
fp_.REV64(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1));
fp_.TRN2(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.src2));
break;
case 0b1011:
if (inst.dest != inst.src1) {
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src1), 2);
} else {
fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
fp_.INS(32, regs_.FQ(inst.dest), 2, EncodeRegToQuad(SCRATCHF1), 2);
}
break;
case 0b1100:
if (inst.dest != inst.src1)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
fp_.INS(64, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);
break;
case 0b1101:
if (inst.dest != inst.src1) {
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src1), 1);
} else {
fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
fp_.INS(32, regs_.FQ(inst.dest), 1, EncodeRegToQuad(SCRATCHF1), 1);
}
break;
case 0b1110:
if (inst.dest != inst.src1) {
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src1), 0);
} else {
fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
fp_.INS(32, regs_.FQ(inst.dest), 0, EncodeRegToQuad(SCRATCHF1), 0);
}
break;
case 0b1111:
if (inst.dest != inst.src2)
fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
break;
}
break;
case IROp::Vec4Mov:
@@ -428,7 +596,22 @@ void Arm64JitBackend::CompIR_VecHoriz(IRInst inst) {
switch (inst.op) {
case IROp::Vec4Dot:
CompIR_Generic(inst);
if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4)) {
// To avoid overlap problems, map a little carefully.
regs_.SpillLockFPR(inst.src1, inst.src2);
regs_.MapVec4(inst.src1);
regs_.MapVec4(inst.src2);
// It must overlap, so inst.dest is already mapped.
fp_.FMUL(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1), regs_.FQ(inst.src2));
fp_.FADDP(32, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));
fp_.FADDP(32, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));
fp_.INS(32, regs_.FQ(inst.dest & ~3), inst.dest & 3, EncodeRegToQuad(SCRATCHF1), 0);
} else {
regs_.Map(inst);
fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2));
fp_.FADDP(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.dest));
fp_.FADDP(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.dest));
}
break;
default:

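As a standalone reference for the Vec4Dot lowering above (lane-wise multiply, then two vector FADDPs to fold the four products into every lane), here is roughly what the emitted sequence computes, written with NEON intrinsics. This helper is only an illustration of the arithmetic, not code from this commit:

```cpp
#include <arm_neon.h>

float Vec4DotReference(float32x4_t a, float32x4_t b) {
	float32x4_t prod = vmulq_f32(a, b);         // {a0*b0, a1*b1, a2*b2, a3*b3}
	float32x4_t half = vpaddq_f32(prod, prod);  // {p0+p1, p2+p3, p0+p1, p2+p3}
	float32x4_t full = vpaddq_f32(half, half);  // every lane = p0+p1+p2+p3
	return vgetq_lane_f32(full, 0);             // the INS above then writes this lane into dest
}
```

The 0b0101 and 0b1010 blend cases lean on the same kind of lane shuffling: REV64 swaps the 32-bit lanes within each 64-bit half ([A B C D] becomes [B A D C]), and TRN2 then interleaves the odd lanes of its two operands, producing the alternating selection in two instructions rather than a MOV plus two lane inserts.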

@@ -174,14 +174,18 @@ void RiscVJitBackend::CompIR_VecAssign(IRInst inst) {
regs_.Map(inst);
for (int i = 0; i < 4; ++i) {
int which = (inst.constant >> i) & 1;
FMV(32, regs_.F(inst.dest + i), regs_.F((which ? inst.src2 : inst.src1) + i));
IRReg srcReg = which ? inst.src2 : inst.src1;
if (inst.dest != srcReg)
FMV(32, regs_.F(inst.dest + i), regs_.F(srcReg + i));
}
break;
case IROp::Vec4Mov:
regs_.Map(inst);
for (int i = 0; i < 4; ++i)
FMV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
if (inst.dest != inst.src1) {
regs_.Map(inst);
for (int i = 0; i < 4; ++i)
FMV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
}
break;
default:
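
For clarity on the blend mask both backends interpret here: bit i of inst.constant selects lane i from src2 when set, otherwise from src1, and the new register check simply skips FMVs whose source and destination already coincide. A scalar reference sketch, hypothetical and not part of either backend:

```cpp
#include <cstdint>

// Reference semantics of Vec4Blend: mask bit i picks src2[i] when set, else src1[i].
// For example, mask 0b0101 yields { src2[0], src1[1], src2[2], src1[3] }.
void Vec4BlendReference(float dest[4], const float src1[4], const float src2[4], uint32_t mask) {
	for (int i = 0; i < 4; ++i)
		dest[i] = ((mask >> i) & 1) ? src2[i] : src1[i];
}
```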