diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index f250d9bd3a..0c476ffa24 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -229,6 +229,8 @@ jobs:
 
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@v1.2
+        # Disable ccache on macos for now, it's become buggy for some reason.
+        if: matrix.id != 'macos'
         with:
           key: ${{ matrix.id }}
 
diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index f74838d1d4..2192a8a6e6 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -2128,6 +2128,13 @@ void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd
 	        (1 << 10) | (Rn << 5) | Rd);
 }
 
+void ARM64FloatEmitter::EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) {
+	Rd = DecodeReg(Rd);
+	Rn = DecodeReg(Rn);
+
+	Write32((1 << 30) | (U << 29) | (0b111100011 << 20) | (size << 22) | (opcode << 12) | (1 << 11) | (Rn << 5) | Rd);
+}
+
 void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
 {
 	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__);
@@ -2906,6 +2913,22 @@ void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn)
 	EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn);
 }
 
+// Scalar - pairwise
+void ARM64FloatEmitter::FADDP(ARM64Reg Rd, ARM64Reg Rn) {
+	EmitScalarPairwise(1, IsDouble(Rd), 0b01101, Rd, Rn);
+}
+void ARM64FloatEmitter::FMAXP(ARM64Reg Rd, ARM64Reg Rn) {
+	EmitScalarPairwise(1, IsDouble(Rd), 0b01111, Rd, Rn);
+}
+void ARM64FloatEmitter::FMINP(ARM64Reg Rd, ARM64Reg Rn) {
+	EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01111, Rd, Rn);
+}
+void ARM64FloatEmitter::FMAXNMP(ARM64Reg Rd, ARM64Reg Rn) {
+	EmitScalarPairwise(1, IsDouble(Rd), 0b01100, Rd, Rn);
+}
+void ARM64FloatEmitter::FMINNMP(ARM64Reg Rd, ARM64Reg Rn) {
+	EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01100, Rd, Rn);
+}
 
 // Scalar - 2 Source
 void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
@@ -3023,6 +3046,9 @@ void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
 	EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm);
 }
+void ARM64FloatEmitter::FADDP(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
+	EmitThreeSame(1, size >> 6, 0x1A, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
 	EmitThreeSame(0, size >> 6, 0x1E, Rd, Rn, Rm);
@@ -3290,6 +3316,95 @@ void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
 	EmitCopy(b64Bit, 0, imm5, 5, Rd, Rn);
 }
 
+void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh) {
+	Rd = DecodeReg(Rd);
+	u8 abc = abcdefgh >> 5;
+	u8 defgh = abcdefgh & 0x1F;
+	Write32((Q << 30) | (op << 29) | (0xF << 24) | (abc << 16) | (cmode << 12) | (o2 << 11) | (1 << 10) | (defgh << 5) | Rd);
+}
+
+void ARM64FloatEmitter::FMOV(u8 size, ARM64Reg Rd, u8 imm8) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
+	_assert_msg_(size == 32 || size == 64, "%s: unsupported size", __FUNCTION__);
+	_assert_msg_(IsQuad(Rd) || size == 32, "Use non-SIMD FMOV to load one double imm8");
+	EncodeModImm(IsQuad(Rd), size >> 6, 0b1111, 0, Rd, imm8);
+}
+
+void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift, bool MSL) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
+	_assert_msg_(size == 8 || size == 16 || size == 32 || size == 64, "%s: unsupported size %d", __FUNCTION__, size);
+	_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
+	_assert_msg_(!MSL || (size == 32 && shift > 0 && shift <= 16), "MOVI MSL shift requires size 32, shift must be 8 or 16");
+	_assert_msg_(size != 64 || shift == 0, "MOVI 64-bit imm cannot be shifted");
+
+	u8 cmode = 0;
+	if (size == 8)
+		cmode = 0b1110;
+	else if (size == 16)
+		cmode = 0b1000 | (shift >> 2);
+	else if (MSL)
+		cmode = 0b1100 | (shift >> 3);
+	else if (size == 32)
+		cmode = (shift >> 2);
+	else if (size == 64)
+		cmode = 0b1110;
+	else
+		_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
+
+	EncodeModImm(IsQuad(Rd), size >> 6, cmode, 0, Rd, imm8);
+}
+
+void ARM64FloatEmitter::MVNI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift, bool MSL) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
+	_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
+	_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
+	_assert_msg_(!MSL || (size == 32 && shift > 0 && shift <= 16), "MVNI MSL shift requires size 32, shift must be 8 or 16");
+
+	u8 cmode = 0;
+	if (size == 16)
+		cmode = 0b1000 | (shift >> 2);
+	else if (MSL)
+		cmode = 0b1100 | (shift >> 3);
+	else if (size == 32)
+		cmode = (shift >> 2);
+	else
+		_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
+
+	EncodeModImm(IsQuad(Rd), 1, cmode, 0, Rd, imm8);
+}
+
+void ARM64FloatEmitter::ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
+	_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
+	_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
+
+	u8 cmode = 0;
+	if (size == 16)
+		cmode = 0b1001 | (shift >> 2);
+	else if (size == 32)
+		cmode = 0b0001 | (shift >> 2);
+	else
+		_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
+
+	EncodeModImm(IsQuad(Rd), 0, cmode, 0, Rd, imm8);
+}
+
+void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
+	_assert_msg_(size == 16 || size == 32, "%s: unsupported size %d", __FUNCTION__, size);
+	_assert_msg_((shift & 7) == 0 && shift < size, "%s: unsupported shift %d", __FUNCTION__, shift);
+
+	u8 cmode = 0;
+	if (size == 16)
+		cmode = 0b1001 | (shift >> 2);
+	else if (size == 32)
+		cmode = 0b0001 | (shift >> 2);
+	else
+		_assert_msg_(false, "%s: unhandled case", __FUNCTION__);
+
+	EncodeModImm(IsQuad(Rd), 1, cmode, 0, Rd, imm8);
+}
+
 // One source
 void ARM64FloatEmitter::FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn)
 {
@@ -3918,17 +4033,32 @@ void ARM64FloatEmitter::MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch, bool
 }
 
 // TODO: Quite a few values could be generated easily using the MOVI instruction and friends.
-void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch) {
+void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bool negate) {
+	_assert_msg_(!IsSingle(Rd), "%s doesn't support singles", __FUNCTION__);
 	// TODO: Make it work with more element sizes
-	// TODO: Optimize - there are shorter solution for many values
 	ARM64Reg s = (ARM64Reg)(S0 + DecodeReg(Rd));
 	int ival;
 	memcpy(&ival, &value, 4);
+	uint8_t imm8;
 	if (ival == 0) {  // Make sure to not catch negative zero here
-		EOR(Rd, Rd, Rd);
+		// Prefer MOVI 0, which may have no latency on some CPUs.
+		MOVI(32, Rd, 0);
+		if (negate)
+			FNEG(32, Rd, Rd);
+	} else if (negate && FPImm8FromFloat(-value, &imm8)) {
+		FMOV(32, Rd, imm8);
+	} else if (FPImm8FromFloat(value, &imm8)) {
+		FMOV(32, Rd, imm8);
+		if (negate) {
+			FNEG(32, Rd, Rd);
+		}
 	} else {
-		MOVI2F(s, value, scratch);
-		DUP(32, Rd, Rd, 0);
+		_assert_msg_(scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value);
+		if (negate) {
+			ival ^= 0x80000000;
+		}
+		m_emit->MOVI2R(scratch, ival);
+		DUP(32, Rd, scratch);
 	}
 }
 
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index 27152a18bc..70a7253c28 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -820,6 +820,13 @@ public:
 	void FSQRT(ARM64Reg Rd, ARM64Reg Rn);
 	void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false);  // Also generalized move between GPR/FP
 
+	// Scalar - pairwise
+	void FADDP(ARM64Reg Rd, ARM64Reg Rn);
+	void FMAXP(ARM64Reg Rd, ARM64Reg Rn);
+	void FMINP(ARM64Reg Rd, ARM64Reg Rn);
+	void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn);
+	void FMINNMP(ARM64Reg Rd, ARM64Reg Rn);
+
 	// Scalar - 2 Source
 	void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -847,6 +854,7 @@ public:
 	void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
 	void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
 	void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void FADDP(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -893,6 +901,14 @@ public:
 	void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
 	void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
 
+	// Vector immediates
+	void FMOV(u8 size, ARM64Reg Rd, u8 imm8);
+	// MSL means bits shifted in are 1s. For size=64, each bit of imm8 is expanded to 8 actual bits.
+	void MOVI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0, bool MSL = false);
+	void MVNI(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0, bool MSL = false);
+	void ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
+	void BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
+
 	// One source
 	void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);
 
@@ -958,7 +974,7 @@ public:
 	void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
 
 	void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
-	void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
+	void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
 
 	// ABI related
 	void ABI_PushRegisters(uint32_t gpr_registers, uint32_t fp_registers);
@@ -973,6 +989,7 @@ private:
 	void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
+	void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
 	void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
 	void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn);
 	void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
@@ -994,6 +1011,7 @@ private:
 	void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, int opcode);
 	void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
 	void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+	void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);
 
 	void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
 	void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
diff --git a/Core/MIPS/ARM64/Arm64IRCompVec.cpp b/Core/MIPS/ARM64/Arm64IRCompVec.cpp
index d83ace73d4..c4bca9dc88 100644
--- a/Core/MIPS/ARM64/Arm64IRCompVec.cpp
+++ b/Core/MIPS/ARM64/Arm64IRCompVec.cpp
@@ -40,6 +40,10 @@ namespace MIPSComp {
 using namespace Arm64Gen;
 using namespace Arm64IRJitConstants;
 
+static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {
+	return r1 < r2 + l2 && r1 + l1 > r2;
+}
+
 void Arm64JitBackend::CompIR_VecArith(IRInst inst) {
 	CONDITIONAL_DISABLE;
 
@@ -65,7 +69,17 @@ void Arm64JitBackend::CompIR_VecArith(IRInst inst) {
 		break;
 
 	case IROp::Vec4Scale:
-		CompIR_Generic(inst);
+		if (Overlap(inst.dest, 4, inst.src2, 1) || Overlap(inst.src1, 4, inst.src2, 1)) {
+			// ARM64 can handle this, but we have to map specially.
+			regs_.SpillLockFPR(inst.dest, inst.src1);
+			regs_.MapVec4(inst.src1);
+			regs_.MapVec4(inst.src2 & ~3);
+			regs_.MapVec4(inst.dest, MIPSMap::NOINIT);
+			fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2 & ~3), inst.src2 & 3);
+		} else {
+			regs_.Map(inst);
+			fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2), 0);
+		}
 		break;
 
 	case IROp::Vec4Neg:
@@ -370,7 +384,30 @@ void Arm64JitBackend::CompIR_VecAssign(IRInst inst) {
 
 	switch (inst.op) {
 	case IROp::Vec4Init:
-		CompIR_Generic(inst);
+		regs_.Map(inst);
+		switch (Vec4Init(inst.src1)) {
+		case Vec4Init::AllZERO:
+			fp_.MOVI(32, regs_.FQ(inst.dest), 0);
+			break;
+
+		case Vec4Init::AllONE:
+		case Vec4Init::AllMinusONE:
+			fp_.MOVI2FDUP(regs_.FQ(inst.dest), 1.0f, INVALID_REG, Vec4Init(inst.src1) == Vec4Init::AllMinusONE);
+			break;
+
+		case Vec4Init::Set_1000:
+		case Vec4Init::Set_0100:
+		case Vec4Init::Set_0010:
+		case Vec4Init::Set_0001:
+			fp_.MOVI(32, regs_.FQ(inst.dest), 0);
+			fp_.MOVI2FDUP(EncodeRegToQuad(SCRATCHF1), 1.0f);
+			fp_.INS(32, regs_.FQ(inst.dest), inst.src1 - (int)Vec4Init::Set_1000, EncodeRegToQuad(SCRATCHF1), inst.src1 - (int)Vec4Init::Set_1000);
+			break;
+
+		default:
+			_assert_msg_(false, "Unexpected Vec4Init value %d", inst.src1);
+			DISABLE;
+		}
 		break;
 
 	case IROp::Vec4Shuffle:
@@ -392,7 +429,138 @@
 		break;
 
 	case IROp::Vec4Blend:
-		CompIR_Generic(inst);
+		regs_.Map(inst);
+		if (inst.src1 == inst.src2) {
+			// Shouldn't really happen, just making sure the below doesn't have to think about it.
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			break;
+		}
+
+		// To reduce overlap cases to consider, let's inverse src1/src2 if dest == src2.
+		// Thus, dest could be src1, but no other overlap is possible.
+		if (inst.dest == inst.src2) {
+			std::swap(inst.src1, inst.src2);
+			inst.constant ^= 0xF;
+		}
+
+		switch (inst.constant & 0xF) {
+		case 0b0000:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			break;
+
+		case 0b0001:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);
+			break;
+
+		case 0b0010:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);
+			break;
+
+		case 0b0011:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(64, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);
+			break;
+
+		case 0b0100:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src2), 2);
+			break;
+
+		case 0b0101:
+			// To get AbCd: REV64 to BADC, then TRN2 xAxC, xbxd.
+			fp_.REV64(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src2));
+			fp_.TRN2(32, regs_.FQ(inst.dest), EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
+			break;
+
+		case 0b0110:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);
+			fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src2), 2);
+			break;
+
+		case 0b0111:
+			if (inst.dest != inst.src1) {
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src1), 3);
+			} else {
+				fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 3, EncodeRegToQuad(SCRATCHF1), 3);
+			}
+			break;
+
+		case 0b1000:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src2), 3);
+			break;
+
+		case 0b1001:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src2), 0);
+			fp_.INS(32, regs_.FQ(inst.dest), 3, regs_.FQ(inst.src2), 3);
+			break;
+
+		case 0b1010:
+			// To get aBcD: REV64 to badc, then TRN2 xaxc, xBxD.
+			fp_.REV64(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.TRN2(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+			break;
+
+		case 0b1011:
+			if (inst.dest != inst.src1) {
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 2, regs_.FQ(inst.src1), 2);
+			} else {
+				fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 2, EncodeRegToQuad(SCRATCHF1), 2);
+			}
+			break;
+
+		case 0b1100:
+			if (inst.dest != inst.src1)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1));
+			fp_.INS(64, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src2), 1);
+			break;
+
+		case 0b1101:
+			if (inst.dest != inst.src1) {
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 1, regs_.FQ(inst.src1), 1);
+			} else {
+				fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 1, EncodeRegToQuad(SCRATCHF1), 1);
+			}
+			break;
+
+		case 0b1110:
+			if (inst.dest != inst.src1) {
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 0, regs_.FQ(inst.src1), 0);
+			} else {
+				fp_.MOV(EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1));
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+				fp_.INS(32, regs_.FQ(inst.dest), 0, EncodeRegToQuad(SCRATCHF1), 0);
+			}
+			break;
+
+		case 0b1111:
+			if (inst.dest != inst.src2)
+				fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src2));
+			break;
+		}
 		break;
 
 	case IROp::Vec4Mov:
@@ -428,7 +596,22 @@ void Arm64JitBackend::CompIR_VecHoriz(IRInst inst) {
 
 	switch (inst.op) {
 	case IROp::Vec4Dot:
-		CompIR_Generic(inst);
+		if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4)) {
+			// To avoid overlap problems, map a little carefully.
+			regs_.SpillLockFPR(inst.src1, inst.src2);
+			regs_.MapVec4(inst.src1);
+			regs_.MapVec4(inst.src2);
+			// It must overlap, so inst.dest is already mapped.
+			fp_.FMUL(32, EncodeRegToQuad(SCRATCHF1), regs_.FQ(inst.src1), regs_.FQ(inst.src2));
+			fp_.FADDP(32, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));
+			fp_.FADDP(32, EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1), EncodeRegToQuad(SCRATCHF1));
+			fp_.INS(32, regs_.FQ(inst.dest & ~3), inst.dest & 3, EncodeRegToQuad(SCRATCHF1), 0);
+		} else {
+			regs_.Map(inst);
+			fp_.FMUL(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), regs_.FQ(inst.src2));
+			fp_.FADDP(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.dest));
+			fp_.FADDP(32, regs_.FQ(inst.dest), regs_.FQ(inst.dest), regs_.FQ(inst.dest));
+		}
 		break;
 
 	default:
diff --git a/Core/MIPS/RiscV/RiscVCompVec.cpp b/Core/MIPS/RiscV/RiscVCompVec.cpp
index 3d91312a8c..b220d0ce8e 100644
--- a/Core/MIPS/RiscV/RiscVCompVec.cpp
+++ b/Core/MIPS/RiscV/RiscVCompVec.cpp
@@ -174,14 +174,18 @@ void RiscVJitBackend::CompIR_VecAssign(IRInst inst) {
 		regs_.Map(inst);
 		for (int i = 0; i < 4; ++i) {
 			int which = (inst.constant >> i) & 1;
-			FMV(32, regs_.F(inst.dest + i), regs_.F((which ? inst.src2 : inst.src1) + i));
+			IRReg srcReg = which ? inst.src2 : inst.src1;
+			if (inst.dest != srcReg)
+				FMV(32, regs_.F(inst.dest + i), regs_.F(srcReg + i));
 		}
 		break;
 
 	case IROp::Vec4Mov:
-		regs_.Map(inst);
-		for (int i = 0; i < 4; ++i)
-			FMV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
+		if (inst.dest != inst.src1) {
+			regs_.Map(inst);
+			for (int i = 0; i < 4; ++i)
+				FMV(32, regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
+		}
 		break;
 
 	default: