diff --git a/External/FEXCore/Source/Interface/Core/ArchHelpers/CodeEmitter/SVEOps.inl b/External/FEXCore/Source/Interface/Core/ArchHelpers/CodeEmitter/SVEOps.inl index 79643446c..ae8971f9d 100644 --- a/External/FEXCore/Source/Interface/Core/ArchHelpers/CodeEmitter/SVEOps.inl +++ b/External/FEXCore/Source/Interface/Core/ArchHelpers/CodeEmitter/SVEOps.inl @@ -2940,7 +2940,55 @@ public: } // SVE load multiple structures (scalar plus scalar) - // XXX: + void ld2b(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i8Bit, 0b01, zt1, pg, rn, rm); + } + void ld3b(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i8Bit, 0b10, zt1, pg, rn, rm); + } + void ld4b(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i8Bit, 0b11, zt1, pg, rn, rm); + } + void ld2h(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i16Bit, 0b01, zt1, pg, rn, rm); + } + void ld3h(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i16Bit, 0b10, zt1, pg, rn, rm); + } + void ld4h(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i16Bit, 0b11, zt1, pg, rn, rm); + } + void ld2w(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i32Bit, 0b01, zt1, pg, rn, rm); + } + void ld3w(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i32Bit, 0b10, zt1, pg, rn, rm); + } + void ld4w(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i32Bit, 0b11, zt1, pg, rn, rm); + } + void ld2d(ZRegister zt1, ZRegister zt2, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i64Bit, 0b01, zt1, pg, rn, rm); + } + void ld3d(ZRegister zt1, ZRegister zt2, ZRegister zt3, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i64Bit, 0b10, zt1, pg, rn, rm); + } + void ld4d(ZRegister zt1, ZRegister zt2, ZRegister zt3, ZRegister zt4, PRegisterZero pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(AreVectorsSequential(zt1, zt2, zt3, zt4), "Registers need to be contiguous"); + SVEContiguousLoadStoreMultipleScalar(false, SubRegSize::i64Bit, 0b11, zt1, pg, rn, rm); + } + // SVE load and broadcast quadword (scalar plus immediate) // XXX: @@ -4349,6 +4397,26 @@ private: dc32(Instr); } + void SVEContiguousLoadStoreMultipleScalar(bool is_store, SubRegSize msz, uint32_t opc, ZRegister zt, + PRegister pg, Register rn, Register rm) { + LOGMAN_THROW_A_FMT(pg <= PReg::p7, "Can only use p0-p7 as a governing predicate"); + LOGMAN_THROW_A_FMT(rm != Reg::rsp, "rm cannot be the stack pointer"); + + uint32_t Instr = 0b1010'0100'0000'0000'1100'0000'0000'0000; + if (is_store) { + Instr |= 0x40006000U; + } else { + Instr |= 0x0000C000U; + } + Instr |= FEXCore::ToUnderlying(msz) << 23; + Instr |= opc << 21; + Instr |= rm.Idx() << 16; + Instr |= pg.Idx() << 10; + Instr |= rn.Idx() << 5; + Instr |= zt.Idx(); + dc32(Instr); + } + void SVEIndexGeneration(uint32_t op, SubRegSize size, ZRegister zd, int32_t imm5, int32_t imm5b) { LOGMAN_THROW_A_FMT(size != SubRegSize::i128Bit, "INDEX cannot use 128-bit element sizes"); diff --git a/External/FEXCore/unittests/Emitter/SVE_Tests.cpp b/External/FEXCore/unittests/Emitter/SVE_Tests.cpp index d6d92b05b..f4b245bab 100644 --- a/External/FEXCore/unittests/Emitter/SVE_Tests.cpp +++ b/External/FEXCore/unittests/Emitter/SVE_Tests.cpp @@ -4073,6 +4073,35 @@ TEST_CASE_METHOD(TestDisassembler, "Emitter: SVE: SVE Memory - 32-bit Gather and TEST_SINGLE(ldr(ZReg::z30, XReg::x29, -256), "ldr z30, [x29, #-256, mul vl]"); TEST_SINGLE(ldr(ZReg::z30, XReg::x29, 255), "ldr z30, [x29, #255, mul vl]"); } +TEST_CASE_METHOD(TestDisassembler, "Emitter: SVE: SVE load multiple structures (scalar plus scalar)") { + TEST_SINGLE(ld2b(ZReg::z31, ZReg::z0, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld2b {z31.b, z0.b}, p6/z, [x29, x30]"); + TEST_SINGLE(ld2b(ZReg::z26, ZReg::z27, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld2b {z26.b, z27.b}, p6/z, [x29, x30]"); + TEST_SINGLE(ld3b(ZReg::z31, ZReg::z0, ZReg::z1, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld3b {z31.b, z0.b, z1.b}, p6/z, [x29, x30]"); + TEST_SINGLE(ld3b(ZReg::z26, ZReg::z27, ZReg::z28, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld3b {z26.b, z27.b, z28.b}, p6/z, [x29, x30]"); + TEST_SINGLE(ld4b(ZReg::z31, ZReg::z0, ZReg::z1, ZReg::z2, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld4b {z31.b, z0.b, z1.b, z2.b}, p6/z, [x29, x30]"); + TEST_SINGLE(ld4b(ZReg::z26, ZReg::z27, ZReg::z28, ZReg::z29, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld4b {z26.b, z27.b, z28.b, z29.b}, p6/z, [x29, x30]"); + + TEST_SINGLE(ld2h(ZReg::z31, ZReg::z0, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld2h {z31.h, z0.h}, p6/z, [x29, x30, lsl #1]"); + TEST_SINGLE(ld2h(ZReg::z26, ZReg::z27, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld2h {z26.h, z27.h}, p6/z, [x29, x30, lsl #1]"); + TEST_SINGLE(ld3h(ZReg::z31, ZReg::z0, ZReg::z1, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld3h {z31.h, z0.h, z1.h}, p6/z, [x29, x30, lsl #1]"); + TEST_SINGLE(ld3h(ZReg::z26, ZReg::z27, ZReg::z28, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld3h {z26.h, z27.h, z28.h}, p6/z, [x29, x30, lsl #1]"); + TEST_SINGLE(ld4h(ZReg::z31, ZReg::z0, ZReg::z1, ZReg::z2, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld4h {z31.h, z0.h, z1.h, z2.h}, p6/z, [x29, x30, lsl #1]"); + TEST_SINGLE(ld4h(ZReg::z26, ZReg::z27, ZReg::z28, ZReg::z29, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld4h {z26.h, z27.h, z28.h, z29.h}, p6/z, [x29, x30, lsl #1]"); + + TEST_SINGLE(ld2w(ZReg::z31, ZReg::z0, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld2w {z31.s, z0.s}, p6/z, [x29, x30, lsl #2]"); + TEST_SINGLE(ld2w(ZReg::z26, ZReg::z27, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld2w {z26.s, z27.s}, p6/z, [x29, x30, lsl #2]"); + TEST_SINGLE(ld3w(ZReg::z31, ZReg::z0, ZReg::z1, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld3w {z31.s, z0.s, z1.s}, p6/z, [x29, x30, lsl #2]"); + TEST_SINGLE(ld3w(ZReg::z26, ZReg::z27, ZReg::z28, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld3w {z26.s, z27.s, z28.s}, p6/z, [x29, x30, lsl #2]"); + TEST_SINGLE(ld4w(ZReg::z31, ZReg::z0, ZReg::z1, ZReg::z2, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld4w {z31.s, z0.s, z1.s, z2.s}, p6/z, [x29, x30, lsl #2]"); + TEST_SINGLE(ld4w(ZReg::z26, ZReg::z27, ZReg::z28, ZReg::z29, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld4w {z26.s, z27.s, z28.s, z29.s}, p6/z, [x29, x30, lsl #2]"); + + TEST_SINGLE(ld2d(ZReg::z31, ZReg::z0, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld2d {z31.d, z0.d}, p6/z, [x29, x30, lsl #3]"); + TEST_SINGLE(ld2d(ZReg::z26, ZReg::z27, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld2d {z26.d, z27.d}, p6/z, [x29, x30, lsl #3]"); + TEST_SINGLE(ld3d(ZReg::z31, ZReg::z0, ZReg::z1, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld3d {z31.d, z0.d, z1.d}, p6/z, [x29, x30, lsl #3]"); + TEST_SINGLE(ld3d(ZReg::z26, ZReg::z27, ZReg::z28, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld3d {z26.d, z27.d, z28.d}, p6/z, [x29, x30, lsl #3]"); + TEST_SINGLE(ld4d(ZReg::z31, ZReg::z0, ZReg::z1, ZReg::z2, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld4d {z31.d, z0.d, z1.d, z2.d}, p6/z, [x29, x30, lsl #3]"); + TEST_SINGLE(ld4d(ZReg::z26, ZReg::z27, ZReg::z28, ZReg::z29, PReg::p6.Zeroing(), Reg::r29, Reg::r30), "ld4d {z26.d, z27.d, z28.d, z29.d}, p6/z, [x29, x30, lsl #3]"); +} TEST_CASE_METHOD(TestDisassembler, "Emitter: SVE: SVE load multiple structures (scalar plus immediate)") { TEST_SINGLE(ld2b(ZReg::z31, ZReg::z0, PReg::p6.Zeroing(), Reg::r29, 0), "ld2b {z31.b, z0.b}, p6/z, [x29]"); TEST_SINGLE(ld2b(ZReg::z26, ZReg::z27, PReg::p6.Zeroing(), Reg::r29, 0), "ld2b {z26.b, z27.b}, p6/z, [x29]");