Merge pull request #3882 from Sonicadvance1/scalar_afp_fma

AVX128: Implement support for scalar FMA with AFP
This commit is contained in:
Ryan Houdek 2024-07-22 13:19:59 -07:00 committed by GitHub
commit 9201ac5a6b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 627 additions and 231 deletions

View File

@ -345,6 +345,10 @@ private:
uint32_t SpillSlots {};
using OpType = void (Arm64JITCore::*)(const IR::IROp_Header* IROp, IR::NodeID Node);
// Callback used by VFScalarFMAOperation to emit the actual scalar FMA instruction.
// It is invoked as ScalarEmit(Dst, Src1, Src2, Src3); VFScalarFMAOperation passes the two
// multiplicand registers as Src1/Src2 and the addend register as Src3.
using ScalarFMAOpCaller =
std::function<void(ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2, ARMEmitter::VRegister Src3)>;
// Emits a scalar fused multiply-add through ScalarEmit and places the result in element 0 of Dst.
//   OpSize      - operation size in bytes; only the 128-bit XMM size is accepted.
//   ElementSize - element size in bytes (2, 4 or 8).
//   ScalarEmit  - emitter callback for the concrete FMA flavour.
//   Dst         - destination vector register.
//   Vector1/2   - multiplicand registers.
//   Addend      - addend register.
void VFScalarFMAOperation(uint8_t OpSize, uint8_t ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst,
ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2, ARMEmitter::VRegister Addend);
using ScalarBinaryOpCaller = std::function<void(ARMEmitter::VRegister Dst, ARMEmitter::VRegister Src1, ARMEmitter::VRegister Src2)>;
void VFScalarOperation(uint8_t OpSize, uint8_t ElementSize, bool ZeroUpperBits, ScalarBinaryOpCaller ScalarEmit,
ARMEmitter::VRegister Dst, ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2);

View File

@ -188,6 +188,30 @@ namespace FEXCore::CPU {
VFScalarOperation(IROp->Size, ElementSize, Op->ZeroUpperBits, ScalarEmit, Dst, Vector1, Vector2); \
}
// Generates the JIT handler for a scalar-insert FMA IR op.
//   FEXOp - IR op name; the handler reads its Vector1, Vector2 and Addend arguments.
//   ARMOp - emitter member used for the scalar FMA, called as ARMOp(Dst, Src1, Src2, Src3).
// The handler builds a small emitter lambda that picks the H/S/D scalar register view from the
// op's element size (2/4/8 bytes; any other size emits nothing) and forwards everything to
// VFScalarFMAOperation, which decides where the result is computed and how it reaches Dst.
// NOTE: only the comment above the #define may use `//`; a `//` comment inside the body would
// swallow the line continuation.
#define DEF_FMAOP_SCALAR_INSERT(FEXOp, ARMOp) \
  DEF_OP(FEXOp) { \
    const auto Op = IROp->C<IR::IROp_##FEXOp>(); \
    const auto ElementSize = Op->Header.ElementSize; \
    \
    auto EmitScalarFMA = [this, ElementSize](ARMEmitter::VRegister Rd, ARMEmitter::VRegister Rn, ARMEmitter::VRegister Rm, \
                                             ARMEmitter::VRegister Ra) { \
      switch (ElementSize) { \
      case 2: \
        ARMOp(Rd.H(), Rn.H(), Rm.H(), Ra.H()); \
        break; \
      case 4: \
        ARMOp(Rd.S(), Rn.S(), Rm.S(), Ra.S()); \
        break; \
      case 8: \
        ARMOp(Rd.D(), Rn.D(), Rm.D(), Ra.D()); \
        break; \
      default: \
        break; \
      } \
    }; \
    \
    const auto DstReg = GetVReg(Node); \
    const auto MulReg1 = GetVReg(Op->Vector1.ID()); \
    const auto MulReg2 = GetVReg(Op->Vector2.ID()); \
    const auto AddendReg = GetVReg(Op->Addend.ID()); \
    \
    VFScalarFMAOperation(IROp->Size, ElementSize, EmitScalarFMA, DstReg, MulReg1, MulReg2, AddendReg); \
  }
DEF_UNOP(VAbs, abs, true)
DEF_UNOP(VPopcount, cnt, true)
DEF_UNOP(VNeg, neg, false)
@ -224,6 +248,35 @@ DEF_FBINOP_SCALAR_INSERT(VFSubScalarInsert, fsub)
DEF_FBINOP_SCALAR_INSERT(VFMulScalarInsert, fmul)
DEF_FBINOP_SCALAR_INSERT(VFDivScalarInsert, fdiv)
// Scalar-insert FMA handlers. The IR ops follow the x86 naming, so the ARM mnemonic paired with
// each one is deliberately NOT the look-alike name:
//   VFMLAScalarInsert  : Dest = ( Vector1 * Vector2) + Addend  -> fmadd
//   VFMLSScalarInsert  : Dest = ( Vector1 * Vector2) - Addend  -> fnmsub
//   VFNMLAScalarInsert : Dest = (-Vector1 * Vector2) + Addend  -> fmsub
//   VFNMLSScalarInsert : Dest = (-Vector1 * Vector2) - Addend  -> fnmadd
// (ARM's fmsub negates the product and fnmsub negates the addend; see the Arm ISA reference.)
DEF_FMAOP_SCALAR_INSERT(VFMLAScalarInsert, fmadd)
DEF_FMAOP_SCALAR_INSERT(VFMLSScalarInsert, fnmsub)
DEF_FMAOP_SCALAR_INSERT(VFNMLAScalarInsert, fmsub)
DEF_FMAOP_SCALAR_INSERT(VFNMLSScalarInsert, fnmadd)
// Emits a scalar fused multiply-add and leaves the result in element 0 of Dst.
//
//   OpSize      - operation size in bytes; must be the 128-bit XMM register size (asserted).
//   ElementSize - element size in bytes; must be 2, 4 or 8 (asserted).
//   ScalarEmit  - callback that emits the concrete scalar FMA as ScalarEmit(Dst, Src1, Src2, Src3),
//                 called here with (destination, Vector1, Vector2, Addend).
//   Dst         - destination vector register.
//   Vector1/2   - multiplicand registers.
//   Addend      - addend register.
//
// Two code shapes are produced:
//   * Host has AFP and the destination is (or has been made) the addend register: the scalar FMA is
//     emitted directly into Dst, relying on AFP to leave the upper bits of Dst untouched.
//   * Otherwise: the scalar FMA is emitted into VTMP1 and element 0 of VTMP1 is inserted into Dst.
void Arm64JITCore::VFScalarFMAOperation(uint8_t OpSize, uint8_t ElementSize, ScalarFMAOpCaller ScalarEmit, ARMEmitter::VRegister Dst,
                                        ARMEmitter::VRegister Vector1, ARMEmitter::VRegister Vector2, ARMEmitter::VRegister Addend) {
  LOGMAN_THROW_A_FMT(OpSize == Core::CPUState::XMM_SSE_REG_SIZE, "256-bit unsupported", __func__);

  LOGMAN_THROW_AA_FMT(ElementSize == 2 || ElementSize == 4 || ElementSize == 8, "Invalid size");
  const auto SubRegSize = ARMEmitter::ToVectorSizePair(ElementSize == 2 ? ARMEmitter::SubRegSize::i16Bit :
                                                       ElementSize == 4 ? ARMEmitter::SubRegSize::i32Bit :
                                                                          ARMEmitter::SubRegSize::i64Bit);

  if (Dst != Vector1 && Dst != Vector2 && Dst != Addend && HostSupportsAFP) {
    // If the destination doesn't overlap any incoming register then move the addend to the destination first.
    mov(Dst.Q(), Addend.Q());
    // From here on the destination *is* the addend, so the FMA below runs in place in Dst.
    // (Rebinding Dst to the old addend register instead would write the result into a source
    // register and leave the real destination holding only a copy of the addend.)
    Addend = Dst;
  }

  if (HostSupportsAFP && Dst == Addend) {
    ///< Exactly matches ARM scalar FMA semantics
    // If the host CPU supports AFP then scalar does an insert without modifying upper bits.
    ScalarEmit(Dst, Vector1, Vector2, Addend);
  } else {
    // Destination is not the addend, or the host doesn't support AFP: emit into a temporary, then insert.
    ScalarEmit(VTMP1, Vector1, Vector2, Addend);
    ins(SubRegSize.Vector, Dst.Q(), 0, VTMP1.Q(), 0);
  }
}
// VFScalarOperation performs the operation described through ScalarEmit between Vector1 and Vector2,
// storing it into Dst. This is a scalar operation, so only the lowest element of each vector is used for the operation.

View File

@ -1160,7 +1160,8 @@ public:
void AVX128_VPCLMULQDQ(OpcodeArgs);
void AVX128_VFMAImpl(OpcodeArgs, IROps IROp, bool Scalar, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx);
void AVX128_VFMAImpl(OpcodeArgs, IROps IROp, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx);
void AVX128_VFMAScalarImpl(OpcodeArgs, IROps IROp, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx);
void AVX128_VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx);
RefPair AVX128_VPGatherQPSImpl(Ref Dest, Ref Mask, RefVSIB VSIB);

View File

@ -337,32 +337,32 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
{OPD(2, 0b01, 0x96), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAddSubImpl, true, 1, 3, 2>}, // VFMADDSUB
{OPD(2, 0b01, 0x97), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAddSubImpl, false, 1, 3, 2>}, // VFMSUBADD
{OPD(2, 0b01, 0x98), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, false, 1, 3, 2>}, // VFMADD
{OPD(2, 0b01, 0x99), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, true, 1, 3, 2>}, // VFMADD
{OPD(2, 0b01, 0x9A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, false, 1, 3, 2>}, // VFMSUB
{OPD(2, 0b01, 0x9B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, true, 1, 3, 2>}, // VFMSUB
{OPD(2, 0b01, 0x9C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, false, 1, 3, 2>}, // VFNMADD
{OPD(2, 0b01, 0x9D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, true, 1, 3, 2>}, // VFNMADD
{OPD(2, 0b01, 0x9E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, false, 1, 3, 2>}, // VFNMSUB
{OPD(2, 0b01, 0x9F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, true, 1, 3, 2>}, // VFNMSUB
{OPD(2, 0b01, 0x98), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, 1, 3, 2>}, // VFMADD
{OPD(2, 0b01, 0x99), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLASCALARINSERT, 1, 3, 2>}, // VFMADD
{OPD(2, 0b01, 0x9A), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, 1, 3, 2>}, // VFMSUB
{OPD(2, 0b01, 0x9B), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLSSCALARINSERT, 1, 3, 2>}, // VFMSUB
{OPD(2, 0b01, 0x9C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, 1, 3, 2>}, // VFNMADD
{OPD(2, 0b01, 0x9D), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLASCALARINSERT, 1, 3, 2>}, // VFNMADD
{OPD(2, 0b01, 0x9E), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, 1, 3, 2>}, // VFNMSUB
{OPD(2, 0b01, 0x9F), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLSSCALARINSERT, 1, 3, 2>}, // VFNMSUB
{OPD(2, 0b01, 0xA8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, false, 2, 1, 3>}, // VFMADD
{OPD(2, 0b01, 0xA9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, true, 2, 1, 3>}, // VFMADD
{OPD(2, 0b01, 0xAA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, false, 2, 1, 3>}, // VFMSUB
{OPD(2, 0b01, 0xAB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, true, 2, 1, 3>}, // VFMSUB
{OPD(2, 0b01, 0xAC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, false, 2, 1, 3>}, // VFNMADD
{OPD(2, 0b01, 0xAD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, true, 2, 1, 3>}, // VFNMADD
{OPD(2, 0b01, 0xAE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, false, 2, 1, 3>}, // VFNMSUB
{OPD(2, 0b01, 0xAF), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, true, 2, 1, 3>}, // VFNMSUB
{OPD(2, 0b01, 0xA8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, 2, 1, 3>}, // VFMADD
{OPD(2, 0b01, 0xA9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLASCALARINSERT, 2, 1, 3>}, // VFMADD
{OPD(2, 0b01, 0xAA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, 2, 1, 3>}, // VFMSUB
{OPD(2, 0b01, 0xAB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLSSCALARINSERT, 2, 1, 3>}, // VFMSUB
{OPD(2, 0b01, 0xAC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, 2, 1, 3>}, // VFNMADD
{OPD(2, 0b01, 0xAD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLASCALARINSERT, 2, 1, 3>}, // VFNMADD
{OPD(2, 0b01, 0xAE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, 2, 1, 3>}, // VFNMSUB
{OPD(2, 0b01, 0xAF), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLSSCALARINSERT, 2, 1, 3>}, // VFNMSUB
{OPD(2, 0b01, 0xB8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, false, 2, 3, 1>}, // VFMADD
{OPD(2, 0b01, 0xB9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, true, 2, 3, 1>}, // VFMADD
{OPD(2, 0b01, 0xBA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, false, 2, 3, 1>}, // VFMSUB
{OPD(2, 0b01, 0xBB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, true, 2, 3, 1>}, // VFMSUB
{OPD(2, 0b01, 0xBC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, false, 2, 3, 1>}, // VFNMADD
{OPD(2, 0b01, 0xBD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, true, 2, 3, 1>}, // VFNMADD
{OPD(2, 0b01, 0xBE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, false, 2, 3, 1>}, // VFNMSUB
{OPD(2, 0b01, 0xBF), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, true, 2, 3, 1>}, // VFNMSUB
{OPD(2, 0b01, 0xB8), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLA, 2, 3, 1>}, // VFMADD
{OPD(2, 0b01, 0xB9), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLASCALARINSERT, 2, 3, 1>}, // VFMADD
{OPD(2, 0b01, 0xBA), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFMLS, 2, 3, 1>}, // VFMSUB
{OPD(2, 0b01, 0xBB), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFMLSSCALARINSERT, 2, 3, 1>}, // VFMSUB
{OPD(2, 0b01, 0xBC), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLA, 2, 3, 1>}, // VFNMADD
{OPD(2, 0b01, 0xBD), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLASCALARINSERT, 2, 3, 1>}, // VFNMADD
{OPD(2, 0b01, 0xBE), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAImpl, IR::OP_VFNMLS, 2, 3, 1>}, // VFNMSUB
{OPD(2, 0b01, 0xBF), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAScalarImpl, IR::OP_VFNMLSSCALARINSERT, 2, 3, 1>}, // VFNMSUB
{OPD(2, 0b01, 0xA6), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAddSubImpl, true, 2, 1, 3>}, // VFMADDSUB
{OPD(2, 0b01, 0xA7), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VFMAddSubImpl, false, 2, 1, 3>}, // VFMSUBADD
@ -2460,42 +2460,50 @@ void OpDispatchBuilder::AVX128_VPCLMULQDQ(OpcodeArgs) {
// As shown only the 231 suffixed instructions matches AArch64 behaviour.
// FEX will insert moves to transpose the vectors to match AArch64 behaviour for 132 and 213 variants.
void OpDispatchBuilder::AVX128_VFMAImpl(OpcodeArgs, IROps IROp, bool Scalar, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
void OpDispatchBuilder::AVX128_VFMAImpl(OpcodeArgs, IROps IROp, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;
const OpSize ElementSize = Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W ? OpSize::i64Bit : OpSize::i32Bit;
const auto RegisterSize = Scalar ? ElementSize : OpSize::i128Bit;
auto Dest = AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit);
auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit);
auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);
RefPair Sources[3] = {
Dest,
Src1,
Src2,
};
RefPair Sources[3] = {Dest, Src1, Src2};
RefPair Result {};
DeriveOp(Result_Low, IROp, _VFMLA(RegisterSize, ElementSize, Sources[Src1Idx - 1].Low, Sources[Src2Idx - 1].Low, Sources[AddendIdx - 1].Low));
DeriveOp(Result_Low, IROp, _VFMLA(OpSize::i128Bit, ElementSize, Sources[Src1Idx - 1].Low, Sources[Src2Idx - 1].Low, Sources[AddendIdx - 1].Low));
Result.Low = Result_Low;
if (Is128Bit) {
Result.High = LoadZeroVector(OpSize::i128Bit);
if (Scalar) {
// Special case, scalar inserts in to the low bits of the destination.
///< TODO: This can be optimized with AFP.NEP.
Result.Low = _VInsElement(OpSize::i128Bit, ElementSize, 0, 0, Dest.Low, Result.Low);
}
} else {
DeriveOp(Result_High, IROp,
_VFMLA(RegisterSize, ElementSize, Sources[Src1Idx - 1].High, Sources[Src2Idx - 1].High, Sources[AddendIdx - 1].High));
_VFMLA(OpSize::i128Bit, ElementSize, Sources[Src1Idx - 1].High, Sources[Src2Idx - 1].High, Sources[AddendIdx - 1].High));
Result.High = Result_High;
}
AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
}
// Dispatch handler for the scalar VEX FMA forms.
//   IROp      - scalar-insert FMA IR opcode that the emitted node is rewritten to (via DeriveOp).
//   Src1Idx   - 1-based selector for the first multiplicand.
//   Src2Idx   - 1-based selector for the second multiplicand.
//   AddendIdx - 1-based selector for the addend.
// The selectors index into {Dest, Src[0], Src[1]}, which is how the 132/213/231 operand orders are
// expressed. Only the low halves are used; the result is passed through AVX128_Zext before being
// stored back to the destination (presumably zeroing the upper 128 bits -- confirm against AVX128_Zext).
void OpDispatchBuilder::AVX128_VFMAScalarImpl(OpcodeArgs, IROps IROp, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
  const auto Is128Bit = GetDstSize(Op) == Core::CPUState::XMM_SSE_REG_SIZE;
  LOGMAN_THROW_A_FMT(Is128Bit, "This can't be 256-bit");

  // VEX.W selects the element width: 64-bit when set, 32-bit otherwise.
  OpSize ElementSize = OpSize::i32Bit;
  if (Op->Flags & X86Tables::DecodeFlags::FLAG_OPTION_AVX_W) {
    ElementSize = OpSize::i64Bit;
  }

  // Braced initialisation evaluates left to right, so the loads are issued in Dest, Src[0], Src[1] order.
  RefPair Operands[3] = {
    AVX128_LoadSource_WithOpSize(Op, Op->Dest, Op->Flags, !Is128Bit),
    AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit),
    AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit),
  };

  auto MultiplicandA = Operands[Src1Idx - 1].Low;
  auto MultiplicandB = Operands[Src2Idx - 1].Low;
  auto AddendLow = Operands[AddendIdx - 1].Low;

  DeriveOp(ScalarResult, IROp, _VFMLAScalarInsert(OpSize::i128Bit, ElementSize, MultiplicandA, MultiplicandB, AddendLow));
  AVX128_StoreResult_WithOpSize(Op, Op->Dest, AVX128_Zext(ScalarResult));
}
void OpDispatchBuilder::AVX128_VFMAddSubImpl(OpcodeArgs, bool AddSub, uint8_t Src1Idx, uint8_t Src2Idx, uint8_t AddendIdx) {
const auto Size = GetDstSize(Op);
const auto Is128Bit = Size == Core::CPUState::XMM_SSE_REG_SIZE;

View File

@ -1716,6 +1716,42 @@
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / ElementSize"
},
"FPR = VFMLAScalarInsert u8:#RegisterSize, u8:#ElementSize, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": {
"Desc": [
"Dest = (Vector1 * Vector2) + Addend",
"This explicitly matches x86 FMA semantics because ARM semantics are mind-bending."
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / ElementSize",
"TiedSource": 2
},
"FPR = VFMLSScalarInsert u8:#RegisterSize, u8:#ElementSize, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": {
"Desc": [
"Dest = (Vector1 * Vector2) - Addend",
"This explicitly matches x86 FMA semantics because ARM semantics are mind-bending."
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / ElementSize",
"TiedSource": 2
},
"FPR = VFNMLAScalarInsert u8:#RegisterSize, u8:#ElementSize, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": {
"Desc": [
"Dest = (-Vector1 * Vector2) + Addend",
"This explicitly matches x86 FMA semantics because ARM semantics are mind-bending."
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / ElementSize",
"TiedSource": 2
},
"FPR = VFNMLSScalarInsert u8:#RegisterSize, u8:#ElementSize, FPR:$Vector1, FPR:$Vector2, FPR:$Addend": {
"Desc": [
"Dest = (-Vector1 * Vector2) - Addend",
"This explicitly matches x86 FMA semantics because ARM semantics are mind-bending."
],
"DestSize": "RegisterSize",
"NumElements": "RegisterSize / ElementSize",
"TiedSource": 2
}
},
"Vector": {

View File

@ -4742,10 +4742,10 @@
"Map 2 0b01 0x99 128-bit"
],
"ExpectedArm64ASM": [
"fmadd s2, s16, s18, s17",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmadd s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd132sd xmm0, xmm1, xmm2": {
@ -4754,10 +4754,10 @@
"Map 2 0b01 0x99 128-bit"
],
"ExpectedArm64ASM": [
"fmadd d2, d16, d18, d17",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmadd d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub132ps xmm0, xmm1, xmm2": {
@ -4826,10 +4826,10 @@
"Map 2 0b01 0x9b 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub s2, s16, s18, s17",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmsub s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub132sd xmm0, xmm1, xmm2": {
@ -4838,10 +4838,10 @@
"Map 2 0b01 0x9b 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub d2, d16, d18, d17",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmsub d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd132ps xmm0, xmm1, xmm2": {
@ -4908,10 +4908,10 @@
"Map 2 0b01 0x9d 128-bit"
],
"ExpectedArm64ASM": [
"fmsub s2, s16, s18, s17",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmsub s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd132sd xmm0, xmm1, xmm2": {
@ -4920,10 +4920,10 @@
"Map 2 0b01 0x9d 128-bit"
],
"ExpectedArm64ASM": [
"fmsub d2, d16, d18, d17",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmsub d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub132ps xmm0, xmm1, xmm2": {
@ -4992,10 +4992,10 @@
"Map 2 0b01 0x9f 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd s2, s16, s18, s17",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmadd s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub132sd xmm0, xmm1, xmm2": {
@ -5004,10 +5004,10 @@
"Map 2 0b01 0x9f 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd d2, d16, d18, d17",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmadd d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd213ps xmm0, xmm1, xmm2": {
@ -5074,10 +5074,10 @@
"Map 2 0b01 0xa9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd s2, s17, s16, s18",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmadd s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd213sd xmm0, xmm1, xmm2": {
@ -5086,10 +5086,10 @@
"Map 2 0b01 0xa9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd d2, d17, d16, d18",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmadd d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub213ps xmm0, xmm1, xmm2": {
@ -5158,10 +5158,10 @@
"Map 2 0b01 0xab 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub s2, s17, s16, s18",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmsub s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub213sd xmm0, xmm1, xmm2": {
@ -5170,10 +5170,10 @@
"Map 2 0b01 0xab 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub d2, d17, d16, d18",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmsub d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd213ps xmm0, xmm1, xmm2": {
@ -5240,10 +5240,10 @@
"Map 2 0b01 0xad 128-bit"
],
"ExpectedArm64ASM": [
"fmsub s2, s17, s16, s18",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmsub s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd213sd xmm0, xmm1, xmm2": {
@ -5252,10 +5252,10 @@
"Map 2 0b01 0xad 128-bit"
],
"ExpectedArm64ASM": [
"fmsub d2, d17, d16, d18",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmsub d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub213ps xmm0, xmm1, xmm2": {
@ -5324,10 +5324,10 @@
"Map 2 0b01 0xaf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd s2, s17, s16, s18",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmadd s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub213sd xmm0, xmm1, xmm2": {
@ -5336,10 +5336,10 @@
"Map 2 0b01 0xaf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd d2, d17, d16, d18",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmadd d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd231ps xmm0, xmm1, xmm2": {
@ -5398,10 +5398,10 @@
"Map 2 0b01 0xb9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd s2, s17, s18, s16",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmadd s0, s17, s18, s16",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd231sd xmm0, xmm1, xmm2": {
@ -5410,10 +5410,10 @@
"Map 2 0b01 0xb9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd d2, d17, d18, d16",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmadd d0, d17, d18, d16",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub231ps xmm0, xmm1, xmm2": {
@ -5478,10 +5478,10 @@
"Map 2 0b01 0xbb 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub s2, s17, s18, s16",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmsub s0, s17, s18, s16",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub231sd xmm0, xmm1, xmm2": {
@ -5490,10 +5490,10 @@
"Map 2 0b01 0xbb 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub d2, d17, d18, d16",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmsub d0, d17, d18, d16",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd231ps xmm0, xmm1, xmm2": {
@ -5552,10 +5552,10 @@
"Map 2 0b01 0xbd 128-bit"
],
"ExpectedArm64ASM": [
"fmsub s2, s17, s18, s16",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmsub s0, s17, s18, s16",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd231sd xmm0, xmm1, xmm2": {
@ -5564,10 +5564,10 @@
"Map 2 0b01 0xbd 128-bit"
],
"ExpectedArm64ASM": [
"fmsub d2, d17, d18, d16",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmsub d0, d17, d18, d16",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub231ps xmm0, xmm1, xmm2": {
@ -5632,10 +5632,10 @@
"Map 2 0b01 0xbf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd s2, s17, s18, s16",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmadd s0, s17, s18, s16",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub231sd xmm0, xmm1, xmm2": {
@ -5644,10 +5644,10 @@
"Map 2 0b01 0xbf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd d2, d17, d18, d16",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmadd d0, d17, d18, d16",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmaddsub213ps xmm0, xmm1, xmm2": {

View File

@ -0,0 +1,294 @@
{
"Features": {
"Bitness": 64,
"EnabledHostFeatures": [
"AFP"
],
"DisabledHostFeatures": [
"SVE256",
"SVE128"
]
},
"Instructions": {
"vfmadd132ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0x99 128-bit"
],
"ExpectedArm64ASM": [
"fmadd s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd132sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0x99 128-bit"
],
"ExpectedArm64ASM": [
"fmadd d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub132ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0x9b 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub132sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0x9b 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd132ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0x9d 128-bit"
],
"ExpectedArm64ASM": [
"fmsub s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd132sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0x9d 128-bit"
],
"ExpectedArm64ASM": [
"fmsub d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub132ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0x9f 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub132sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0x9f 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd213ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0xa9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd213sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0xa9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub213ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0xab 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub213sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0xab 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd213ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0xad 128-bit"
],
"ExpectedArm64ASM": [
"fmsub s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd213sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0xad 128-bit"
],
"ExpectedArm64ASM": [
"fmsub d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub213ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0xaf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub213sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 4,
"Comment": [
"Map 2 0b01 0xaf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd231ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Comment": [
"Map 2 0b01 0xb9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd s16, s17, s18, s16",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd231sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Comment": [
"Map 2 0b01 0xb9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd d16, d17, d18, d16",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub231ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Comment": [
"Map 2 0b01 0xbb 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub s16, s17, s18, s16",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub231sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Comment": [
"Map 2 0b01 0xbb 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub d16, d17, d18, d16",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd231ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Comment": [
"Map 2 0b01 0xbd 128-bit"
],
"ExpectedArm64ASM": [
"fmsub s16, s17, s18, s16",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd231sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Comment": [
"Map 2 0b01 0xbd 128-bit"
],
"ExpectedArm64ASM": [
"fmsub d16, d17, d18, d16",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub231ss xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Comment": [
"Map 2 0b01 0xbf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd s16, s17, s18, s16",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub231sd xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 3,
"Comment": [
"Map 2 0b01 0xbf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd d16, d17, d18, d16",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
}
}
}

View File

@ -3044,10 +3044,10 @@
"Map 2 0b01 0x99 128-bit"
],
"ExpectedArm64ASM": [
"fmadd s2, s16, s18, s17",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmadd s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd132sd xmm0, xmm1, xmm2": {
@ -3056,10 +3056,10 @@
"Map 2 0b01 0x99 128-bit"
],
"ExpectedArm64ASM": [
"fmadd d2, d16, d18, d17",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmadd d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub132ps xmm0, xmm1, xmm2": {
@ -3126,10 +3126,10 @@
"Map 2 0b01 0x9b 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub s2, s16, s18, s17",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmsub s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub132sd xmm0, xmm1, xmm2": {
@ -3138,10 +3138,10 @@
"Map 2 0b01 0x9b 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub d2, d16, d18, d17",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmsub d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd132ps xmm0, xmm1, xmm2": {
@ -3208,10 +3208,10 @@
"Map 2 0b01 0x9d 128-bit"
],
"ExpectedArm64ASM": [
"fmsub s2, s16, s18, s17",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmsub s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd132sd xmm0, xmm1, xmm2": {
@ -3220,10 +3220,10 @@
"Map 2 0b01 0x9d 128-bit"
],
"ExpectedArm64ASM": [
"fmsub d2, d16, d18, d17",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmsub d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub132ps xmm0, xmm1, xmm2": {
@ -3290,10 +3290,10 @@
"Map 2 0b01 0x9f 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd s2, s16, s18, s17",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmadd s0, s16, s18, s17",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub132sd xmm0, xmm1, xmm2": {
@ -3302,10 +3302,10 @@
"Map 2 0b01 0x9f 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd d2, d16, d18, d17",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmadd d0, d16, d18, d17",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd213ps xmm0, xmm1, xmm2": {
@ -3372,10 +3372,10 @@
"Map 2 0b01 0xa9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd s2, s17, s16, s18",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmadd s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd213sd xmm0, xmm1, xmm2": {
@ -3384,10 +3384,10 @@
"Map 2 0b01 0xa9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd d2, d17, d16, d18",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmadd d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub213ps xmm0, xmm1, xmm2": {
@ -3454,10 +3454,10 @@
"Map 2 0b01 0xab 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub s2, s17, s16, s18",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmsub s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub213sd xmm0, xmm1, xmm2": {
@ -3466,10 +3466,10 @@
"Map 2 0b01 0xab 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub d2, d17, d16, d18",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmsub d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd213ps xmm0, xmm1, xmm2": {
@ -3536,10 +3536,10 @@
"Map 2 0b01 0xad 128-bit"
],
"ExpectedArm64ASM": [
"fmsub s2, s17, s16, s18",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmsub s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd213sd xmm0, xmm1, xmm2": {
@ -3548,10 +3548,10 @@
"Map 2 0b01 0xad 128-bit"
],
"ExpectedArm64ASM": [
"fmsub d2, d17, d16, d18",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmsub d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub213ps xmm0, xmm1, xmm2": {
@ -3618,10 +3618,10 @@
"Map 2 0b01 0xaf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd s2, s17, s16, s18",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmadd s0, s17, s16, s18",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub213sd xmm0, xmm1, xmm2": {
@ -3630,10 +3630,10 @@
"Map 2 0b01 0xaf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd d2, d17, d16, d18",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmadd d0, d17, d16, d18",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd231ps xmm0, xmm1, xmm2": {
@ -3692,10 +3692,10 @@
"Map 2 0b01 0xb9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd s2, s17, s18, s16",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmadd s0, s17, s18, s16",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmadd231sd xmm0, xmm1, xmm2": {
@ -3704,10 +3704,10 @@
"Map 2 0b01 0xb9 128-bit"
],
"ExpectedArm64ASM": [
"fmadd d2, d17, d18, d16",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmadd d0, d17, d18, d16",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub231ps xmm0, xmm1, xmm2": {
@ -3766,10 +3766,10 @@
"Map 2 0b01 0xbb 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub s2, s17, s18, s16",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmsub s0, s17, s18, s16",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmsub231sd xmm0, xmm1, xmm2": {
@ -3778,10 +3778,10 @@
"Map 2 0b01 0xbb 128-bit"
],
"ExpectedArm64ASM": [
"fnmsub d2, d17, d18, d16",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmsub d0, d17, d18, d16",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd231ps xmm0, xmm1, xmm2": {
@ -3840,10 +3840,10 @@
"Map 2 0b01 0xbd 128-bit"
],
"ExpectedArm64ASM": [
"fmsub s2, s17, s18, s16",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fmsub s0, s17, s18, s16",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmadd231sd xmm0, xmm1, xmm2": {
@ -3852,10 +3852,10 @@
"Map 2 0b01 0xbd 128-bit"
],
"ExpectedArm64ASM": [
"fmsub d2, d17, d18, d16",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fmsub d0, d17, d18, d16",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub231ps xmm0, xmm1, xmm2": {
@ -3914,10 +3914,10 @@
"Map 2 0b01 0xbf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd s2, s17, s18, s16",
"movi v3.2d, #0x0",
"mov v16.s[0], v2.s[0]",
"str q3, [x28, #16]"
"fnmadd s0, s17, s18, s16",
"mov v16.s[0], v0.s[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfnmsub231sd xmm0, xmm1, xmm2": {
@ -3926,10 +3926,10 @@
"Map 2 0b01 0xbf 128-bit"
],
"ExpectedArm64ASM": [
"fnmadd d2, d17, d18, d16",
"movi v3.2d, #0x0",
"mov v16.d[0], v2.d[0]",
"str q3, [x28, #16]"
"fnmadd d0, d17, d18, d16",
"mov v16.d[0], v0.d[0]",
"movi v2.2d, #0x0",
"str q2, [x28, #16]"
]
},
"vfmaddsub213ps xmm0, xmm1, xmm2": {