From d622bef31d11a5a6429fe7fad557c9b111e96f69 Mon Sep 17 00:00:00 2001 From: Hao Liu Date: Thu, 10 Oct 2013 15:01:24 +0000 Subject: [PATCH] Implement AArch64 vector load/store multiple N-element structure class SIMD(lselem). Including following 14 instructions: 4 ld1 insts: load multiple 1-element structure to sequential 1/2/3/4 registers. ld2/ld3/ld4: load multiple N-element structure to sequential N registers (N=2,3,4). 4 st1 insts: store multiple 1-element structure from sequential 1/2/3/4 registers. st2/st3/st4: store multiple N-element structure from sequential N registers (N = 2,3,4). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192352 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/ValueTypes.h | 2 +- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 298 ++++ lib/Target/AArch64/AArch64ISelLowering.cpp | 54 + lib/Target/AArch64/AArch64ISelLowering.h | 4 + lib/Target/AArch64/AArch64InstrFormats.td | 18 + lib/Target/AArch64/AArch64InstrNEON.td | 126 ++ lib/Target/AArch64/AArch64RegisterInfo.td | 101 ++ .../AArch64/AsmParser/AArch64AsmParser.cpp | 166 ++- .../Disassembler/AArch64Disassembler.cpp | 53 + .../InstPrinter/AArch64InstPrinter.cpp | 30 + .../AArch64/InstPrinter/AArch64InstPrinter.h | 3 + lib/Target/AArch64/Utils/AArch64BaseInfo.h | 44 + .../AArch64/neon-simd-ldst-multi-elem.ll | 1228 +++++++++++++++++ test/MC/AArch64/neon-diagnostics.s | 221 +++ test/MC/AArch64/neon-simd-ldst-multi-elem.s | 463 +++++++ 15 files changed, 2808 insertions(+), 3 deletions(-) create mode 100644 test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll create mode 100644 test/MC/AArch64/neon-simd-ldst-multi-elem.s diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h index 2e8f637f522..79f323341fd 100644 --- a/include/llvm/CodeGen/ValueTypes.h +++ b/include/llvm/CodeGen/ValueTypes.h @@ -208,7 +208,7 @@ namespace llvm { bool is64BitVector() const { return (SimpleTy == MVT::v8i8 || SimpleTy == MVT::v4i16 || SimpleTy == MVT::v2i32 || SimpleTy == MVT::v1i64 || - SimpleTy == MVT::v2f32); + SimpleTy == MVT::v1f64 || SimpleTy == MVT::v2f32); } /// is128BitVector - Return true if this is a 128-bit vector type. diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index a8655649f19..3b0dd64ee44 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -109,6 +109,23 @@ public: SDNode* Select(SDNode*); private: + /// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4. + SDNode *SelectVLD(SDNode *N, unsigned NumVecs, const uint16_t *Opcode); + + /// Select NEON store intrinsics. NumVecs should be 1, 2, 3 or 4. + SDNode *SelectVST(SDNode *N, unsigned NumVecs, const uint16_t *Opcodes); + + // Form pairs of consecutive 64-bit/128-bit registers. + SDNode *createDPairNode(SDValue V0, SDValue V1); + SDNode *createQPairNode(SDValue V0, SDValue V1); + + // Form sequences of 3 consecutive 64-bit/128-bit registers. + SDNode *createDTripleNode(SDValue V0, SDValue V1, SDValue V2); + SDNode *createQTripleNode(SDValue V0, SDValue V1, SDValue V2); + + // Form sequences of 4 consecutive 64-bit/128-bit registers. + SDNode *createDQuadNode(SDValue V0, SDValue V1, SDValue V2, SDValue V3); + SDNode *createQQuadNode(SDValue V0, SDValue V1, SDValue V2, SDValue V3); }; } @@ -390,6 +407,221 @@ SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8, &Ops[0], Ops.size()); } +SDNode *AArch64DAGToDAGISel::createDPairNode(SDValue V0, SDValue V1) { + SDLoc dl(V0.getNode()); + SDValue RegClass = + CurDAG->getTargetConstant(AArch64::DPairRegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::dsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::dsub_1, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::v2i64, + Ops); +} + +SDNode *AArch64DAGToDAGISel::createQPairNode(SDValue V0, SDValue V1) { + SDLoc dl(V0.getNode()); + SDValue RegClass = + CurDAG->getTargetConstant(AArch64::QPairRegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::qsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::qsub_1, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::v4i64, + Ops); +} + +SDNode *AArch64DAGToDAGISel::createDTripleNode(SDValue V0, SDValue V1, + SDValue V2) { + SDLoc dl(V0.getNode()); + SDValue RegClass = + CurDAG->getTargetConstant(AArch64::DTripleRegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::dsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::dsub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(AArch64::dsub_2, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, + Ops); +} + +SDNode *AArch64DAGToDAGISel::createQTripleNode(SDValue V0, SDValue V1, + SDValue V2) { + SDLoc dl(V0.getNode()); + SDValue RegClass = + CurDAG->getTargetConstant(AArch64::QTripleRegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::qsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::qsub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(AArch64::qsub_2, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, + Ops); +} + +SDNode *AArch64DAGToDAGISel::createDQuadNode(SDValue V0, SDValue V1, SDValue V2, + SDValue V3) { + SDLoc dl(V0.getNode()); + SDValue RegClass = + CurDAG->getTargetConstant(AArch64::DQuadRegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::dsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::dsub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(AArch64::dsub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(AArch64::dsub_3, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, + SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::v4i64, + Ops); +} + +SDNode *AArch64DAGToDAGISel::createQQuadNode(SDValue V0, SDValue V1, SDValue V2, + SDValue V3) { + SDLoc dl(V0.getNode()); + SDValue RegClass = + CurDAG->getTargetConstant(AArch64::QQuadRegClassID, MVT::i32); + SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::qsub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::qsub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(AArch64::qsub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(AArch64::qsub_3, MVT::i32); + const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, + SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::v8i64, + Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + + EVT VT = N->getValueType(0); + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vector load type"); + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + case MVT::v1f64: + case MVT::v1i64: OpcodeIndex = 3; break; + case MVT::v16i8: OpcodeIndex = 4; break; + case MVT::v8f16: + case MVT::v8i16: OpcodeIndex = 5; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 6; break; + case MVT::v2f64: + case MVT::v2i64: OpcodeIndex = 7; break; + } + unsigned Opc = Opcodes[OpcodeIndex]; + + SmallVector Ops; + Ops.push_back(N->getOperand(2)); // Push back the Memory Address + Ops.push_back(N->getOperand(0)); // Push back the Chain + + std::vector ResTys; + bool is64BitVector = VT.is64BitVector(); + + if (NumVecs == 1) + ResTys.push_back(VT); + else if (NumVecs == 3) + ResTys.push_back(MVT::Untyped); + else { + EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, + is64BitVector ? NumVecs : NumVecs * 2); + ResTys.push_back(ResTy); + } + + ResTys.push_back(MVT::Other); // Type of the Chain + SDLoc dl(N); + SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(VLd)->setMemRefs(MemOp, MemOp + 1); + + if (NumVecs == 1) + return VLd; + + // If NumVecs > 1, the return result is a super register containing 2-4 + // consecutive vector registers. + SDValue SuperReg = SDValue(VLd, 0); + + unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); + // Update users of the Chain + ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); + + return NULL; +} + +SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, + const uint16_t *Opcodes) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + SDLoc dl(N); + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + + unsigned Vec0Idx = 3; + EVT VT = N->getOperand(Vec0Idx).getValueType(); + unsigned OpcodeIndex; + switch (VT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("unhandled vector store type"); + case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v2f32: + case MVT::v2i32: OpcodeIndex = 2; break; + case MVT::v1f64: + case MVT::v1i64: OpcodeIndex = 3; break; + case MVT::v16i8: OpcodeIndex = 4; break; + case MVT::v8f16: + case MVT::v8i16: OpcodeIndex = 5; break; + case MVT::v4f32: + case MVT::v4i32: OpcodeIndex = 6; break; + case MVT::v2f64: + case MVT::v2i64: OpcodeIndex = 7; break; + } + unsigned Opc = Opcodes[OpcodeIndex]; + + std::vector ResTys; + ResTys.push_back(MVT::Other); // Type for the Chain + + SmallVector Ops; + Ops.push_back(N->getOperand(2)); // Push back the Memory Address + + bool is64BitVector = VT.is64BitVector(); + + SDValue V0 = N->getOperand(Vec0Idx + 0); + SDValue SrcReg; + if (NumVecs == 1) + SrcReg = V0; + else { + SDValue V1 = N->getOperand(Vec0Idx + 1); + if (NumVecs == 2) + SrcReg = is64BitVector ? SDValue(createDPairNode(V0, V1), 0) + : SDValue(createQPairNode(V0, V1), 0); + else { + SDValue V2 = N->getOperand(Vec0Idx + 2); + if (NumVecs == 3) + SrcReg = is64BitVector ? SDValue(createDTripleNode(V0, V1, V2), 0) + : SDValue(createQTripleNode(V0, V1, V2), 0); + else { + SDValue V3 = N->getOperand(Vec0Idx + 3); + SrcReg = is64BitVector ? SDValue(createDQuadNode(V0, V1, V2, V3), 0) + : SDValue(createQQuadNode(V0, V1, V2, V3), 0); + } + } + } + Ops.push_back(SrcReg); + + // Push back the Chain + Ops.push_back(N->getOperand(0)); + + // Transfer memoperands. + SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + cast(VSt)->setMemRefs(MemOp, MemOp + 1); + + return VSt; +} + SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { // Dump information about the Node being selected DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n"); @@ -536,6 +768,72 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { Node = ResNode; break; } + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: + break; + + case Intrinsic::arm_neon_vld1: { + static const uint16_t Opcodes[] = { AArch64::LD1_8B, AArch64::LD1_4H, + AArch64::LD1_2S, AArch64::LD1_1D, + AArch64::LD1_16B, AArch64::LD1_8H, + AArch64::LD1_4S, AArch64::LD1_2D }; + return SelectVLD(Node, 1, Opcodes); + } + case Intrinsic::arm_neon_vld2: { + static const uint16_t Opcodes[] = { AArch64::LD2_8B, AArch64::LD2_4H, + AArch64::LD2_2S, AArch64::LD1_2V_1D, + AArch64::LD2_16B, AArch64::LD2_8H, + AArch64::LD2_4S, AArch64::LD2_2D }; + return SelectVLD(Node, 2, Opcodes); + } + case Intrinsic::arm_neon_vld3: { + static const uint16_t Opcodes[] = { AArch64::LD3_8B, AArch64::LD3_4H, + AArch64::LD3_2S, AArch64::LD1_3V_1D, + AArch64::LD3_16B, AArch64::LD3_8H, + AArch64::LD3_4S, AArch64::LD3_2D }; + return SelectVLD(Node, 3, Opcodes); + } + case Intrinsic::arm_neon_vld4: { + static const uint16_t Opcodes[] = { AArch64::LD4_8B, AArch64::LD4_4H, + AArch64::LD4_2S, AArch64::LD1_4V_1D, + AArch64::LD4_16B, AArch64::LD4_8H, + AArch64::LD4_4S, AArch64::LD4_2D }; + return SelectVLD(Node, 4, Opcodes); + } + case Intrinsic::arm_neon_vst1: { + static const uint16_t Opcodes[] = { AArch64::ST1_8B, AArch64::ST1_4H, + AArch64::ST1_2S, AArch64::ST1_1D, + AArch64::ST1_16B, AArch64::ST1_8H, + AArch64::ST1_4S, AArch64::ST1_2D }; + return SelectVST(Node, 1, Opcodes); + } + case Intrinsic::arm_neon_vst2: { + static const uint16_t Opcodes[] = { AArch64::ST2_8B, AArch64::ST2_4H, + AArch64::ST2_2S, AArch64::ST1_2V_1D, + AArch64::ST2_16B, AArch64::ST2_8H, + AArch64::ST2_4S, AArch64::ST2_2D }; + return SelectVST(Node, 2, Opcodes); + } + case Intrinsic::arm_neon_vst3: { + static const uint16_t Opcodes[] = { AArch64::ST3_8B, AArch64::ST3_4H, + AArch64::ST3_2S, AArch64::ST1_3V_1D, + AArch64::ST3_16B, AArch64::ST3_8H, + AArch64::ST3_4S, AArch64::ST3_2D }; + return SelectVST(Node, 3, Opcodes); + } + case Intrinsic::arm_neon_vst4: { + static const uint16_t Opcodes[] = { AArch64::ST4_8B, AArch64::ST4_4H, + AArch64::ST4_2S, AArch64::ST1_4V_1D, + AArch64::ST4_16B, AArch64::ST4_8H, + AArch64::ST4_4S, AArch64::ST4_2D }; + return SelectVST(Node, 4, Opcodes); + } + } + break; + } default: break; // Let generic code handle it } diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index d70548aa357..d89213c80d7 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3681,3 +3681,57 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( // constraint into a member of a register class. return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } + +/// Represent NEON load and store intrinsics as MemIntrinsicNodes. +/// The associated MachineMemOperands record the alignment specified +/// in the intrinsic calls. +bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast(AlignArg)->getZExtValue(); + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); + Info.align = cast(AlignArg)->getZExtValue(); + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + default: + break; + } + + return false; +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index 3e309a95564..da7f62361ba 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -281,6 +281,10 @@ public: std::pair getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const; + + virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const LLVM_OVERRIDE; + private: const InstrItineraryData *Itins; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 5781578a552..ab4d083e0eb 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -1194,5 +1194,23 @@ class NeonI_Scalar2SameMisc size, bits<5> opcode, dag outs, dag i // Inherit Rd in 4-0 } +// Format AdvSIMD vector load/store multiple N-element structure +class NeonI_LdStMult opcode, bits<2> size, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRtn +{ + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29-23} = 0b0011000; + let Inst{22} = l; + let Inst{21-16} = 0b000000; + let Inst{15-12} = opcode; + let Inst{11-10} = size; + + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + } diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td index a9f60619a21..355de537686 100644 --- a/lib/Target/AArch64/AArch64InstrNEON.td +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -2982,6 +2982,132 @@ defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", // End of implementation for instruction class (3V Diff) +// The followings are vector load/store multiple N-element structure +// (class SIMD lselem). + +// ld1: load multiple 1-element structure to 1/2/3/4 registers. +// ld2/ld3/ld4: load multiple N-element structure to N registers (N = 2, 3, 4). +// The structure consists of a sequence of sets of N values. +// The first element of the structure is placed in the first lane +// of the first first vector, the second element in the first lane +// of the second vector, and so on. +// E.g. LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into +// the three 64-bit vectors list {BA, DC, FE}. +// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three +// 64-bit vectors list {DA, EB, FC}. +// Store instructions store multiple structure to N registers like load. + + +class NeonI_LDVList opcode, bits<2> size, + RegisterOperand VecList, string asmop> + : NeonI_LdStMult { + let mayLoad = 1; + let neverHasSideEffects = 1; +} + +multiclass LDVList_BHSD opcode, string List, string asmop> { + def _8B : NeonI_LDVList<0, opcode, 0b00, + !cast(List # "8B_operand"), asmop>; + + def _4H : NeonI_LDVList<0, opcode, 0b01, + !cast(List # "4H_operand"), asmop>; + + def _2S : NeonI_LDVList<0, opcode, 0b10, + !cast(List # "2S_operand"), asmop>; + + def _16B : NeonI_LDVList<1, opcode, 0b00, + !cast(List # "16B_operand"), asmop>; + + def _8H : NeonI_LDVList<1, opcode, 0b01, + !cast(List # "8H_operand"), asmop>; + + def _4S : NeonI_LDVList<1, opcode, 0b10, + !cast(List # "4S_operand"), asmop>; + + def _2D : NeonI_LDVList<1, opcode, 0b11, + !cast(List # "2D_operand"), asmop>; +} + +// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4) +defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">; +def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">; + +defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">; + +defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">; + +defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">; + +// Load multiple 1-element structure to N consecutive registers (N = 2,3,4) +defm LD1_2V : LDVList_BHSD<0b1010, "VPair", "ld1">; +def LD1_2V_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">; + +defm LD1_3V : LDVList_BHSD<0b0110, "VTriple", "ld1">; +def LD1_3V_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">; + +defm LD1_4V : LDVList_BHSD<0b0010, "VQuad", "ld1">; +def LD1_4V_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">; + +class NeonI_STVList opcode, bits<2> size, + RegisterOperand VecList, string asmop> + : NeonI_LdStMult { + let mayStore = 1; + let neverHasSideEffects = 1; +} + +multiclass STVList_BHSD opcode, string List, string asmop> { + def _8B : NeonI_STVList<0, opcode, 0b00, + !cast(List # "8B_operand"), asmop>; + + def _4H : NeonI_STVList<0, opcode, 0b01, + !cast(List # "4H_operand"), asmop>; + + def _2S : NeonI_STVList<0, opcode, 0b10, + !cast(List # "2S_operand"), asmop>; + + def _16B : NeonI_STVList<1, opcode, 0b00, + !cast(List # "16B_operand"), asmop>; + + def _8H : NeonI_STVList<1, opcode, 0b01, + !cast(List # "8H_operand"), asmop>; + + def _4S : NeonI_STVList<1, opcode, 0b10, + !cast(List # "4S_operand"), asmop>; + + def _2D : NeonI_STVList<1, opcode, 0b11, + !cast(List # "2D_operand"), asmop>; +} + +// Store multiple N-element structures from N registers (N = 1,2,3,4) +defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">; +def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">; + +defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">; + +defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">; + +defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">; + +// Store multiple 1-element structures from N consecutive registers (N = 2,3,4) +defm ST1_2V : STVList_BHSD<0b1010, "VPair", "st1">; +def ST1_2V_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; + +defm ST1_3V : STVList_BHSD<0b0110, "VTriple", "st1">; +def ST1_3V_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; + +defm ST1_4V : STVList_BHSD<0b0010, "VQuad", "st1">; +def ST1_4V_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; + +// End of vector load/store multiple N-element structure(class SIMD lselem) + // Scalar Arithmetic class NeonI_Scalar3Same_D_size opcode, string asmop> diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index b7a6acb348e..5e2b1963977 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -17,6 +17,20 @@ def sub_64 : SubRegIndex<64>; def sub_32 : SubRegIndex<32>; def sub_16 : SubRegIndex<16>; def sub_8 : SubRegIndex<8>; + +// Note: Code depends on these having consecutive numbers. +def qqsub : SubRegIndex<256, 256>; + +def qsub_0 : SubRegIndex<128>; +def qsub_1 : SubRegIndex<128, 128>; +def qsub_2 : ComposedSubRegIndex; +def qsub_3 : ComposedSubRegIndex; + +def dsub_0 : SubRegIndex<64>; +def dsub_1 : SubRegIndex<64, 64>; +def dsub_2 : ComposedSubRegIndex; +def dsub_3 : ComposedSubRegIndex; +def dsub_4 : ComposedSubRegIndex; } // Registers are identified with 5-bit ID numbers. @@ -188,3 +202,90 @@ def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { let CopyCost = -1; let isAllocatable = 0; } + +//===----------------------------------------------------------------------===// +// Consecutive vector registers +//===----------------------------------------------------------------------===// +// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D30_D31 +def Tuples2D : RegisterTuples<[dsub_0, dsub_1], + [(rotl FPR64, 0), (rotl FPR64, 1)]>; + +// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1 +def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2)]>; + +// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2 +def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2), (rotl FPR64, 3)]>; + +// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31 +def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], + [(rotl FPR128, 0), (rotl FPR128, 1)]>; + +// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1 +def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2)]>; + +// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2 +def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2), (rotl FPR128, 3)]>; + +// The followings are super register classes to model 2/3/4 consecutive +// 64-bit/128-bit registers. + +def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>; + +def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> { + let Size = 192; // 3 x 64 bits, we have no predefined type of that size. +} + +def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>; + +def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>; + +def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> { + let Size = 384; // 3 x 128 bits, we have no predefined type of that size. +} + +def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>; + + +// The followings are vector list operands +multiclass VectorList_operands { + def _asmoperand : AsmOperandClass { + let Name = PREFIX # LAYOUT # Count; + let RenderMethod = "addVectorListOperands"; + let PredicateMethod = + "isVectorList"; + let ParserMethod = "ParseVectorList"; + } + + def _operand : RegisterOperand"> { + let ParserMatchClass = + !cast(PREFIX # LAYOUT # "_asmoperand"); + } +} + +multiclass VectorList_BHSD { + defm 8B : VectorList_operands; + defm 4H : VectorList_operands; + defm 2S : VectorList_operands; + defm 1D : VectorList_operands; + defm 16B : VectorList_operands; + defm 8H : VectorList_operands; + defm 4S : VectorList_operands; + defm 2D : VectorList_operands; +} + +// Vector list operand with 1/2/3/4 registers: VOne8B_operand,..., VQuad2D_operand +defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>; +defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>; +defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>; +defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>; \ No newline at end of file diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 51638d90e3c..127c7eca0ee 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -127,6 +127,11 @@ public: OperandMatchResultTy ParseSysRegOperand(SmallVectorImpl &Operands); + bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout, + SMLoc &LayoutLoc); + + OperandMatchResultTy ParseVectorList(SmallVectorImpl &); + bool validateInstruction(MCInst &Inst, const SmallVectorImpl &Operands); @@ -154,6 +159,7 @@ private: k_Immediate, // Including expressions referencing symbols k_Register, k_ShiftExtend, + k_VectorList, // A sequential list of 1 to 4 registers. k_SysReg, // The register operand of MRS and MSR instructions k_Token, // The mnemonic; other raw tokens the auto-generated k_WrappedRegister // Load/store exclusive permit a wrapped register. @@ -189,6 +195,13 @@ private: bool ImplicitAmount; }; + // A vector register list is a sequential list of 1 to 4 registers. + struct VectorListOp { + unsigned RegNum; + unsigned Count; + A64Layout::VectorLayout Layout; + }; + struct SysRegOp { const char *Data; unsigned Length; @@ -206,6 +219,7 @@ private: struct ImmOp Imm; struct RegOp Reg; struct ShiftExtendOp ShiftExtend; + struct VectorListOp VectorList; struct SysRegOp SysReg; struct TokOp Tok; }; @@ -717,6 +731,12 @@ public: return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16; } + template + bool isVectorList() const { + return Kind == k_VectorList && VectorList.Layout == Layout && + VectorList.Count == Count; + } + template bool isSImm7Scaled() const { if (!isImm()) return false; @@ -837,6 +857,18 @@ public: return Op; } + static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count, + A64Layout::VectorLayout Layout, + SMLoc S, SMLoc E) { + AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.Layout = Layout; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + static AArch64Operand *CreateToken(StringRef Str, SMLoc S) { AArch64Operand *Op = new AArch64Operand(k_Token, S, S); Op->Tok.Data = Str.data(); @@ -1184,6 +1216,11 @@ public: } Inst.addOperand(MCOperand::CreateImm(Imm)); } + + void addVectorListOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum)); + } }; } // end anonymous namespace. @@ -1223,7 +1260,6 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl &Operands, else return MatchOperand_Success; } - // ... or it might be a symbolish thing } // Fall through @@ -1267,7 +1303,7 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl &Operands, return ParseOperand(Operands, Mnemonic); } // The following will likely be useful later, but not in very early cases - case AsmToken::LCurly: // Weird SIMD lists + case AsmToken::LCurly: // SIMD vector list is not parsed here llvm_unreachable("Don't know how to deal with '{' in operand"); return MatchOperand_ParseFail; } @@ -1890,6 +1926,132 @@ AArch64AsmParser::ParseShiftExtend( return MatchOperand_Success; } +/// Try to parse a vector register token, If it is a vector register, +/// the token is eaten and return true. Otherwise return false. +bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, + StringRef &Layout, SMLoc &LayoutLoc) { + bool IsVector = true; + + if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc)) + IsVector = false; + + if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains(RegNum) && + !AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(RegNum)) + IsVector = false; + + if (Layout.size() == 0) + IsVector = false; + + if (!IsVector) + Error(Parser.getTok().getLoc(), "expected vector type register"); + + Parser.Lex(); // Eat this token. + return IsVector; +} + + +// A vector list contains 1-4 consecutive registers. +// Now there are two kinds of vector list when number of vector > 1: +// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout} +// (2) {Vn.layout - Vm.layout} +AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList( + SmallVectorImpl &Operands) { + if (Parser.getTok().isNot(AsmToken::LCurly)) { + Error(Parser.getTok().getLoc(), "'{' expected"); + return MatchOperand_ParseFail; + } + SMLoc SLoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat '{' token. + + unsigned Reg, Count = 1; + StringRef LayoutStr; + SMLoc RegEndLoc, LayoutLoc; + if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc)) + return MatchOperand_ParseFail; + + if (Parser.getTok().is(AsmToken::Minus)) { + Parser.Lex(); // Eat the minus. + + unsigned Reg2; + StringRef LayoutStr2; + SMLoc RegEndLoc2, LayoutLoc2; + SMLoc RegLoc2 = Parser.getTok().getLoc(); + + if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2)) + return MatchOperand_ParseFail; + unsigned Space = (Reg < Reg2) ? (Reg2 - Reg) : (Reg2 + 32 - Reg); + + if (LayoutStr != LayoutStr2) { + Error(LayoutLoc2, "expected the same vector layout"); + return MatchOperand_ParseFail; + } + if (Space == 0 || Space > 3) { + Error(RegLoc2, "invalid number of vectors"); + return MatchOperand_ParseFail; + } + + Count += Space; + } else { + unsigned LastReg = Reg; + while (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + unsigned Reg2; + StringRef LayoutStr2; + SMLoc RegEndLoc2, LayoutLoc2; + SMLoc RegLoc2 = Parser.getTok().getLoc(); + + if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2)) + return MatchOperand_ParseFail; + unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg) + : (Reg2 + 32 - LastReg); + Count++; + + // The space between two vectors should be 1. And they should have the same layout. + // Total count shouldn't be great than 4 + if (Space != 1) { + Error(RegLoc2, "invalid space between two vectors"); + return MatchOperand_ParseFail; + } + if (LayoutStr != LayoutStr2) { + Error(LayoutLoc2, "expected the same vector layout"); + return MatchOperand_ParseFail; + } + if (Count > 4) { + Error(RegLoc2, "invalid number of vectors"); + return MatchOperand_ParseFail; + } + + LastReg = Reg2; + } + } + + if (Parser.getTok().isNot(AsmToken::RCurly)) { + Error(Parser.getTok().getLoc(), "'}' expected"); + return MatchOperand_ParseFail; + } + SMLoc ELoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat '}' token. + + A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr); + if (Count > 1) { // If count > 1, create vector list using super register. + bool IsVec64 = (Layout < A64Layout::_16B) ? true : false; + static unsigned SupRegIDs[3][2] = { + { AArch64::QPairRegClassID, AArch64::DPairRegClassID }, + { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID }, + { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID } + }; + unsigned SupRegID = SupRegIDs[Count - 2][static_cast(IsVec64)]; + unsigned Sub0 = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0; + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + Reg = MRI->getMatchingSuperReg(Reg, Sub0, + &AArch64MCRegisterClasses[SupRegID]); + } + Operands.push_back( + AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc)); + + return MatchOperand_Success; +} + // FIXME: We would really like to be able to tablegen'erate this. bool AArch64AsmParser:: validateInstruction(MCInst &Inst, diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index b9d7c1684d5..16ec0cbac00 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -361,6 +361,59 @@ DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder); } +static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo, + unsigned RegID, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, RegID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID, + Decoder); +} + +static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID, + Decoder); +} + +static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID, + Decoder); +} + +static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID, + Decoder); +} + +static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::DQuadRegClassID, + Decoder); +} + +static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID, + Decoder); +} + static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, unsigned OptionHiS, uint64_t Address, diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 26bd797e3b3..51335e145b7 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -507,3 +507,33 @@ void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI, O << "#0x"; O.write_hex(Mask); } + +// If Count > 1, there are two valid kinds of vector list: +// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout} +// (2) {Vn.layout - Vm.layout} +// We choose the first kind as output. +template +void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors"); + + unsigned Reg = MI->getOperand(OpNum).getReg(); + std::string LayoutStr = A64VectorLayoutToString(Layout); + O << "{"; + if (Count > 1) { // Print sub registers separately + bool IsVec64 = (Layout < A64Layout::_16B) ? true : false; + unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0; + for (unsigned I = 0; I < Count; I++) { + std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++)); + Name[0] = 'v'; + O << Name << LayoutStr; + if (I != Count - 1) + O << ", "; + } + } else { // Print the register directly when NumVecs is 1. + std::string Name = getRegisterName(Reg); + Name[0] = 'v'; + O << Name << LayoutStr; + } + O << "}"; +} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 71c9f4a0291..28ebfc45f1f 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -174,6 +174,9 @@ public: raw_ostream &O); void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + template + void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); }; } diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index e675efc9d9a..7db52381813 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -306,6 +306,50 @@ namespace A64SE { }; } +namespace A64Layout { + enum VectorLayout { + Invalid = -1, + _8B, + _4H, + _2S, + _1D, + + _16B, + _8H, + _4S, + _2D + }; +} + +inline static const char * +A64VectorLayoutToString(A64Layout::VectorLayout Layout) { + switch (Layout) { + case A64Layout::_8B: return ".8b"; + case A64Layout::_4H: return ".4h"; + case A64Layout::_2S: return ".2s"; + case A64Layout::_1D: return ".1d"; + case A64Layout::_16B: return ".16b"; + case A64Layout::_8H: return ".8h"; + case A64Layout::_4S: return ".4s"; + case A64Layout::_2D: return ".2d"; + default: llvm_unreachable("Unknown Vector Layout"); + } +} + +inline static A64Layout::VectorLayout +A64StringToVectorLayout(StringRef LayoutStr) { + return StringSwitch(LayoutStr) + .Case(".8b", A64Layout::_8B) + .Case(".4h", A64Layout::_4H) + .Case(".2s", A64Layout::_2S) + .Case(".1d", A64Layout::_1D) + .Case(".16b", A64Layout::_16B) + .Case(".8h", A64Layout::_8H) + .Case(".4s", A64Layout::_4S) + .Case(".2d", A64Layout::_2D) + .Default(A64Layout::Invalid); +} + namespace A64SysReg { enum SysRegROValues { MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 diff --git a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll new file mode 100644 index 00000000000..4cd76bcb00b --- /dev/null +++ b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll @@ -0,0 +1,1228 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +%struct.int8x16x2_t = type { [2 x <16 x i8>] } +%struct.int16x8x2_t = type { [2 x <8 x i16>] } +%struct.int32x4x2_t = type { [2 x <4 x i32>] } +%struct.int64x2x2_t = type { [2 x <2 x i64>] } +%struct.float32x4x2_t = type { [2 x <4 x float>] } +%struct.float64x2x2_t = type { [2 x <2 x double>] } +%struct.int8x8x2_t = type { [2 x <8 x i8>] } +%struct.int16x4x2_t = type { [2 x <4 x i16>] } +%struct.int32x2x2_t = type { [2 x <2 x i32>] } +%struct.int64x1x2_t = type { [2 x <1 x i64>] } +%struct.float32x2x2_t = type { [2 x <2 x float>] } +%struct.float64x1x2_t = type { [2 x <1 x double>] } +%struct.int8x16x3_t = type { [3 x <16 x i8>] } +%struct.int16x8x3_t = type { [3 x <8 x i16>] } +%struct.int32x4x3_t = type { [3 x <4 x i32>] } +%struct.int64x2x3_t = type { [3 x <2 x i64>] } +%struct.float32x4x3_t = type { [3 x <4 x float>] } +%struct.float64x2x3_t = type { [3 x <2 x double>] } +%struct.int8x8x3_t = type { [3 x <8 x i8>] } +%struct.int16x4x3_t = type { [3 x <4 x i16>] } +%struct.int32x2x3_t = type { [3 x <2 x i32>] } +%struct.int64x1x3_t = type { [3 x <1 x i64>] } +%struct.float32x2x3_t = type { [3 x <2 x float>] } +%struct.float64x1x3_t = type { [3 x <1 x double>] } +%struct.int8x16x4_t = type { [4 x <16 x i8>] } +%struct.int16x8x4_t = type { [4 x <8 x i16>] } +%struct.int32x4x4_t = type { [4 x <4 x i32>] } +%struct.int64x2x4_t = type { [4 x <2 x i64>] } +%struct.float32x4x4_t = type { [4 x <4 x float>] } +%struct.float64x2x4_t = type { [4 x <2 x double>] } +%struct.int8x8x4_t = type { [4 x <8 x i8>] } +%struct.int16x4x4_t = type { [4 x <4 x i16>] } +%struct.int32x2x4_t = type { [4 x <2 x i32>] } +%struct.int64x1x4_t = type { [4 x <1 x i64>] } +%struct.float32x2x4_t = type { [4 x <2 x float>] } +%struct.float64x1x4_t = type { [4 x <1 x double>] } + + +define <16 x i8> @test_vld1q_s8(i8* readonly %a) { +; CHECK: test_vld1q_s8 +; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] + %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1) + ret <16 x i8> %vld1 +} + +define <8 x i16> @test_vld1q_s16(i16* readonly %a) { +; CHECK: test_vld1q_s16 +; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2) + ret <8 x i16> %vld1 +} + +define <4 x i32> @test_vld1q_s32(i32* readonly %a) { +; CHECK: test_vld1q_s32 +; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4) + ret <4 x i32> %vld1 +} + +define <2 x i64> @test_vld1q_s64(i64* readonly %a) { +; CHECK: test_vld1q_s64 +; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8) + ret <2 x i64> %vld1 +} + +define <4 x float> @test_vld1q_f32(float* readonly %a) { +; CHECK: test_vld1q_f32 +; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4) + ret <4 x float> %vld1 +} + +define <2 x double> @test_vld1q_f64(double* readonly %a) { +; CHECK: test_vld1q_f64 +; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld1 = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %1, i32 8) + ret <2 x double> %vld1 +} + +define <8 x i8> @test_vld1_s8(i8* readonly %a) { +; CHECK: test_vld1_s8 +; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1) + ret <8 x i8> %vld1 +} + +define <4 x i16> @test_vld1_s16(i16* readonly %a) { +; CHECK: test_vld1_s16 +; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2) + ret <4 x i16> %vld1 +} + +define <2 x i32> @test_vld1_s32(i32* readonly %a) { +; CHECK: test_vld1_s32 +; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4) + ret <2 x i32> %vld1 +} + +define <1 x i64> @test_vld1_s64(i64* readonly %a) { +; CHECK: test_vld1_s64 +; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8) + ret <1 x i64> %vld1 +} + +define <2 x float> @test_vld1_f32(float* readonly %a) { +; CHECK: test_vld1_f32 +; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %1, i32 4) + ret <2 x float> %vld1 +} + +define <1 x double> @test_vld1_f64(double* readonly %a) { +; CHECK: test_vld1_f64 +; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8) + ret <1 x double> %vld1 +} + +define <8 x i8> @test_vld1_p8(i8* readonly %a) { +; CHECK: test_vld1_p8 +; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1) + ret <8 x i8> %vld1 +} + +define <4 x i16> @test_vld1_p16(i16* readonly %a) { +; CHECK: test_vld1_p16 +; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2) + ret <4 x i16> %vld1 +} + +define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) { +; CHECK: test_vld2q_s8 +; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] + %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1) + %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1 + ret %struct.int8x16x2_t %.fca.0.1.insert +} + +define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) { +; CHECK: test_vld2q_s16 +; CHECK: ld2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2) + %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1 + ret %struct.int16x8x2_t %.fca.0.1.insert +} + +define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) { +; CHECK: test_vld2q_s32 +; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4) + %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1 + ret %struct.int32x4x2_t %.fca.0.1.insert +} + +define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) { +; CHECK: test_vld2q_s64 +; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8) + %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1 + ret %struct.int64x2x2_t %.fca.0.1.insert +} + +define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) { +; CHECK: test_vld2q_f32 +; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4) + %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1 + ret %struct.float32x4x2_t %.fca.0.1.insert +} + +define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) { +; CHECK: test_vld2q_f64 +; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8) + %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1 + ret %struct.float64x2x2_t %.fca.0.1.insert +} + +define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) { +; CHECK: test_vld2_s8 +; CHECK: ld2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1) + %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1 + ret %struct.int8x8x2_t %.fca.0.1.insert +} + +define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) { +; CHECK: test_vld2_s16 +; CHECK: ld2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2) + %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1 + ret %struct.int16x4x2_t %.fca.0.1.insert +} + +define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) { +; CHECK: test_vld2_s32 +; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4) + %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1 + ret %struct.int32x2x2_t %.fca.0.1.insert +} + +define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) { +; CHECK: test_vld2_s64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, i32 8) + %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1 + ret %struct.int64x1x2_t %.fca.0.1.insert +} + +define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) { +; CHECK: test_vld2_f32 +; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4) + %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1 + ret %struct.float32x2x2_t %.fca.0.1.insert +} + +define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) { +; CHECK: test_vld2_f64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8) + %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1 + ret %struct.float64x1x2_t %.fca.0.1.insert +} + +define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) { +; CHECK: test_vld3q_s8 +; CHECK: ld3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] + %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1) + %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2 + ret %struct.int8x16x3_t %.fca.0.2.insert +} + +define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) { +; CHECK: test_vld3q_s16 +; CHECK: ld3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2) + %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2 + ret %struct.int16x8x3_t %.fca.0.2.insert +} + +define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) { +; CHECK: test_vld3q_s32 +; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4) + %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2 + ret %struct.int32x4x3_t %.fca.0.2.insert +} + +define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) { +; CHECK: test_vld3q_s64 +; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8) + %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2 + ret %struct.int64x2x3_t %.fca.0.2.insert +} + +define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) { +; CHECK: test_vld3q_f32 +; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4) + %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2 + ret %struct.float32x4x3_t %.fca.0.2.insert +} + +define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) { +; CHECK: test_vld3q_f64 +; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8) + %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2 + ret %struct.float64x2x3_t %.fca.0.2.insert +} + +define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) { +; CHECK: test_vld3_s8 +; CHECK: ld3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1) + %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2 + ret %struct.int8x8x3_t %.fca.0.2.insert +} + +define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) { +; CHECK: test_vld3_s16 +; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2) + %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2 + ret %struct.int16x4x3_t %.fca.0.2.insert +} + +define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) { +; CHECK: test_vld3_s32 +; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4) + %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2 + ret %struct.int32x2x3_t %.fca.0.2.insert +} + +define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) { +; CHECK: test_vld3_s64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8) + %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2 + ret %struct.int64x1x3_t %.fca.0.2.insert +} + +define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) { +; CHECK: test_vld3_f32 +; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4) + %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2 + ret %struct.float32x2x3_t %.fca.0.2.insert +} + +define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) { +; CHECK: test_vld3_f64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8) + %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2 + ret %struct.float64x1x3_t %.fca.0.2.insert +} + +define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) { +; CHECK: test_vld4q_s8 +; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] + %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1) + %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3 + ret %struct.int8x16x4_t %.fca.0.3.insert +} + +define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) { +; CHECK: test_vld4q_s16 +; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2) + %vld4.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3 + ret %struct.int16x8x4_t %.fca.0.3.insert +} + +define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) { +; CHECK: test_vld4q_s32 +; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4) + %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3 + ret %struct.int32x4x4_t %.fca.0.3.insert +} + +define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) { +; CHECK: test_vld4q_s64 +; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8) + %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3 + ret %struct.int64x2x4_t %.fca.0.3.insert +} + +define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) { +; CHECK: test_vld4q_f32 +; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4) + %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3 + ret %struct.float32x4x4_t %.fca.0.3.insert +} + +define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) { +; CHECK: test_vld4q_f64 +; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8) + %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3 + ret %struct.float64x2x4_t %.fca.0.3.insert +} + +define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) { +; CHECK: test_vld4_s8 +; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] + %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1) + %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3 + ret %struct.int8x8x4_t %.fca.0.3.insert +} + +define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) { +; CHECK: test_vld4_s16 +; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2) + %vld4.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3 + ret %struct.int16x4x4_t %.fca.0.3.insert +} + +define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) { +; CHECK: test_vld4_s32 +; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4) + %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3 + ret %struct.int32x2x4_t %.fca.0.3.insert +} + +define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) { +; CHECK: test_vld4_s64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8) + %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3 + ret %struct.int64x1x4_t %.fca.0.3.insert +} + +define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) { +; CHECK: test_vld4_f32 +; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8* %1, i32 4) + %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3 + ret %struct.float32x2x4_t %.fca.0.3.insert +} + +define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) { +; CHECK: test_vld4_f64 +; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8) + %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3 + ret %struct.float64x1x4_t %.fca.0.3.insert +} + +declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) +declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) +declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) +declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) +declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) +declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32) +declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32) +declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32) +declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) +declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32) +declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) +declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32) +declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32) +declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32) +declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32) +declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) +declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32) +declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32) +declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32) +declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32) +declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32) +declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32) +declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32) +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32) +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32) +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32) +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32) +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32) +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32) +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32) + +define void @test_vst1q_s8(i8* %a, <16 x i8> %b) { +; CHECK: test_vst1q_s8 +; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1) + ret void +} + +define void @test_vst1q_s16(i16* %a, <8 x i16> %b) { +; CHECK: test_vst1q_s16 +; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2) + ret void +} + +define void @test_vst1q_s32(i32* %a, <4 x i32> %b) { +; CHECK: test_vst1q_s32 +; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4) + ret void +} + +define void @test_vst1q_s64(i64* %a, <2 x i64> %b) { +; CHECK: test_vst1q_s64 +; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8) + ret void +} + +define void @test_vst1q_f32(float* %a, <4 x float> %b) { +; CHECK: test_vst1q_f32 +; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4) + ret void +} + +define void @test_vst1q_f64(double* %a, <2 x double> %b) { +; CHECK: test_vst1q_f64 +; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8) + ret void +} + +define void @test_vst1_s8(i8* %a, <8 x i8> %b) { +; CHECK: test_vst1_s8 +; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1) + ret void +} + +define void @test_vst1_s16(i16* %a, <4 x i16> %b) { +; CHECK: test_vst1_s16 +; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2) + ret void +} + +define void @test_vst1_s32(i32* %a, <2 x i32> %b) { +; CHECK: test_vst1_s32 +; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4) + ret void +} + +define void @test_vst1_s64(i64* %a, <1 x i64> %b) { +; CHECK: test_vst1_s64 +; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8) + ret void +} + +define void @test_vst1_f32(float* %a, <2 x float> %b) { +; CHECK: test_vst1_f32 +; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4) + ret void +} + +define void @test_vst1_f64(double* %a, <1 x double> %b) { +; CHECK: test_vst1_f64 +; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8) + ret void +} + +define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) { +; CHECK: test_vst2q_s8 +; CHECK: st2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1 + tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1) + ret void +} + +define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { +; CHECK: test_vst2q_s16 +; CHECK: st2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2) + ret void +} + +define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { +; CHECK: test_vst2q_s32 +; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4) + ret void +} + +define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { +; CHECK: test_vst2q_s64 +; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 8) + ret void +} + +define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) { +; CHECK: test_vst2q_f32 +; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4) + ret void +} + +define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) { +; CHECK: test_vst2q_f64 +; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8) + ret void +} + +define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { +; CHECK: test_vst2_s8 +; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1 + tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1) + ret void +} + +define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { +; CHECK: test_vst2_s16 +; CHECK: st2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2) + ret void +} + +define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { +; CHECK: test_vst2_s32 +; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4) + ret void +} + +define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { +; CHECK: test_vst2_s64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8) + ret void +} + +define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) { +; CHECK: test_vst2_f32 +; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4) + ret void +} + +define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) { +; CHECK: test_vst2_f64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8) + ret void +} + +define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) { +; CHECK: test_vst3q_s8 +; CHECK: st3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2 + tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1) + ret void +} + +define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { +; CHECK: test_vst3q_s16 +; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2) + ret void +} + +define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { +; CHECK: test_vst3q_s32 +; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4) + ret void +} + +define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { +; CHECK: test_vst3q_s64 +; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8) + ret void +} + +define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) { +; CHECK: test_vst3q_f32 +; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4) + ret void +} + +define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) { +; CHECK: test_vst3q_f64 +; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8) + ret void +} + +define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { +; CHECK: test_vst3_s8 +; CHECK: st3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2 + tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1) + ret void +} + +define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { +; CHECK: test_vst3_s16 +; CHECK: st3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2) + ret void +} + +define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { +; CHECK: test_vst3_s32 +; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4) + ret void +} + +define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { +; CHECK: test_vst3_s64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8) + ret void +} + +define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) { +; CHECK: test_vst3_f32 +; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4) + ret void +} + +define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) { +; CHECK: test_vst3_f64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8) + ret void +} + +define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) { +; CHECK: test_vst4q_s8 +; CHECK: st4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3 + tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1) + ret void +} + +define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { +; CHECK: test_vst4q_s16 +; CHECK: st4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2) + ret void +} + +define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { +; CHECK: test_vst4q_s32 +; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4) + ret void +} + +define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { +; CHECK: test_vst4q_s64 +; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8) + ret void +} + +define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) { +; CHECK: test_vst4q_f32 +; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4) + ret void +} + +define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) { +; CHECK: test_vst4q_f64 +; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8) + ret void +} + +define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { +; CHECK: test_vst4_s8 +; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3 + tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1) + ret void +} + +define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) { +; CHECK: test_vst4_s16 +; CHECK: st4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3 + %1 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2) + ret void +} + +define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) { +; CHECK: test_vst4_s32 +; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3 + %1 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4) + ret void +} + +define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { +; CHECK: test_vst4_s64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3 + %1 = bitcast i64* %a to i8* + tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8) + ret void +} + +define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) { +; CHECK: test_vst4_f32 +; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3 + %1 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4) + ret void +} + +define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) { +; CHECK: test_vst4_f64 +; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}] + %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2 + %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3 + %1 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8) + ret void +} + +declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) +declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) +declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) +declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32) +declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) +declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32) +declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) +declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32) +declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) +declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32) +declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) +declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32) +declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) +declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32) +declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) +declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32) +declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) +declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32) +declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) +declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32) +declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32) +declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32) +declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32) +declare void @llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32) +declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) +declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) +declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) +declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32) +declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) +declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32) +declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) +declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) +declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) +declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) +declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) +declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32) +declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) +declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) +declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) +declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32) +declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) +declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32) +declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) +declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) +declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) +declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) +declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) +declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32) \ No newline at end of file diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s index 9127ed840b5..086d4876e35 100644 --- a/test/MC/AArch64/neon-diagnostics.s +++ b/test/MC/AArch64/neon-diagnostics.s @@ -3880,3 +3880,224 @@ // CHECK-ERROR: error: invalid operand for instruction // CHECK-ERROR: frsqrts d8, s22, d18 // CHECK-ERROR: ^ + +//---------------------------------------------------------------------- +// Vector load/store multiple N-element structure (class SIMD lselem) +//---------------------------------------------------------------------- + ld1 {x3}, [x2] + ld1 {v4}, [x0] + ld1 {v32.16b}, [x0] + ld1 {v15.8h}, [x32] +// CHECK-ERROR: error: expected vector type register +// CHECK-ERROR: ld1 {x3}, [x2] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected vector type register +// CHECK-ERROR: ld1 {v4}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected vector type register +// CHECK-ERROR: ld1 {v32.16b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld1 {v15.8h}, [x32] +// CHECK-ERROR: ^ + + ld1 {v0.16b, v2.16b}, [x0] + ld1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0] + ld1 v0.8b, v1.8b}, [x0] + ld1 {v0.8h-v4.8h}, [x0] + ld1 {v1.8h-v1.8h}, [x0] + ld1 {v15.8h-v17.4h}, [x15] + ld1 {v0.8b-v2.8b, [x0] +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld1 {v0.16b, v2.16b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid number of vectors +// CHECK-ERROR: ld1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: '{' expected +// CHECK-ERROR: ld1 v0.8b, v1.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid number of vectors +// CHECK-ERROR: ld1 {v0.8h-v4.8h}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid number of vectors +// CHECK-ERROR: ld1 {v1.8h-v1.8h}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected the same vector layout +// CHECK-ERROR: ld1 {v15.8h-v17.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: '}' expected +// CHECK-ERROR: ld1 {v0.8b-v2.8b, [x0] +// CHECK-ERROR: ^ + + ld2 {v15.8h, v16.4h}, [x15] + ld2 {v0.8b, v2.8b}, [x0] + ld2 {v15.4h, v16.4h, v17.4h}, [x32] + ld2 {v15.8h-v16.4h}, [x15] + ld2 {v0.2d-v2.2d}, [x0] +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld2 {v15.8h, v16.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld2 {v0.8b, v2.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld2 {v15.4h, v16.4h, v17.4h}, [x32] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected the same vector layout +// CHECK-ERROR: ld2 {v15.8h-v16.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld2 {v0.2d-v2.2d}, [x0] +// CHECK-ERROR: ^ + + ld3 {v15.8h, v16.8h, v17.4h}, [x15] + ld3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0] + ld3 {v0.8b, v2.8b, v3.8b}, [x0] + ld3 {v15.8h-v17.4h}, [x15] + ld3 {v31.4s-v2.4s}, [sp] +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld3 {v15.8h, v16.8h, v17.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected vector type register +// CHECK-ERROR: ld3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld3 {v0.8b, v2.8b, v3.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected the same vector layout +// CHECK-ERROR: ld3 {v15.8h-v17.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld3 {v31.4s-v2.4s}, [sp] +// CHECK-ERROR: ^ + + ld4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15] + ld4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0] + ld4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31] + ld4 {v15.8h-v18.4h}, [x15] + ld4 {v31.2s-v1.2s}, [x31] +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid number of vectors +// CHECK-ERROR: ld4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected the same vector layout +// CHECK-ERROR: ld4 {v15.8h-v18.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld4 {v31.2s-v1.2s}, [x31] +// CHECK-ERROR: ^ + + st1 {x3}, [x2] + st1 {v4}, [x0] + st1 {v32.16b}, [x0] + st1 {v15.8h}, [x32] +// CHECK-ERROR: error: expected vector type register +// CHECK-ERROR: st1 {x3}, [x2] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected vector type register +// CHECK-ERROR: st1 {v4}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected vector type register +// CHECK-ERROR: st1 {v32.16b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st1 {v15.8h}, [x32] +// CHECK-ERROR: ^ + + st1 {v0.16b, v2.16b}, [x0] + st1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0] + st1 v0.8b, v1.8b}, [x0] + st1 {v0.8h-v4.8h}, [x0] + st1 {v1.8h-v1.8h}, [x0] + st1 {v15.8h-v17.4h}, [x15] + st1 {v0.8b-v2.8b, [x0] +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: st1 {v0.16b, v2.16b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid number of vectors +// CHECK-ERROR: st1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: '{' expected +// CHECK-ERROR: st1 v0.8b, v1.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid number of vectors +// CHECK-ERROR: st1 {v0.8h-v4.8h}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid number of vectors +// CHECK-ERROR: st1 {v1.8h-v1.8h}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected the same vector layout +// CHECK-ERROR: st1 {v15.8h-v17.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: '}' expected +// CHECK-ERROR: st1 {v0.8b-v2.8b, [x0] +// CHECK-ERROR: ^ + + st2 {v15.8h, v16.4h}, [x15] + st2 {v0.8b, v2.8b}, [x0] + st2 {v15.4h, v16.4h, v17.4h}, [x30] + st2 {v15.8h-v16.4h}, [x15] + st2 {v0.2d-v2.2d}, [x0] +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: st2 {v15.8h, v16.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: st2 {v0.8b, v2.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st2 {v15.4h, v16.4h, v17.4h}, [x30] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected the same vector layout +// CHECK-ERROR: st2 {v15.8h-v16.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st2 {v0.2d-v2.2d}, [x0] +// CHECK-ERROR: ^ + + st3 {v15.8h, v16.8h, v17.4h}, [x15] + st3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0] + st3 {v0.8b, v2.8b, v3.8b}, [x0] + st3 {v15.8h-v17.4h}, [x15] + st3 {v31.4s-v2.4s}, [sp] +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: st3 {v15.8h, v16.8h, v17.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected vector type register +// CHECK-ERROR: st3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: st3 {v0.8b, v2.8b, v3.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected the same vector layout +// CHECK-ERROR: st3 {v15.8h-v17.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st3 {v31.4s-v2.4s}, [sp] +// CHECK-ERROR: ^ + + st4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15] + st4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0] + st4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31] + st4 {v15.8h-v18.4h}, [x15] + st4 {v31.2s-v1.2s}, [x31] +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: st4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: st4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid number of vectors +// CHECK-ERROR: st4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected the same vector layout +// CHECK-ERROR: st4 {v15.8h-v18.4h}, [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st4 {v31.2s-v1.2s}, [x31] +// CHECK-ERROR: ^ diff --git a/test/MC/AArch64/neon-simd-ldst-multi-elem.s b/test/MC/AArch64/neon-simd-ldst-multi-elem.s new file mode 100644 index 00000000000..05fe4dac913 --- /dev/null +++ b/test/MC/AArch64/neon-simd-ldst-multi-elem.s @@ -0,0 +1,463 @@ +// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//------------------------------------------------------------------------------ +// Store multiple 1-element structures from one register +//------------------------------------------------------------------------------ + st1 {v0.16b}, [x0] + st1 {v15.8h}, [x15] + st1 {v31.4s}, [sp] + st1 {v0.2d}, [x0] + st1 {v0.8b}, [x0] + st1 {v15.4h}, [x15] + st1 {v31.2s}, [sp] + st1 {v0.1d}, [x0] +// CHECK: st1 {v0.16b}, [x0] // encoding: [0x00,0x70,0x00,0x4c] +// CHECK: st1 {v15.8h}, [x15] // encoding: [0xef,0x75,0x00,0x4c] +// CHECK: st1 {v31.4s}, [sp] // encoding: [0xff,0x7b,0x00,0x4c] +// CHECK: st1 {v0.2d}, [x0] // encoding: [0x00,0x7c,0x00,0x4c] +// CHECK: st1 {v0.8b}, [x0] // encoding: [0x00,0x70,0x00,0x0c] +// CHECK: st1 {v15.4h}, [x15] // encoding: [0xef,0x75,0x00,0x0c] +// CHECK: st1 {v31.2s}, [sp] // encoding: [0xff,0x7b,0x00,0x0c] +// CHECK: st1 {v0.1d}, [x0] // encoding: [0x00,0x7c,0x00,0x0c] + +//------------------------------------------------------------------------------ +// Store multiple 1-element structures from two consecutive registers +//------------------------------------------------------------------------------ + st1 {v0.16b, v1.16b}, [x0] + st1 {v15.8h, v16.8h}, [x15] + st1 {v31.4s, v0.4s}, [sp] + st1 {v0.2d, v1.2d}, [x0] + st1 {v0.8b, v1.8b}, [x0] + st1 {v15.4h, v16.4h}, [x15] + st1 {v31.2s, v0.2s}, [sp] + st1 {v0.1d, v1.1d}, [x0] +// CHECK: st1 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xa0,0x00,0x4c] +// CHECK: st1 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x00,0x4c] +// CHECK: st1 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xab,0x00,0x4c] +// CHECK: st1 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xac,0x00,0x4c] +// CHECK: st1 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xa0,0x00,0x0c] +// CHECK: st1 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x00,0x0c] +// CHECK: st1 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xab,0x00,0x0c] +// CHECK: st1 {v0.1d, v1.1d}, [x0] // encoding: [0x00,0xac,0x00,0x0c] + + st1 {v0.16b-v1.16b}, [x0] + st1 {v15.8h-v16.8h}, [x15] + st1 {v31.4s-v0.4s}, [sp] + st1 {v0.2d-v1.2d}, [x0] + st1 {v0.8b-v1.8b}, [x0] + st1 {v15.4h-v16.4h}, [x15] + st1 {v31.2s-v0.2s}, [sp] + st1 {v0.1d-v1.1d}, [x0] +// CHECK: st1 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xa0,0x00,0x4c] +// CHECK: st1 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x00,0x4c] +// CHECK: st1 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xab,0x00,0x4c] +// CHECK: st1 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xac,0x00,0x4c] +// CHECK: st1 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xa0,0x00,0x0c] +// CHECK: st1 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x00,0x0c] +// CHECK: st1 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xab,0x00,0x0c] +// CHECK: st1 {v0.1d, v1.1d}, [x0] // encoding: [0x00,0xac,0x00,0x0c] + +//------------------------------------------------------------------------------ +// Store multiple 1-element structures from three consecutive registers +//------------------------------------------------------------------------------ + st1 {v0.16b, v1.16b, v2.16b}, [x0] + st1 {v15.8h, v16.8h, v17.8h}, [x15] + st1 {v31.4s, v0.4s, v1.4s}, [sp] + st1 {v0.2d, v1.2d, v2.2d}, [x0] + st1 {v0.8b, v1.8b, v2.8b}, [x0] + st1 {v15.4h, v16.4h, v17.4h}, [x15] + st1 {v31.2s, v0.2s, v1.2s}, [sp] + st1 {v0.1d, v1.1d, v2.1d}, [x0] +// CHECK: st1 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x00,0x4c] +// CHECK: st1 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x00,0x4c] +// CHECK: st1 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x00,0x4c] +// CHECK: st1 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x00,0x4c] +// CHECK: st1 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x00,0x0c] +// CHECK: st1 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x00,0x0c] +// CHECK: st1 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x00,0x0c] +// CHECK: st1 {v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x00,0x0c] + + st1 {v0.16b-v2.16b}, [x0] + st1 {v15.8h-v17.8h}, [x15] + st1 {v31.4s-v1.4s}, [sp] + st1 {v0.2d-v2.2d}, [x0] + st1 {v0.8b-v2.8b}, [x0] + st1 {v15.4h-v17.4h}, [x15] + st1 {v31.2s-v1.2s}, [sp] + st1 {v0.1d-v2.1d}, [x0] +// CHECK: st1 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x00,0x4c] +// CHECK: st1 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x00,0x4c] +// CHECK: st1 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x00,0x4c] +// CHECK: st1 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x00,0x4c] +// CHECK: st1 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x00,0x0c] +// CHECK: st1 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x00,0x0c] +// CHECK: st1 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x00,0x0c] +// CHECK: st1 {v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x00,0x0c] + +//------------------------------------------------------------------------------ +// Store multiple 1-element structures from four consecutive registers +//------------------------------------------------------------------------------ + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] + st1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] + st1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] + st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] + st1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] + st1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] + st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0] +// CHECK: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x00,0x4c] +// CHECK: st1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x00,0x4c] +// CHECK: st1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x00,0x4c] +// CHECK: st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x00,0x4c] +// CHECK: st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x00,0x0c] +// CHECK: st1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x00,0x0c] +// CHECK: st1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x00,0x0c] +// CHECK: st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x00,0x0c] + + st1 {v0.16b-v3.16b}, [x0] + st1 {v15.8h-v18.8h}, [x15] + st1 {v31.4s-v2.4s}, [sp] + st1 {v0.2d-v3.2d}, [x0] + st1 {v0.8b-v3.8b}, [x0] + st1 {v15.4h-v18.4h}, [x15] + st1 {v31.2s-v2.2s}, [sp] + st1 {v0.1d-v3.1d}, [x0] +// CHECK: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x00,0x4c] +// CHECK: st1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x00,0x4c] +// CHECK: st1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x00,0x4c] +// CHECK: st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x00,0x4c] +// CHECK: st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x00,0x0c] +// CHECK: st1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x00,0x0c] +// CHECK: st1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x00,0x0c] +// CHECK: st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x00,0x0c] + +//------------------------------------------------------------------------------ +// Store multiple 2-element structures from two consecutive registers +//------------------------------------------------------------------------------ + st2 {v0.16b, v1.16b}, [x0] + st2 {v15.8h, v16.8h}, [x15] + st2 {v31.4s, v0.4s}, [sp] + st2 {v0.2d, v1.2d}, [x0] + st2 {v0.8b, v1.8b}, [x0] + st2 {v15.4h, v16.4h}, [x15] + st2 {v31.2s, v0.2s}, [sp] +// CHECK: st2 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0x80,0x00,0x4c] +// CHECK: st2 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x00,0x4c] +// CHECK: st2 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0x8b,0x00,0x4c] +// CHECK: st2 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0x8c,0x00,0x4c] +// CHECK: st2 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0x80,0x00,0x0c] +// CHECK: st2 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x00,0x0c] +// CHECK: st2 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0x8b,0x00,0x0c] + + st2 {v0.16b-v1.16b}, [x0] + st2 {v15.8h-v16.8h}, [x15] + st2 {v31.4s-v0.4s}, [sp] + st2 {v0.2d-v1.2d}, [x0] + st2 {v0.8b-v1.8b}, [x0] + st2 {v15.4h-v16.4h}, [x15] + st2 {v31.2s-v0.2s}, [sp] +// CHECK: st2 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0x80,0x00,0x4c] +// CHECK: st2 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x00,0x4c] +// CHECK: st2 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0x8b,0x00,0x4c] +// CHECK: st2 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0x8c,0x00,0x4c] +// CHECK: st2 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0x80,0x00,0x0c] +// CHECK: st2 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x00,0x0c] +// CHECK: st2 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0x8b,0x00,0x0c] + +//------------------------------------------------------------------------------ +// Store multiple 3-element structures from three consecutive registers +//------------------------------------------------------------------------------ + st3 {v0.16b, v1.16b, v2.16b}, [x0] + st3 {v15.8h, v16.8h, v17.8h}, [x15] + st3 {v31.4s, v0.4s, v1.4s}, [sp] + st3 {v0.2d, v1.2d, v2.2d}, [x0] + st3 {v0.8b, v1.8b, v2.8b}, [x0] + st3 {v15.4h, v16.4h, v17.4h}, [x15] + st3 {v31.2s, v0.2s, v1.2s}, [sp] +// CHECK: st3 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x00,0x4c] +// CHECK: st3 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x00,0x4c] +// CHECK: st3 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x00,0x4c] +// CHECK: st3 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x00,0x4c] +// CHECK: st3 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x00,0x0c] +// CHECK: st3 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x00,0x0c] +// CHECK: st3 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x00,0x0c] + + st3 {v0.16b-v2.16b}, [x0] + st3 {v15.8h-v17.8h}, [x15] + st3 {v31.4s-v1.4s}, [sp] + st3 {v0.2d-v2.2d}, [x0] + st3 {v0.8b-v2.8b}, [x0] + st3 {v15.4h-v17.4h}, [x15] + st3 {v31.2s-v1.2s}, [sp] +// CHECK: st3 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x00,0x4c] +// CHECK: st3 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x00,0x4c] +// CHECK: st3 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x00,0x4c] +// CHECK: st3 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x00,0x4c] +// CHECK: st3 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x00,0x0c] +// CHECK: st3 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x00,0x0c] +// CHECK: st3 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x00,0x0c] + +//------------------------------------------------------------------------------ +// Store multiple 4-element structures from four consecutive registers +//------------------------------------------------------------------------------ + st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] + st4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] + st4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] + st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] + st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] + st4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] + st4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] +// CHECK: st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x00,0x4c] +// CHECK: st4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x00,0x4c] +// CHECK: st4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x00,0x4c] +// CHECK: st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x00,0x4c] +// CHECK: st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x00,0x0c] +// CHECK: st4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x00,0x0c] +// CHECK: st4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x00,0x0c] + + st4 {v0.16b-v3.16b}, [x0] + st4 {v15.8h-v18.8h}, [x15] + st4 {v31.4s-v2.4s}, [sp] + st4 {v0.2d-v3.2d}, [x0] + st4 {v0.8b-v3.8b}, [x0] + st4 {v15.4h-v18.4h}, [x15] + st4 {v31.2s-v2.2s}, [sp] +// CHECK: st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x00,0x4c] +// CHECK: st4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x00,0x4c] +// CHECK: st4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x00,0x4c] +// CHECK: st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x00,0x4c] +// CHECK: st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x00,0x0c] +// CHECK: st4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x00,0x0c] +// CHECK: st4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x00,0x0c] + +//------------------------------------------------------------------------------ +// Load multiple 1-element structures to one register +//------------------------------------------------------------------------------ + ld1 {v0.16b}, [x0] + ld1 {v15.8h}, [x15] + ld1 {v31.4s}, [sp] + ld1 {v0.2d}, [x0] + ld1 {v0.8b}, [x0] + ld1 {v15.4h}, [x15] + ld1 {v31.2s}, [sp] + ld1 {v0.1d}, [x0] +// CHECK: ld1 {v0.16b}, [x0] // encoding: [0x00,0x70,0x40,0x4c] +// CHECK: ld1 {v15.8h}, [x15] // encoding: [0xef,0x75,0x40,0x4c] +// CHECK: ld1 {v31.4s}, [sp] // encoding: [0xff,0x7b,0x40,0x4c] +// CHECK: ld1 {v0.2d}, [x0] // encoding: [0x00,0x7c,0x40,0x4c] +// CHECK: ld1 {v0.8b}, [x0] // encoding: [0x00,0x70,0x40,0x0c] +// CHECK: ld1 {v15.4h}, [x15] // encoding: [0xef,0x75,0x40,0x0c] +// CHECK: ld1 {v31.2s}, [sp] // encoding: [0xff,0x7b,0x40,0x0c] +// CHECK: ld1 {v0.1d}, [x0] // encoding: [0x00,0x7c,0x40,0x0c] + +//------------------------------------------------------------------------------ +// Load multiple 1-element structures to two consecutive registers +//------------------------------------------------------------------------------ + ld1 {v0.16b, v1.16b}, [x0] + ld1 {v15.8h, v16.8h}, [x15] + ld1 {v31.4s, v0.4s}, [sp] + ld1 {v0.2d, v1.2d}, [x0] + ld1 {v0.8b, v1.8b}, [x0] + ld1 {v15.4h, v16.4h}, [x15] + ld1 {v31.2s, v0.2s}, [sp] + ld1 {v0.1d, v1.1d}, [x0] +// CHECK: ld1 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xa0,0x40,0x4c] +// CHECK: ld1 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x40,0x4c] +// CHECK: ld1 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xab,0x40,0x4c] +// CHECK: ld1 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xac,0x40,0x4c] +// CHECK: ld1 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xa0,0x40,0x0c] +// CHECK: ld1 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x40,0x0c] +// CHECK: ld1 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xab,0x40,0x0c] +// CHECK: ld1 {v0.1d, v1.1d}, [x0] // encoding: [0x00,0xac,0x40,0x0c] + + ld1 {v0.16b-v1.16b}, [x0] + ld1 {v15.8h-v16.8h}, [x15] + ld1 {v31.4s-v0.4s}, [sp] + ld1 {v0.2d-v1.2d}, [x0] + ld1 {v0.8b-v1.8b}, [x0] + ld1 {v15.4h-v16.4h}, [x15] + ld1 {v31.2s-v0.2s}, [sp] + ld1 {v0.1d-v1.1d}, [x0] +// CHECK: ld1 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xa0,0x40,0x4c] +// CHECK: ld1 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x40,0x4c] +// CHECK: ld1 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xab,0x40,0x4c] +// CHECK: ld1 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xac,0x40,0x4c] +// CHECK: ld1 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xa0,0x40,0x0c] +// CHECK: ld1 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x40,0x0c] +// CHECK: ld1 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xab,0x40,0x0c] +// CHECK: ld1 {v0.1d, v1.1d}, [x0] // encoding: [0x00,0xac,0x40,0x0c] + +//------------------------------------------------------------------------------ +// Load multiple 1-element structures to three consecutive registers +//------------------------------------------------------------------------------ + ld1 {v0.16b, v1.16b, v2.16b}, [x0] + ld1 {v15.8h, v16.8h, v17.8h}, [x15] + ld1 {v31.4s, v0.4s, v1.4s}, [sp] + ld1 {v0.2d, v1.2d, v2.2d}, [x0] + ld1 {v0.8b, v1.8b, v2.8b}, [x0] + ld1 {v15.4h, v16.4h, v17.4h}, [x15] + ld1 {v31.2s, v0.2s, v1.2s}, [sp] + ld1 {v0.1d, v1.1d, v2.1d}, [x0] +// CHECK: ld1 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x40,0x4c] +// CHECK: ld1 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x40,0x4c] +// CHECK: ld1 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x40,0x4c] +// CHECK: ld1 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x40,0x4c] +// CHECK: ld1 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x40,0x0c] +// CHECK: ld1 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x40,0x0c] +// CHECK: ld1 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x40,0x0c] +// CHECK: ld1 {v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x40,0x0c] + + ld1 {v0.16b-v2.16b}, [x0] + ld1 {v15.8h-v17.8h}, [x15] + ld1 {v31.4s-v1.4s}, [sp] + ld1 {v0.2d-v2.2d}, [x0] + ld1 {v0.8b-v2.8b}, [x0] + ld1 {v15.4h-v17.4h}, [x15] + ld1 {v31.2s-v1.2s}, [sp] + ld1 {v0.1d-v2.1d}, [x0] +// CHECK: ld1 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x40,0x4c] +// CHECK: ld1 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x40,0x4c] +// CHECK: ld1 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x40,0x4c] +// CHECK: ld1 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x40,0x4c] +// CHECK: ld1 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x40,0x0c] +// CHECK: ld1 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x40,0x0c] +// CHECK: ld1 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x40,0x0c] +// CHECK: ld1 {v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x40,0x0c] + +//------------------------------------------------------------------------------ +// Load multiple 1-element structures to four consecutive registers +//------------------------------------------------------------------------------ + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] + ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] + ld1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] + ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] + ld1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] + ld1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] + ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0] +// CHECK: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x40,0x4c] +// CHECK: ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x40,0x4c] +// CHECK: ld1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x40,0x4c] +// CHECK: ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x40,0x4c] +// CHECK: ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x40,0x0c] +// CHECK: ld1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x40,0x0c] +// CHECK: ld1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x40,0x0c] +// CHECK: ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x40,0x0c] + + ld1 {v0.16b-v3.16b}, [x0] + ld1 {v15.8h-v18.8h}, [x15] + ld1 {v31.4s-v2.4s}, [sp] + ld1 {v0.2d-v3.2d}, [x0] + ld1 {v0.8b-v3.8b}, [x0] + ld1 {v15.4h-v18.4h}, [x15] + ld1 {v31.2s-v2.2s}, [sp] + ld1 {v0.1d-v3.1d}, [x0] +// CHECK: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x40,0x4c] +// CHECK: ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x40,0x4c] +// CHECK: ld1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x40,0x4c] +// CHECK: ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x40,0x4c] +// CHECK: ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x40,0x0c] +// CHECK: ld1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x40,0x0c] +// CHECK: ld1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x40,0x0c] +// CHECK: ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x40,0x0c] + +//------------------------------------------------------------------------------ +// Load multiple 4-element structures to two consecutive registers +//------------------------------------------------------------------------------ + ld2 {v0.16b, v1.16b}, [x0] + ld2 {v15.8h, v16.8h}, [x15] + ld2 {v31.4s, v0.4s}, [sp] + ld2 {v0.2d, v1.2d}, [x0] + ld2 {v0.8b, v1.8b}, [x0] + ld2 {v15.4h, v16.4h}, [x15] + ld2 {v31.2s, v0.2s}, [sp] +// CHECK: ld2 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0x80,0x40,0x4c] +// CHECK: ld2 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x40,0x4c] +// CHECK: ld2 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0x8b,0x40,0x4c] +// CHECK: ld2 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0x8c,0x40,0x4c] +// CHECK: ld2 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0x80,0x40,0x0c] +// CHECK: ld2 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x40,0x0c] +// CHECK: ld2 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0x8b,0x40,0x0c] + + ld2 {v0.16b-v1.16b}, [x0] + ld2 {v15.8h-v16.8h}, [x15] + ld2 {v31.4s-v0.4s}, [sp] + ld2 {v0.2d-v1.2d}, [x0] + ld2 {v0.8b-v1.8b}, [x0] + ld2 {v15.4h-v16.4h}, [x15] + ld2 {v31.2s-v0.2s}, [sp] +// CHECK: ld2 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0x80,0x40,0x4c] +// CHECK: ld2 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x40,0x4c] +// CHECK: ld2 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0x8b,0x40,0x4c] +// CHECK: ld2 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0x8c,0x40,0x4c] +// CHECK: ld2 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0x80,0x40,0x0c] +// CHECK: ld2 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x40,0x0c] +// CHECK: ld2 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0x8b,0x40,0x0c] + +//------------------------------------------------------------------------------ +// Load multiple 3-element structures to three consecutive registers +//------------------------------------------------------------------------------ + ld3 {v0.16b, v1.16b, v2.16b}, [x0] + ld3 {v15.8h, v16.8h, v17.8h}, [x15] + ld3 {v31.4s, v0.4s, v1.4s}, [sp] + ld3 {v0.2d, v1.2d, v2.2d}, [x0] + ld3 {v0.8b, v1.8b, v2.8b}, [x0] + ld3 {v15.4h, v16.4h, v17.4h}, [x15] + ld3 {v31.2s, v0.2s, v1.2s}, [sp] +// CHECK: ld3 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x40,0x4c] +// CHECK: ld3 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x40,0x4c] +// CHECK: ld3 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x40,0x4c] +// CHECK: ld3 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x40,0x4c] +// CHECK: ld3 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x40,0x0c] +// CHECK: ld3 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x40,0x0c] +// CHECK: ld3 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x40,0x0c] + + ld3 {v0.16b-v2.16b}, [x0] + ld3 {v15.8h-v17.8h}, [x15] + ld3 {v31.4s-v1.4s}, [sp] + ld3 {v0.2d-v2.2d}, [x0] + ld3 {v0.8b-v2.8b}, [x0] + ld3 {v15.4h-v17.4h}, [x15] + ld3 {v31.2s-v1.2s}, [sp] +// CHECK: ld3 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x40,0x4c] +// CHECK: ld3 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x40,0x4c] +// CHECK: ld3 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x40,0x4c] +// CHECK: ld3 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x40,0x4c] +// CHECK: ld3 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x40,0x0c] +// CHECK: ld3 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x40,0x0c] +// CHECK: ld3 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x40,0x0c] + +//------------------------------------------------------------------------------ +// Load multiple 4-element structures to four consecutive registers +//------------------------------------------------------------------------------ + ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] + ld4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] + ld4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] + ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] + ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] + ld4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] + ld4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] +// CHECK: ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x40,0x4c] +// CHECK: ld4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x40,0x4c] +// CHECK: ld4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x40,0x4c] +// CHECK: ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x40,0x4c] +// CHECK: ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x40,0x0c] +// CHECK: ld4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x40,0x0c] +// CHECK: ld4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x40,0x0c] + + ld4 {v0.16b-v3.16b}, [x0] + ld4 {v15.8h-v18.8h}, [x15] + ld4 {v31.4s-v2.4s}, [sp] + ld4 {v0.2d-v3.2d}, [x0] + ld4 {v0.8b-v3.8b}, [x0] + ld4 {v15.4h-v18.4h}, [x15] + ld4 {v31.2s-v2.2s}, [sp] +// CHECK: ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x40,0x4c] +// CHECK: ld4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x40,0x4c] +// CHECK: ld4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x40,0x4c] +// CHECK: ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x40,0x4c] +// CHECK: ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x40,0x0c] +// CHECK: ld4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x40,0x0c] +// CHECK: ld4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x40,0x0c]