Add alignment arguments to all the NEON load/store intrinsics.

Update all the tests using those intrinsics and add support for auto-upgrading bitcode files with the old versions of the intrinsics.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112271 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-04 14:22:26 +00:00 · 2010-08-27 17:13:24 +00:00 · 2010-08-27 17:13:24 +00:00 · 7a9ef44b3b
commit 7a9ef44b3b
parent 660cab32fe
24 changed files with 436 additions and 323 deletions
--- a/include/llvm/IntrinsicsARM.td
+++ b/include/llvm/IntrinsicsARM.td
@ -339,62 +339,76 @@ def int_arm_neon_vtbx4 : Neon_Tbl6Arg_Intrinsic;
 let TargetPrefix = "arm" in {

  // De-interleaving vector loads from N-element structures.
+  // Source operands are the address and alignment.
  def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty],
-                                    [llvm_ptr_ty], [IntrReadArgMem]>;
+                                    [llvm_ptr_ty, llvm_i32_ty],
+                                    [IntrReadArgMem]>;
  def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                                    [llvm_ptr_ty], [IntrReadArgMem]>;
+                                    [llvm_ptr_ty, llvm_i32_ty],
+                                    [IntrReadArgMem]>;
  def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                                     LLVMMatchType<0>],
-                                    [llvm_ptr_ty], [IntrReadArgMem]>;
+                                    [llvm_ptr_ty, llvm_i32_ty],
+                                    [IntrReadArgMem]>;
  def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                                     LLVMMatchType<0>, LLVMMatchType<0>],
-                                    [llvm_ptr_ty], [IntrReadArgMem]>;
+                                    [llvm_ptr_ty, llvm_i32_ty],
+                                    [IntrReadArgMem]>;

  // Vector load N-element structure to one lane.
+  // Source operands are: the address, the N input vectors (since only one
+  // lane is assigned), the lane number, and the alignment.
  def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                                        [llvm_ptr_ty, LLVMMatchType<0>,
-                                         LLVMMatchType<0>, llvm_i32_ty],
-                                        [IntrReadArgMem]>;
+                                         LLVMMatchType<0>, llvm_i32_ty,
+                                         llvm_i32_ty], [IntrReadArgMem]>;
  def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                                         LLVMMatchType<0>],
                                        [llvm_ptr_ty, LLVMMatchType<0>,
                                         LLVMMatchType<0>, LLVMMatchType<0>,
-                                         llvm_i32_ty], [IntrReadArgMem]>;
+                                         llvm_i32_ty, llvm_i32_ty],
+                                        [IntrReadArgMem]>;
  def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                                         LLVMMatchType<0>, LLVMMatchType<0>],
                                        [llvm_ptr_ty, LLVMMatchType<0>,
                                         LLVMMatchType<0>, LLVMMatchType<0>,
-                                         LLVMMatchType<0>, llvm_i32_ty],
-                                        [IntrReadArgMem]>;
+                                         LLVMMatchType<0>, llvm_i32_ty,
+                                         llvm_i32_ty], [IntrReadArgMem]>;

  // Interleaving vector stores from N-element structures.
+  // Source operands are: the address, the N vectors, and the alignment.
  def int_arm_neon_vst1 : Intrinsic<[],
-                                    [llvm_ptr_ty, llvm_anyvector_ty],
-                                    [IntrReadWriteArgMem]>;
+                                    [llvm_ptr_ty, llvm_anyvector_ty,
+                                     llvm_i32_ty], [IntrReadWriteArgMem]>;
  def int_arm_neon_vst2 : Intrinsic<[],
                                    [llvm_ptr_ty, llvm_anyvector_ty,
-                                     LLVMMatchType<0>], [IntrReadWriteArgMem]>;
+                                     LLVMMatchType<0>, llvm_i32_ty],
+                                    [IntrReadWriteArgMem]>;
  def int_arm_neon_vst3 : Intrinsic<[],
                                    [llvm_ptr_ty, llvm_anyvector_ty,
-                                     LLVMMatchType<0>, LLVMMatchType<0>],
-                                     [IntrReadWriteArgMem]>;
+                                     LLVMMatchType<0>, LLVMMatchType<0>,
+                                     llvm_i32_ty], [IntrReadWriteArgMem]>;
  def int_arm_neon_vst4 : Intrinsic<[],
                                    [llvm_ptr_ty, llvm_anyvector_ty,
                                     LLVMMatchType<0>, LLVMMatchType<0>,
-                                     LLVMMatchType<0>], [IntrReadWriteArgMem]>;
+                                     LLVMMatchType<0>, llvm_i32_ty],
+                                    [IntrReadWriteArgMem]>;

  // Vector store N-element structure from one lane.
+  // Source operands are: the address, the N vectors, the lane number, and
+  // the alignment.
  def int_arm_neon_vst2lane : Intrinsic<[],
                                        [llvm_ptr_ty, llvm_anyvector_ty,
-                                         LLVMMatchType<0>, llvm_i32_ty],
-                                        [IntrReadWriteArgMem]>;
+                                         LLVMMatchType<0>, llvm_i32_ty,
+                                         llvm_i32_ty], [IntrReadWriteArgMem]>;
  def int_arm_neon_vst3lane : Intrinsic<[],
                                        [llvm_ptr_ty, llvm_anyvector_ty,
                                         LLVMMatchType<0>, LLVMMatchType<0>,
-                                         llvm_i32_ty], [IntrReadWriteArgMem]>;
+                                         llvm_i32_ty, llvm_i32_ty],
+                                        [IntrReadWriteArgMem]>;
  def int_arm_neon_vst4lane : Intrinsic<[],
                                        [llvm_ptr_ty, llvm_anyvector_ty,
                                         LLVMMatchType<0>, LLVMMatchType<0>,
-                                         LLVMMatchType<0>, llvm_i32_ty],
-                                        [IntrReadWriteArgMem]>;
+                                         LLVMMatchType<0>, llvm_i32_ty,
+                                         llvm_i32_ty], [IntrReadWriteArgMem]>;
 }
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@ -85,6 +85,39 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
        NewFn = 0;
        return true;
      }
+      // Old versions of NEON ld/st intrinsics are missing alignment arguments.
+      bool isVLd = (Name.compare(14, 3, "vld", 3) == 0);
+      bool isVSt = (Name.compare(14, 3, "vst", 3) == 0);
+      if (isVLd || isVSt) {
+        unsigned NumVecs = Name.at(17) - '0';
+        if (NumVecs == 0 || NumVecs > 4)
+          return false;
+        bool isLaneOp = (Name.compare(18, 5, "lane.", 5) == 0);
+        if (!isLaneOp && Name.at(18) != '.')
+          return false;
+        unsigned ExpectedArgs = 2; // for the address and alignment
+        if (isVSt || isLaneOp)
+          ExpectedArgs += NumVecs;
+        if (isLaneOp)
+          ExpectedArgs += 1; // for the lane number
+        unsigned NumP = FTy->getNumParams();
+        if (NumP != ExpectedArgs - 1)
+          return false;
+
+        // Change the name of the old (bad) intrinsic, because 
+        // its type is incorrect, but we cannot overload that name.
+        F->setName("");
+
+        // One argument is missing: add the alignment argument.
+        std::vector<const Type*> NewParams;
+        for (unsigned p = 0; p < NumP; ++p)
+          NewParams.push_back(FTy->getParamType(p));
+        NewParams.push_back(Type::getInt32Ty(F->getContext()));
+        FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(),
+                                                 NewParams, false);
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, NewFTy));
+        return true;
+      }
    }
    break;
  case 'b':
@ -189,7 +222,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
        NewFnName = "llvm.memset.p0i8.i64";
    }
    if (NewFnName) {
-      const FunctionType *FTy = F->getFunctionType();
      NewFn = cast<Function>(M->getOrInsertFunction(NewFnName, 
                                            FTy->getReturnType(),
                                            FTy->getParamType(0),
@ -578,6 +610,39 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {

  switch (NewFn->getIntrinsicID()) {
  default:  llvm_unreachable("Unknown function for CallInst upgrade.");
+  case Intrinsic::arm_neon_vld1:
+  case Intrinsic::arm_neon_vld2:
+  case Intrinsic::arm_neon_vld3:
+  case Intrinsic::arm_neon_vld4:
+  case Intrinsic::arm_neon_vst1:
+  case Intrinsic::arm_neon_vst2:
+  case Intrinsic::arm_neon_vst3:
+  case Intrinsic::arm_neon_vst4:
+  case Intrinsic::arm_neon_vld2lane:
+  case Intrinsic::arm_neon_vld3lane:
+  case Intrinsic::arm_neon_vld4lane:
+  case Intrinsic::arm_neon_vst2lane:
+  case Intrinsic::arm_neon_vst3lane:
+  case Intrinsic::arm_neon_vst4lane: {
+    // Add a default alignment argument of 1.
+    SmallVector<Value*, 8> Operands(CS.arg_begin(), CS.arg_end());
+    Operands.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
+    CallInst *NewCI = CallInst::Create(NewFn, Operands.begin(), Operands.end(),
+                                       CI->getName(), CI);
+    NewCI->setTailCall(CI->isTailCall());
+    NewCI->setCallingConv(CI->getCallingConv());
+
+    //  Handle any uses of the old CallInst.
+    if (!CI->use_empty())
+      //  Replace all uses of the old call with the new call, which targets
+      //  the upgraded intrinsic and carries the added alignment argument.
+      CI->replaceAllUsesWith(NewCI);
+    
+    //  Clean up the old call now that it has been completely upgraded.
+    CI->eraseFromParent();
+    break;
+  }        
+
  case Intrinsic::x86_mmx_psll_d:
  case Intrinsic::x86_mmx_psll_q:
  case Intrinsic::x86_mmx_psll_w:
--- a/test/Bitcode/neon-intrinsics.ll
+++ b/test/Bitcode/neon-intrinsics.ll
@ -27,3 +27,37 @@
 ; CHECK: vmovlu32
 ; CHECK-NOT: arm.neon.vmovlu.v2i64
 ; CHECK: zext <2 x i32>
+
+; vld* and vst* intrinsic calls need an alignment argument (defaulted to 1)
+
+; CHECK: vld1i8
+; CHECK: i32 1
+; CHECK: vld2Qi16
+; CHECK: i32 1
+; CHECK: vld3i32
+; CHECK: i32 1
+; CHECK: vld4Qf
+; CHECK: i32 1
+
+; CHECK: vst1i8
+; CHECK: i32 1
+; CHECK: vst2Qi16
+; CHECK: i32 1
+; CHECK: vst3i32
+; CHECK: i32 1
+; CHECK: vst4Qf
+; CHECK: i32 1
+
+; CHECK: vld2laneQi16
+; CHECK: i32 1
+; CHECK: vld3lanei32
+; CHECK: i32 1
+; CHECK: vld4laneQf
+; CHECK: i32 1
+
+; CHECK: vst2laneQi16
+; CHECK: i32 1
+; CHECK: vst3lanei32
+; CHECK: i32 1
+; CHECK: vst4laneQf
+; CHECK: i32 1
--- a/test/Bitcode/neon-intrinsics.ll.bc
+++ b/test/Bitcode/neon-intrinsics.ll.bc
--- a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
+++ b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
@ -5,32 +5,32 @@

 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }

-declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly

-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind

 define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
-  %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
  %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1]
  %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
  %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
  %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
  %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1]
-  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
  %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
-  %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
  %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1]
  %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
  %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1]
  %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1]
  %tmp2bd = add <8 x i8> %tmp2b, %tmp2d           ; <<8 x i8>> [#uses=1]
  %tmp4bd = add <8 x i8> %tmp4b, %tmp4d           ; <<8 x i8>> [#uses=1]
  %tmp2abcd = mul <8 x i8> undef, %tmp2bd         ; <<8 x i8>> [#uses=1]
  %tmp4abcd = mul <8 x i8> undef, %tmp4bd         ; <<8 x i8>> [#uses=2]
-  call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd)
+  call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
  %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f           ; <<8 x i8>> [#uses=1]
  %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h           ; <<8 x i8>> [#uses=1]
  %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h  ; <<8 x i8>> [#uses=1]
@ -38,8 +38,8 @@ define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A
  %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh       ; <<8 x i8>> [#uses=1]
  %tmp3efgh = mul <8 x i8> undef, %tmp3gh         ; <<8 x i8>> [#uses=1]
  %tmp4efgh = mul <8 x i8> %tmp4ef, undef         ; <<8 x i8>> [#uses=2]
-  call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh)
+  call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
  %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd       ; <<8 x i8>> [#uses=1]
-  tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef)
+  tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
  ret <8 x i8> %tmp4
 }
--- a/test/CodeGen/ARM/2010-05-21-BuildVector.ll
+++ b/test/CodeGen/ARM/2010-05-21-BuildVector.ll
@ -36,8 +36,8 @@ entry:
  %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3
  %19 = fmul <4 x float> %tmp5, %2
  %20 = bitcast float* %fltp to i8*
-  tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19)
+  tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19, i32 1)
  ret void
 }

-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
--- a/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
+++ b/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
@ -12,8 +12,8 @@ entry:
 %tmp9 = trunc i128 %tmp8 to i64                 ; <i64> [#uses=1]
 %tmp16.i = bitcast i64 %tmp6 to <8 x i8>        ; <<8 x i8>> [#uses=1]
 %tmp20.i = bitcast i64 %tmp9 to <8 x i8>        ; <<8 x i8>> [#uses=1]
- tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i) nounwind
+ tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
 ret void
 }

-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>) nounwind
+declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
--- a/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
+++ b/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
@ -16,10 +16,10 @@ target triple = "thumbv7-apple-darwin10"

 define i32 @test(i8* %arg) nounwind {
 entry:
- %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg)
+ %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg, i32 1)
 %1 = shufflevector <2 x i64> undef, <2 x i64> %0, <2 x i32> <i32 1, i32 2>
 store <2 x i64> %1, <2 x i64>* undef, align 16
 ret i32 undef
 }

-declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*) nounwind readonly
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@ -23,7 +23,7 @@ entry:
  %2 = getelementptr inbounds %struct.int32x4_t* %vT1ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1]
  %3 = load <4 x i32>* %2, align 16               ; <<4 x i32>> [#uses=1]
  %4 = bitcast i16* %i_ptr to i8*                 ; <i8*> [#uses=1]
-  %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4) ; <<8 x i16>> [#uses=1]
+  %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4, i32 1) ; <<8 x i16>> [#uses=1]
  %6 = bitcast <8 x i16> %5 to <2 x double>       ; <<2 x double>> [#uses=2]
  %7 = extractelement <2 x double> %6, i32 0      ; <double> [#uses=1]
  %8 = bitcast double %7 to <4 x i16>             ; <<4 x i16>> [#uses=1]
@ -37,7 +37,7 @@ entry:
  %16 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %14, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
  %17 = shufflevector <4 x i16> %15, <4 x i16> %16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
  %18 = bitcast i16* %o_ptr to i8*                ; <i8*> [#uses=1]
-  tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17)
+  tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17, i32 1)
  ret void
 }

@ -57,17 +57,17 @@ entry:
  %2 = getelementptr inbounds %struct.int16x8_t* %vT1ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1]
  %3 = load <8 x i16>* %2, align 16               ; <<8 x i16>> [#uses=1]
  %4 = bitcast i16* %i_ptr to i8*                 ; <i8*> [#uses=1]
-  %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4) ; <<8 x i16>> [#uses=1]
+  %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4, i32 1) ; <<8 x i16>> [#uses=1]
  %6 = getelementptr inbounds i16* %i_ptr, i32 8  ; <i16*> [#uses=1]
  %7 = bitcast i16* %6 to i8*                     ; <i8*> [#uses=1]
-  %8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %7) ; <<8 x i16>> [#uses=1]
+  %8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %7, i32 1) ; <<8 x i16>> [#uses=1]
  %9 = mul <8 x i16> %1, %5                       ; <<8 x i16>> [#uses=1]
  %10 = mul <8 x i16> %3, %8                      ; <<8 x i16>> [#uses=1]
  %11 = bitcast i16* %o_ptr to i8*                ; <i8*> [#uses=1]
-  tail call void @llvm.arm.neon.vst1.v8i16(i8* %11, <8 x i16> %9)
+  tail call void @llvm.arm.neon.vst1.v8i16(i8* %11, <8 x i16> %9, i32 1)
  %12 = getelementptr inbounds i16* %o_ptr, i32 8 ; <i16*> [#uses=1]
  %13 = bitcast i16* %12 to i8*                   ; <i8*> [#uses=1]
-  tail call void @llvm.arm.neon.vst1.v8i16(i8* %13, <8 x i16> %10)
+  tail call void @llvm.arm.neon.vst1.v8i16(i8* %13, <8 x i16> %10, i32 1)
  ret void
 }

@ -77,14 +77,14 @@ define <8 x i8> @t3(i8* %A, i8* %B) nounwind {
 ; CHECK:        vmul.i8
 ; CHECK-NOT:    vmov
 ; CHECK:        vst3.8
-  %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
  %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0 ; <<8 x i8>> [#uses=1]
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2 ; <<8 x i8>> [#uses=1]
  %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 1 ; <<8 x i8>> [#uses=1]
  %tmp5 = sub <8 x i8> %tmp3, %tmp4
  %tmp6 = add <8 x i8> %tmp2, %tmp3               ; <<8 x i8>> [#uses=1]
  %tmp7 = mul <8 x i8> %tmp4, %tmp2
-  tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7)
+  tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7, i32 1)
  ret <8 x i8> %tmp4
 }

@ -97,10 +97,10 @@ entry:
 ; CHECK-NOT:    vmov
 ; CHECK:        bne
  %tmp1 = bitcast i32* %in to i8*                 ; <i8*> [#uses=1]
-  %tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+  %tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp1, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
  %tmp3 = getelementptr inbounds i32* %in, i32 8  ; <i32*> [#uses=1]
  %tmp4 = bitcast i32* %tmp3 to i8*               ; <i8*> [#uses=1]
-  %tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp4) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+  %tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp4, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
  %tmp8 = bitcast i32* %out to i8*                ; <i8*> [#uses=1]
  br i1 undef, label %return1, label %return2

@ -116,7 +116,7 @@ return1:
  %tmp39 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
  %tmp6 = add <4 x i32> %tmp52, %tmp              ; <<4 x i32>> [#uses=1]
  %tmp7 = add <4 x i32> %tmp57, %tmp39            ; <<4 x i32>> [#uses=1]
-  tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7)
+  tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7, i32 1)
  ret void

 return2:
@ -128,7 +128,7 @@ return2:
  %tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
  %tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
  %tmp102 = add <4 x i32> %tmp100, %tmp101              ; <<4 x i32>> [#uses=1]
-  tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101)
+  tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101, i32 1)
  call void @llvm.trap()
  unreachable
 }
@ -143,7 +143,7 @@ define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
 ; CHECK:        vadd.i16
  %tmp0 = bitcast i16* %A to i8*                  ; <i8*> [#uses=1]
  %tmp1 = load <8 x i16>* %B                      ; <<8 x i16>> [#uses=2]
-  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2]
+  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2]
  %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0 ; <<8 x i16>> [#uses=1]
  %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1 ; <<8 x i16>> [#uses=1]
  %tmp5 = add <8 x i16> %tmp3, %tmp4              ; <<8 x i16>> [#uses=1]
@ -156,7 +156,7 @@ define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
 ; CHECK:        vmov d1, d0
 ; CHECK-NEXT:   vld2.8 {d0[1], d1[1]}
  %tmp1 = load <8 x i8>* %B                       ; <<8 x i8>> [#uses=2]
-  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
+  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1 ; <<8 x i8>> [#uses=1]
  %tmp5 = add <8 x i8> %tmp3, %tmp4               ; <<8 x i8>> [#uses=1]
@ -174,14 +174,14 @@ entry:
 ; CHECK:        vuzp.32 q0, q1
 ; CHECK:        vst1.32
  %0 = bitcast i32* %iptr to i8*                  ; <i8*> [#uses=2]
-  %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+  %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
  %tmp57 = extractvalue %struct.__neon_int32x4x2_t %1, 0 ; <<4 x i32>> [#uses=1]
  %tmp60 = extractvalue %struct.__neon_int32x4x2_t %1, 1 ; <<4 x i32>> [#uses=1]
  %2 = bitcast i32* %optr to i8*                  ; <i8*> [#uses=2]
-  tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60)
-  %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0) ; <<4 x i32>> [#uses=1]
+  tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60, i32 1)
+  %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0, i32 1) ; <<4 x i32>> [#uses=1]
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> ; <<4 x i32>> [#uses=1]
-  tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4)
+  tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4, i32 1)
  ret void
 }

@ -304,42 +304,43 @@ bb14:                                             ; preds = %bb6

 ; This test crashes the coalescer because live variables were not updated properly.
 define <8 x i8> @t11(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
-  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
  %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
-  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
  %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
  %tmp2bd = add <8 x i8> zeroinitializer, %tmp2d  ; <<8 x i8>> [#uses=1]
  %tmp2abcd = mul <8 x i8> zeroinitializer, %tmp2bd ; <<8 x i8>> [#uses=1]
  %tmp2ef = sub <8 x i8> zeroinitializer, %tmp2f  ; <<8 x i8>> [#uses=1]
  %tmp2efgh = mul <8 x i8> %tmp2ef, undef         ; <<8 x i8>> [#uses=2]
-  call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh)
+  call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh, i32 1)
  %tmp2 = sub <8 x i8> %tmp2efgh, %tmp2abcd       ; <<8 x i8>> [#uses=1]
  %tmp7 = mul <8 x i8> undef, %tmp2               ; <<8 x i8>> [#uses=1]
-  tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7)
+  tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7, i32 1)
  ret <8 x i8> undef
 }

-declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) nounwind readonly

-declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly

 declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone

-declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) nounwind

-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind

-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+nounwind

-declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly

-declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*, i32) nounwind readonly

-declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind readonly
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly

-declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly

-declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>) nounwind
+declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind

 declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone

--- a/test/CodeGen/ARM/spill-q.ll
+++ b/test/CodeGen/ARM/spill-q.ll
@ -7,7 +7,7 @@
 %quux = type { i32 (...)**, %baz*, i32 }
 %quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo }

-declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly

 define void @aaa(%quuz* %this, i8* %block) {
 ; CHECK: aaa:
@ -15,11 +15,11 @@ define void @aaa(%quuz* %this, i8* %block) {
 ; CHECK: vst1.64 {{.*}}sp, :128
 ; CHECK: vld1.64 {{.*}}sp, :128
 entry:
-  %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+  %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
  store float 6.300000e+01, float* undef, align 4
-  %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+  %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
  store float 0.000000e+00, float* undef, align 4
-  %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+  %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
  %val173 = load <4 x float>* undef               ; <<4 x float>> [#uses=1]
  br label %bb4

--- a/test/CodeGen/ARM/vld1.ll
+++ b/test/CodeGen/ARM/vld1.ll
@ -3,7 +3,7 @@
 define <8 x i8> @vld1i8(i8* %A) nounwind {
 ;CHECK: vld1i8:
 ;CHECK: vld1.8
-	%tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A)
+	%tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A, i32 1)
 	ret <8 x i8> %tmp1
 }

@ -11,7 +11,7 @@ define <4 x i16> @vld1i16(i16* %A) nounwind {
 ;CHECK: vld1i16:
 ;CHECK: vld1.16
 	%tmp0 = bitcast i16* %A to i8*
-	%tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0)
+	%tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 1)
 	ret <4 x i16> %tmp1
 }

@ -19,7 +19,7 @@ define <2 x i32> @vld1i32(i32* %A) nounwind {
 ;CHECK: vld1i32:
 ;CHECK: vld1.32
 	%tmp0 = bitcast i32* %A to i8*
-	%tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0)
+	%tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 1)
 	ret <2 x i32> %tmp1
 }

@ -27,7 +27,7 @@ define <2 x float> @vld1f(float* %A) nounwind {
 ;CHECK: vld1f:
 ;CHECK: vld1.32
 	%tmp0 = bitcast float* %A to i8*
-	%tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %tmp0)
+	%tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %tmp0, i32 1)
 	ret <2 x float> %tmp1
 }

@ -35,14 +35,14 @@ define <1 x i64> @vld1i64(i64* %A) nounwind {
 ;CHECK: vld1i64:
 ;CHECK: vld1.64
 	%tmp0 = bitcast i64* %A to i8*
-	%tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %tmp0)
+	%tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %tmp0, i32 1)
 	ret <1 x i64> %tmp1
 }

 define <16 x i8> @vld1Qi8(i8* %A) nounwind {
 ;CHECK: vld1Qi8:
 ;CHECK: vld1.8
-	%tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A)
+	%tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 1) ; trailing i32 1 is the new alignment operand
 	ret <16 x i8> %tmp1
 }

@ -50,7 +50,7 @@ define <8 x i16> @vld1Qi16(i16* %A) nounwind {
 ;CHECK: vld1Qi16:
 ;CHECK: vld1.16
 	%tmp0 = bitcast i16* %A to i8*
-	%tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0)
+	%tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0, i32 1)
 	ret <8 x i16> %tmp1
 }

@ -58,7 +58,7 @@ define <4 x i32> @vld1Qi32(i32* %A) nounwind {
 ;CHECK: vld1Qi32:
 ;CHECK: vld1.32
 	%tmp0 = bitcast i32* %A to i8*
-	%tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %tmp0)
+	%tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %tmp0, i32 1)
 	ret <4 x i32> %tmp1
 }

@ -66,7 +66,7 @@ define <4 x float> @vld1Qf(float* %A) nounwind {
 ;CHECK: vld1Qf:
 ;CHECK: vld1.32
 	%tmp0 = bitcast float* %A to i8*
-	%tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %tmp0)
+	%tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %tmp0, i32 1)
 	ret <4 x float> %tmp1
 }

@ -74,21 +74,21 @@ define <2 x i64> @vld1Qi64(i64* %A) nounwind {
 ;CHECK: vld1Qi64:
 ;CHECK: vld1.64
 	%tmp0 = bitcast i64* %A to i8*
-	%tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %tmp0)
+	%tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %tmp0, i32 1)
 	ret <2 x i64> %tmp1
 }

-declare <8 x i8>  @llvm.arm.neon.vld1.v8i8(i8*) nounwind readonly
-declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*) nounwind readonly
-declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*) nounwind readonly
-declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*) nounwind readonly
-declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*) nounwind readonly
+declare <8 x i8>  @llvm.arm.neon.vld1.v8i8(i8*, i32) nounwind readonly ; 64-bit vector forms of vld1: operands are (address, alignment)
+declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32) nounwind readonly
+declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) nounwind readonly
+declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly
+declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32) nounwind readonly

-declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*) nounwind readonly
-declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly
-declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly
-declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly
-declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*) nounwind readonly
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly ; 128-bit vector forms of vld1: operands are (address, alignment)
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) nounwind readonly
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly

 ; Radar 8355607
 ; Do not crash if the vld1 result is not used.
@ -96,10 +96,9 @@ define void @unused_vld1_result() {
 entry:
 ;CHECK: unused_vld1_result
 ;CHECK: vld1.32
-  %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) 
+  %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) 
  call void @llvm.trap()
  unreachable
 }

 declare void @llvm.trap() nounwind
-
--- a/test/CodeGen/ARM/vld2.ll
+++ b/test/CodeGen/ARM/vld2.ll
@ -14,7 +14,7 @@
 define <8 x i8> @vld2i8(i8* %A) nounwind {
 ;CHECK: vld2i8:
 ;CHECK: vld2.8
-	%tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8* %A)
+	%tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8* %A, i32 1)
        %tmp2 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 1
        %tmp4 = add <8 x i8> %tmp2, %tmp3
@ -25,7 +25,7 @@ define <4 x i16> @vld2i16(i16* %A) nounwind {
 ;CHECK: vld2i16:
 ;CHECK: vld2.16
 	%tmp0 = bitcast i16* %A to i8*
-	%tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 1
        %tmp4 = add <4 x i16> %tmp2, %tmp3
@ -36,7 +36,7 @@ define <2 x i32> @vld2i32(i32* %A) nounwind {
 ;CHECK: vld2i32:
 ;CHECK: vld2.32
 	%tmp0 = bitcast i32* %A to i8*
-	%tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 1
        %tmp4 = add <2 x i32> %tmp2, %tmp3
@ -47,7 +47,7 @@ define <2 x float> @vld2f(float* %A) nounwind {
 ;CHECK: vld2f:
 ;CHECK: vld2.32
 	%tmp0 = bitcast float* %A to i8*
-	%tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1
        %tmp4 = fadd <2 x float> %tmp2, %tmp3
@ -58,7 +58,7 @@ define <1 x i64> @vld2i64(i64* %A) nounwind {
 ;CHECK: vld2i64:
 ;CHECK: vld1.64
 	%tmp0 = bitcast i64* %A to i8*
-	%tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 1
        %tmp4 = add <1 x i64> %tmp2, %tmp3
@ -68,7 +68,7 @@ define <1 x i64> @vld2i64(i64* %A) nounwind {
 define <16 x i8> @vld2Qi8(i8* %A) nounwind {
 ;CHECK: vld2Qi8:
 ;CHECK: vld2.8
-	%tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A)
+	%tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 1)
        %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
        %tmp4 = add <16 x i8> %tmp2, %tmp3
@ -79,7 +79,7 @@ define <8 x i16> @vld2Qi16(i16* %A) nounwind {
 ;CHECK: vld2Qi16:
 ;CHECK: vld2.16
 	%tmp0 = bitcast i16* %A to i8*
-	%tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 1
        %tmp4 = add <8 x i16> %tmp2, %tmp3
@ -90,7 +90,7 @@ define <4 x i32> @vld2Qi32(i32* %A) nounwind {
 ;CHECK: vld2Qi32:
 ;CHECK: vld2.32
 	%tmp0 = bitcast i32* %A to i8*
-	%tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 1
        %tmp4 = add <4 x i32> %tmp2, %tmp3
@ -101,20 +101,20 @@ define <4 x float> @vld2Qf(float* %A) nounwind {
 ;CHECK: vld2Qf:
 ;CHECK: vld2.32
 	%tmp0 = bitcast float* %A to i8*
-	%tmp1 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 1
        %tmp4 = fadd <4 x float> %tmp2, %tmp3
 	ret <4 x float> %tmp4
 }

-declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8*) nounwind readonly
-declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8*) nounwind readonly
-declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8*) nounwind readonly
-declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8*) nounwind readonly
-declare %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8*) nounwind readonly
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8*, i32) nounwind readonly ; 64-bit vector forms of vld2: operands are (address, alignment)
+declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8*, i32) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8*, i32) nounwind readonly

-declare %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8*) nounwind readonly
-declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8*) nounwind readonly
-declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*) nounwind readonly
-declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8*) nounwind readonly
+declare %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8*, i32) nounwind readonly ; 128-bit vector forms of vld2: operands are (address, alignment)
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly
--- a/test/CodeGen/ARM/vld3.ll
+++ b/test/CodeGen/ARM/vld3.ll
@ -14,7 +14,7 @@
 define <8 x i8> @vld3i8(i8* %A) nounwind {
 ;CHECK: vld3i8:
 ;CHECK: vld3.8
-	%tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A)
+	%tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 1)
        %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2
        %tmp4 = add <8 x i8> %tmp2, %tmp3
@ -25,7 +25,7 @@ define <4 x i16> @vld3i16(i16* %A) nounwind {
 ;CHECK: vld3i16:
 ;CHECK: vld3.16
 	%tmp0 = bitcast i16* %A to i8*
-	%tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2
        %tmp4 = add <4 x i16> %tmp2, %tmp3
@ -36,7 +36,7 @@ define <2 x i32> @vld3i32(i32* %A) nounwind {
 ;CHECK: vld3i32:
 ;CHECK: vld3.32
 	%tmp0 = bitcast i32* %A to i8*
-	%tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 2
        %tmp4 = add <2 x i32> %tmp2, %tmp3
@ -47,7 +47,7 @@ define <2 x float> @vld3f(float* %A) nounwind {
 ;CHECK: vld3f:
 ;CHECK: vld3.32
 	%tmp0 = bitcast float* %A to i8*
-	%tmp1 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 2
        %tmp4 = fadd <2 x float> %tmp2, %tmp3
@ -58,7 +58,7 @@ define <1 x i64> @vld3i64(i64* %A) nounwind {
 ;CHECK: vld3i64:
 ;CHECK: vld1.64
 	%tmp0 = bitcast i64* %A to i8*
-	%tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2
        %tmp4 = add <1 x i64> %tmp2, %tmp3
@ -69,7 +69,7 @@ define <16 x i8> @vld3Qi8(i8* %A) nounwind {
 ;CHECK: vld3Qi8:
 ;CHECK: vld3.8
 ;CHECK: vld3.8
-	%tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A)
+	%tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A, i32 1)
        %tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2
        %tmp4 = add <16 x i8> %tmp2, %tmp3
@ -81,7 +81,7 @@ define <8 x i16> @vld3Qi16(i16* %A) nounwind {
 ;CHECK: vld3.16
 ;CHECK: vld3.16
 	%tmp0 = bitcast i16* %A to i8*
-	%tmp1 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 2
        %tmp4 = add <8 x i16> %tmp2, %tmp3
@ -93,7 +93,7 @@ define <4 x i32> @vld3Qi32(i32* %A) nounwind {
 ;CHECK: vld3.32
 ;CHECK: vld3.32
 	%tmp0 = bitcast i32* %A to i8*
-	%tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2
        %tmp4 = add <4 x i32> %tmp2, %tmp3
@ -105,20 +105,20 @@ define <4 x float> @vld3Qf(float* %A) nounwind {
 ;CHECK: vld3.32
 ;CHECK: vld3.32
 	%tmp0 = bitcast float* %A to i8*
-	%tmp1 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 2
        %tmp4 = fadd <4 x float> %tmp2, %tmp3
 	ret <4 x float> %tmp4
 }

-declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
-declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8*) nounwind readonly
-declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8*) nounwind readonly
-declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8*) nounwind readonly
-declare %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly ; 64-bit vector forms of vld3: operands are (address, alignment)
+declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8*, i32) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8*, i32) nounwind readonly

-declare %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8*) nounwind readonly
-declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8*) nounwind readonly
-declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8*) nounwind readonly
-declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly ; 128-bit vector forms of vld3: operands are (address, alignment)
+declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8*, i32) nounwind readonly
--- a/test/CodeGen/ARM/vld4.ll
+++ b/test/CodeGen/ARM/vld4.ll
@ -14,7 +14,7 @@
 define <8 x i8> @vld4i8(i8* %A) nounwind {
 ;CHECK: vld4i8:
 ;CHECK: vld4.8
-	%tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A)
+	%tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 1)
        %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
        %tmp4 = add <8 x i8> %tmp2, %tmp3
@ -25,7 +25,7 @@ define <4 x i16> @vld4i16(i16* %A) nounwind {
 ;CHECK: vld4i16:
 ;CHECK: vld4.16
 	%tmp0 = bitcast i16* %A to i8*
-	%tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 2
        %tmp4 = add <4 x i16> %tmp2, %tmp3
@ -36,7 +36,7 @@ define <2 x i32> @vld4i32(i32* %A) nounwind {
 ;CHECK: vld4i32:
 ;CHECK: vld4.32
 	%tmp0 = bitcast i32* %A to i8*
-	%tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2
        %tmp4 = add <2 x i32> %tmp2, %tmp3
@ -47,7 +47,7 @@ define <2 x float> @vld4f(float* %A) nounwind {
 ;CHECK: vld4f:
 ;CHECK: vld4.32
 	%tmp0 = bitcast float* %A to i8*
-	%tmp1 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 2
        %tmp4 = fadd <2 x float> %tmp2, %tmp3
@ -58,7 +58,7 @@ define <1 x i64> @vld4i64(i64* %A) nounwind {
 ;CHECK: vld4i64:
 ;CHECK: vld1.64
 	%tmp0 = bitcast i64* %A to i8*
-	%tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2
        %tmp4 = add <1 x i64> %tmp2, %tmp3
@ -69,7 +69,7 @@ define <16 x i8> @vld4Qi8(i8* %A) nounwind {
 ;CHECK: vld4Qi8:
 ;CHECK: vld4.8
 ;CHECK: vld4.8
-	%tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8* %A)
+	%tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8* %A, i32 1)
        %tmp2 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 2
        %tmp4 = add <16 x i8> %tmp2, %tmp3
@ -81,7 +81,7 @@ define <8 x i16> @vld4Qi16(i16* %A) nounwind {
 ;CHECK: vld4.16
 ;CHECK: vld4.16
 	%tmp0 = bitcast i16* %A to i8*
-	%tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2
        %tmp4 = add <8 x i16> %tmp2, %tmp3
@ -93,7 +93,7 @@ define <4 x i32> @vld4Qi32(i32* %A) nounwind {
 ;CHECK: vld4.32
 ;CHECK: vld4.32
 	%tmp0 = bitcast i32* %A to i8*
-	%tmp1 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 2
        %tmp4 = add <4 x i32> %tmp2, %tmp3
@ -105,20 +105,20 @@ define <4 x float> @vld4Qf(float* %A) nounwind {
 ;CHECK: vld4.32
 ;CHECK: vld4.32
 	%tmp0 = bitcast float* %A to i8*
-	%tmp1 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8* %tmp0)
+	%tmp1 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8* %tmp0, i32 1)
        %tmp2 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 0
        %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 2
        %tmp4 = fadd <4 x float> %tmp2, %tmp3
 	ret <4 x float> %tmp4
 }

-declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8*) nounwind readonly
-declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8*) nounwind readonly
-declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8*) nounwind readonly
-declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8*) nounwind readonly
-declare %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8*, i32) nounwind readonly ; 64-bit vector forms of vld4: operands are (address, alignment)
+declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8*, i32) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8*, i32) nounwind readonly

-declare %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8*) nounwind readonly
-declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8*) nounwind readonly
-declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8*) nounwind readonly
-declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8*, i32) nounwind readonly ; 128-bit vector forms of vld4: operands are (address, alignment)
+declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8*, i32) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8*, i32) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8*, i32) nounwind readonly
--- a/test/CodeGen/ARM/vldlane.ll
+++ b/test/CodeGen/ARM/vldlane.ll
@ -13,7 +13,7 @@ define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vld2lanei8:
 ;CHECK: vld2.8
 	%tmp1 = load <8 x i8>* %B
-	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
        %tmp5 = add <8 x i8> %tmp3, %tmp4
@ -25,7 +25,7 @@ define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vld2.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
-	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
        %tmp5 = add <4 x i16> %tmp3, %tmp4
@ -37,7 +37,7 @@ define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vld2.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
-	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
        %tmp5 = add <2 x i32> %tmp3, %tmp4
@ -49,7 +49,7 @@ define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
 ;CHECK: vld2.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <2 x float>* %B
-	%tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
        %tmp5 = fadd <2 x float> %tmp3, %tmp4
@ -61,7 +61,7 @@ define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vld2.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
-	%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
        %tmp5 = add <8 x i16> %tmp3, %tmp4
@ -73,7 +73,7 @@ define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vld2.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
-	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2)
+	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
        %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
        %tmp5 = add <4 x i32> %tmp3, %tmp4
@ -85,21 +85,21 @@ define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vld2.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <4 x float>* %B
-	%tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
        %tmp5 = fadd <4 x float> %tmp3, %tmp4
 	ret <4 x float> %tmp5
 }

-declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind readonly
-declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind readonly
-declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind readonly
-declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind readonly
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly ; 64-bit vector forms: the appended trailing i32 is the new alignment operand; the preceding i32 is presumably the lane index
+declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

-declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind readonly
-declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind readonly
-declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly ; 128-bit vector forms: the appended trailing i32 is the new alignment operand; the preceding i32 is presumably the lane index
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly

 %struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
@ -114,7 +114,7 @@ define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vld3lanei8:
 ;CHECK: vld3.8
 	%tmp1 = load <8 x i8>* %B
-	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
@ -128,7 +128,7 @@ define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vld3.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
-	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
@ -142,7 +142,7 @@ define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vld3.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
-	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
@ -156,7 +156,7 @@ define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
 ;CHECK: vld3.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <2 x float>* %B
-	%tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
@ -170,7 +170,7 @@ define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vld3.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
-	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
@ -184,7 +184,7 @@ define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vld3.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
-	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3)
+	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
        %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
@ -198,7 +198,7 @@ define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vld3.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <4 x float>* %B
-	%tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
@@ -207,14 +207,14 @@ define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
 	ret <4 x float> %tmp7
 }

-declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind readonly
-declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind readonly
-declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind readonly
-declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

-declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind readonly
-declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind readonly
-declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

 %struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>,  <8 x i8>,  <8 x i8> }
 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
@@ -229,7 +229,7 @@ define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vld4lanei8:
 ;CHECK: vld4.8
 	%tmp1 = load <8 x i8>* %B
-	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
@@ -245,7 +245,7 @@ define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vld4.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
-	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
@@ -261,7 +261,7 @@ define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vld4.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
-	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
@@ -277,7 +277,7 @@ define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
 ;CHECK: vld4.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <2 x float>* %B
-	%tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
@@ -293,7 +293,7 @@ define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vld4.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
-	%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
@@ -309,7 +309,7 @@ define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vld4.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
-	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
@@ -325,7 +325,7 @@ define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vld4.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <4 x float>* %B
-	%tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+	%tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
        %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
        %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
        %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
@@ -336,11 +336,11 @@ define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
 	ret <4 x float> %tmp9
 }

-declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind readonly
-declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind readonly
-declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind readonly
-declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

-declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind readonly
-declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind readonly
-declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
--- a/test/CodeGen/ARM/vst1.ll
+++ b/test/CodeGen/ARM/vst1.ll
@@ -4,7 +4,7 @@ define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst1i8:
 ;CHECK: vst1.8
 	%tmp1 = load <8 x i8>* %B
-	call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1)
+	call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1, i32 1)
 	ret void
 }

@@ -13,7 +13,7 @@ define void @vst1i16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vst1.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
-	call void @llvm.arm.neon.vst1.v4i16(i8* %tmp0, <4 x i16> %tmp1)
+	call void @llvm.arm.neon.vst1.v4i16(i8* %tmp0, <4 x i16> %tmp1, i32 1)
 	ret void
 }

@@ -22,7 +22,7 @@ define void @vst1i32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vst1.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
-	call void @llvm.arm.neon.vst1.v2i32(i8* %tmp0, <2 x i32> %tmp1)
+	call void @llvm.arm.neon.vst1.v2i32(i8* %tmp0, <2 x i32> %tmp1, i32 1)
 	ret void
 }

@@ -31,7 +31,7 @@ define void @vst1f(float* %A, <2 x float>* %B) nounwind {
 ;CHECK: vst1.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <2 x float>* %B
-	call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1)
+	call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1)
 	ret void
 }

@@ -40,7 +40,7 @@ define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind {
 ;CHECK: vst1.64
 	%tmp0 = bitcast i64* %A to i8*
 	%tmp1 = load <1 x i64>* %B
-	call void @llvm.arm.neon.vst1.v1i64(i8* %tmp0, <1 x i64> %tmp1)
+	call void @llvm.arm.neon.vst1.v1i64(i8* %tmp0, <1 x i64> %tmp1, i32 1)
 	ret void
 }

@@ -48,7 +48,7 @@ define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK: vst1Qi8:
 ;CHECK: vst1.8
 	%tmp1 = load <16 x i8>* %B
-	call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1)
+	call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1, i32 1)
 	ret void
 }

@@ -57,7 +57,7 @@ define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vst1.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
-	call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1)
+	call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 1)
 	ret void
 }

@@ -66,7 +66,7 @@ define void @vst1Qi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vst1.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
-	call void @llvm.arm.neon.vst1.v4i32(i8* %tmp0, <4 x i32> %tmp1)
+	call void @llvm.arm.neon.vst1.v4i32(i8* %tmp0, <4 x i32> %tmp1, i32 1)
 	ret void
 }

@@ -75,7 +75,7 @@ define void @vst1Qf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vst1.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <4 x float>* %B
-	call void @llvm.arm.neon.vst1.v4f32(i8* %tmp0, <4 x float> %tmp1)
+	call void @llvm.arm.neon.vst1.v4f32(i8* %tmp0, <4 x float> %tmp1, i32 1)
 	ret void
 }

@@ -84,18 +84,18 @@ define void @vst1Qi64(i64* %A, <2 x i64>* %B) nounwind {
 ;CHECK: vst1.64
 	%tmp0 = bitcast i64* %A to i8*
 	%tmp1 = load <2 x i64>* %B
-	call void @llvm.arm.neon.vst1.v2i64(i8* %tmp0, <2 x i64> %tmp1)
+	call void @llvm.arm.neon.vst1.v2i64(i8* %tmp0, <2 x i64> %tmp1, i32 1)
 	ret void
 }

-declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>) nounwind
-declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>) nounwind
-declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>) nounwind
-declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>) nounwind
-declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>) nounwind
+declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32) nounwind

-declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>) nounwind
-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind
-declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind
-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind
-declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>) nounwind
+declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32) nounwind
--- a/test/CodeGen/ARM/vst2.ll
+++ b/test/CodeGen/ARM/vst2.ll
@@ -4,7 +4,7 @@ define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst2i8:
 ;CHECK: vst2.8
 	%tmp1 = load <8 x i8>* %B
-	call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1)
+	call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
 	ret void
 }

@@ -13,7 +13,7 @@ define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vst2.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
-	call void @llvm.arm.neon.vst2.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1)
+	call void @llvm.arm.neon.vst2.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
 	ret void
 }

@@ -22,7 +22,7 @@ define void @vst2i32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vst2.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
-	call void @llvm.arm.neon.vst2.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1)
+	call void @llvm.arm.neon.vst2.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
 	ret void
 }

@@ -31,7 +31,7 @@ define void @vst2f(float* %A, <2 x float>* %B) nounwind {
 ;CHECK: vst2.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <2 x float>* %B
-	call void @llvm.arm.neon.vst2.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1)
+	call void @llvm.arm.neon.vst2.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
 	ret void
 }

@@ -40,7 +40,7 @@ define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind {
 ;CHECK: vst1.64
 	%tmp0 = bitcast i64* %A to i8*
 	%tmp1 = load <1 x i64>* %B
-	call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1)
+	call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
 	ret void
 }

@@ -48,7 +48,7 @@ define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK: vst2Qi8:
 ;CHECK: vst2.8
 	%tmp1 = load <16 x i8>* %B
-	call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1)
+	call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 1)
 	ret void
 }

@@ -57,7 +57,7 @@ define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vst2.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
-	call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1)
+	call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
 	ret void
 }

@@ -66,7 +66,7 @@ define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vst2.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
-	call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1)
+	call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
 	ret void
 }

@@ -75,17 +75,17 @@ define void @vst2Qf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vst2.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <4 x float>* %B
-	call void @llvm.arm.neon.vst2.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1)
+	call void @llvm.arm.neon.vst2.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
 	ret void
 }

-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>) nounwind
-declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>) nounwind
-declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>) nounwind
-declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>) nounwind
-declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>) nounwind
+declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32) nounwind

-declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>) nounwind
-declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>) nounwind
-declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>) nounwind
-declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>) nounwind
+declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
--- a/test/CodeGen/ARM/vst3.ll
+++ b/test/CodeGen/ARM/vst3.ll
@@ -4,7 +4,7 @@ define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst3i8:
 ;CHECK: vst3.8
 	%tmp1 = load <8 x i8>* %B
-	call void @llvm.arm.neon.vst3.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1)
+	call void @llvm.arm.neon.vst3.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
 	ret void
 }

@@ -13,7 +13,7 @@ define void @vst3i16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vst3.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
-	call void @llvm.arm.neon.vst3.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1)
+	call void @llvm.arm.neon.vst3.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
 	ret void
 }

@@ -22,7 +22,7 @@ define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vst3.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
-	call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1)
+	call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
 	ret void
 }

@@ -31,7 +31,7 @@ define void @vst3f(float* %A, <2 x float>* %B) nounwind {
 ;CHECK: vst3.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <2 x float>* %B
-	call void @llvm.arm.neon.vst3.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1)
+	call void @llvm.arm.neon.vst3.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
 	ret void
 }

@@ -40,7 +40,7 @@ define void @vst3i64(i64* %A, <1 x i64>* %B) nounwind {
 ;CHECK: vst1.64
 	%tmp0 = bitcast i64* %A to i8*
 	%tmp1 = load <1 x i64>* %B
-	call void @llvm.arm.neon.vst3.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1)
+	call void @llvm.arm.neon.vst3.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
 	ret void
 }

@@ -49,7 +49,7 @@ define void @vst3Qi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK: vst3.8
 ;CHECK: vst3.8
 	%tmp1 = load <16 x i8>* %B
-	call void @llvm.arm.neon.vst3.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1)
+	call void @llvm.arm.neon.vst3.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 1)
 	ret void
 }

@@ -59,7 +59,7 @@ define void @vst3Qi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vst3.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
-	call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1)
+	call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
 	ret void
 }

@@ -69,7 +69,7 @@ define void @vst3Qi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vst3.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
-	call void @llvm.arm.neon.vst3.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1)
+	call void @llvm.arm.neon.vst3.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
 	ret void
 }

@@ -79,17 +79,17 @@ define void @vst3Qf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vst3.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <4 x float>* %B
-	call void @llvm.arm.neon.vst3.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1)
+	call void @llvm.arm.neon.vst3.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
 	ret void
 }

-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
-declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>) nounwind
-declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>) nounwind
-declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>) nounwind
-declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>) nounwind
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind

-declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>) nounwind
-declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>) nounwind
-declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>) nounwind
-declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>) nounwind
+declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind
--- a/test/CodeGen/ARM/vst4.ll
+++ b/test/CodeGen/ARM/vst4.ll
@@ -4,7 +4,7 @@ define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst4i8:
 ;CHECK: vst4.8
 	%tmp1 = load <8 x i8>* %B
-	call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1)
+	call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
 	ret void
 }

@@ -13,7 +13,7 @@ define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vst4.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
-	call void @llvm.arm.neon.vst4.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1)
+	call void @llvm.arm.neon.vst4.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
 	ret void
 }

@@ -22,7 +22,7 @@ define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vst4.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
-	call void @llvm.arm.neon.vst4.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1)
+	call void @llvm.arm.neon.vst4.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
 	ret void
 }

@@ -31,7 +31,7 @@ define void @vst4f(float* %A, <2 x float>* %B) nounwind {
 ;CHECK: vst4.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <2 x float>* %B
-	call void @llvm.arm.neon.vst4.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1)
+	call void @llvm.arm.neon.vst4.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
 	ret void
 }

@@ -40,7 +40,7 @@ define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind {
 ;CHECK: vst1.64
 	%tmp0 = bitcast i64* %A to i8*
 	%tmp1 = load <1 x i64>* %B
-	call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1)
+	call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
 	ret void
 }

@@ -49,7 +49,7 @@ define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK: vst4.8
 ;CHECK: vst4.8
 	%tmp1 = load <16 x i8>* %B
-	call void @llvm.arm.neon.vst4.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1)
+	call void @llvm.arm.neon.vst4.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 1)
 	ret void
 }

@@ -59,7 +59,7 @@ define void @vst4Qi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vst4.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
-	call void @llvm.arm.neon.vst4.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1)
+	call void @llvm.arm.neon.vst4.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
 	ret void
 }

@@ -69,7 +69,7 @@ define void @vst4Qi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vst4.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
-	call void @llvm.arm.neon.vst4.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1)
+	call void @llvm.arm.neon.vst4.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
 	ret void
 }

@@ -79,17 +79,17 @@ define void @vst4Qf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vst4.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <4 x float>* %B
-	call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1)
+	call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
 	ret void
 }

-declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
-declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>) nounwind
-declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>) nounwind
-declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>) nounwind
-declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>) nounwind
+declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind

-declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind
-declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>) nounwind
-declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>) nounwind
-declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>) nounwind
+declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) nounwind
--- a/test/CodeGen/ARM/vstlane.ll
+++ b/test/CodeGen/ARM/vstlane.ll
@@ -4,7 +4,7 @@ define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst2lanei8:
 ;CHECK: vst2.8
 	%tmp1 = load <8 x i8>* %B
-	call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -13,7 +13,7 @@ define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vst2.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
-	call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -22,7 +22,7 @@ define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vst2.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
-	call void @llvm.arm.neon.vst2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -31,7 +31,7 @@ define void @vst2lanef(float* %A, <2 x float>* %B) nounwind {
 ;CHECK: vst2.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <2 x float>* %B
-	call void @llvm.arm.neon.vst2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -40,7 +40,7 @@ define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vst2.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
-	call void @llvm.arm.neon.vst2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -49,7 +49,7 @@ define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vst2.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
-	call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2)
+	call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
 	ret void
 }

@@ -58,24 +58,24 @@ define void @vst2laneQf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vst2.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <4 x float>* %B
-	call void @llvm.arm.neon.vst2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3)
+	call void @llvm.arm.neon.vst2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1)
 	ret void
 }

-declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
-declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind
-declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind
-declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind

-declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind
-declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind
-declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind

 define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst3lanei8:
 ;CHECK: vst3.8
 	%tmp1 = load <8 x i8>* %B
-	call void @llvm.arm.neon.vst3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -84,7 +84,7 @@ define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vst3.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
-	call void @llvm.arm.neon.vst3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -93,7 +93,7 @@ define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vst3.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
-	call void @llvm.arm.neon.vst3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -102,7 +102,7 @@ define void @vst3lanef(float* %A, <2 x float>* %B) nounwind {
 ;CHECK: vst3.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <2 x float>* %B
-	call void @llvm.arm.neon.vst3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -111,7 +111,7 @@ define void @vst3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vst3.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
-	call void @llvm.arm.neon.vst3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6)
+	call void @llvm.arm.neon.vst3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 1)
 	ret void
 }

@@ -120,7 +120,7 @@ define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vst3.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
-	call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0)
+	call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
 	ret void
 }

@@ -129,25 +129,25 @@ define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vst3.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <4 x float>* %B
-	call void @llvm.arm.neon.vst3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
 	ret void
 }

-declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
-declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
-declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
-declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind

-declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
-declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
-declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind


 define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst4lanei8:
 ;CHECK: vst4.8
 	%tmp1 = load <8 x i8>* %B
-	call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -156,7 +156,7 @@ define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vst4.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
-	call void @llvm.arm.neon.vst4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -165,7 +165,7 @@ define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vst4.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
-	call void @llvm.arm.neon.vst4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -174,7 +174,7 @@ define void @vst4lanef(float* %A, <2 x float>* %B) nounwind {
 ;CHECK: vst4.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <2 x float>* %B
-	call void @llvm.arm.neon.vst4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
 	ret void
 }

@@ -183,7 +183,7 @@ define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vst4.16
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
-	call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7)
+	call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 1)
 	ret void
 }

@@ -192,7 +192,7 @@ define void @vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vst4.32
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
-	call void @llvm.arm.neon.vst4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2)
+	call void @llvm.arm.neon.vst4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
 	ret void
 }

@@ -201,15 +201,15 @@ define void @vst4laneQf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vst4.32
 	%tmp0 = bitcast float* %A to i8*
 	%tmp1 = load <4 x float>* %B
-	call void @llvm.arm.neon.vst4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+	call void @llvm.arm.neon.vst4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
 	ret void
 }

-declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
-declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
-declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
-declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind

-declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
-declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
-declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
--- a/test/CodeGen/Thumb2/crash.ll
+++ b/test/CodeGen/Thumb2/crash.ll
@@ -14,11 +14,11 @@ entry:
  %6 = bitcast i32* %sp3 to <4 x i32>*            ; <<4 x i32>*> [#uses=1]
  %7 = load <4 x i32>* %6, align 16               ; <<4 x i32>> [#uses=1]
  %8 = bitcast i32* %dp to i8*                    ; <i8*> [#uses=1]
-  tail call void @llvm.arm.neon.vst4.v4i32(i8* %8, <4 x i32> %1, <4 x i32> %3, <4 x i32> %5, <4 x i32> %7)
+  tail call void @llvm.arm.neon.vst4.v4i32(i8* %8, <4 x i32> %1, <4 x i32> %3, <4 x i32> %5, <4 x i32> %7, i32 1)
  ret void
 }

-declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>) nounwind
+declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind

 @sbuf = common global [16 x i32] zeroinitializer, align 16 ; <[16 x i32]*> [#uses=5]
 @dbuf = common global [16 x i32] zeroinitializer  ; <[16 x i32]*> [#uses=2]
@@ -44,6 +44,6 @@ bb2:                                              ; preds = %bb
  %3 = load <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32]* @sbuf, i32 0, i32 4) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1]
  %4 = load <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32]* @sbuf, i32 0, i32 8) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1]
  %5 = load <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32]* @sbuf, i32 0, i32 12) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1]
-  tail call void @llvm.arm.neon.vst4.v4i32(i8* bitcast ([16 x i32]* @dbuf to i8*), <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5) nounwind
+  tail call void @llvm.arm.neon.vst4.v4i32(i8* bitcast ([16 x i32]* @dbuf to i8*), <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, i32 1) nounwind
  ret i32 0
 }
--- a/test/CodeGen/Thumb2/machine-licm-vdup.ll
+++ b/test/CodeGen/Thumb2/machine-licm-vdup.ll
@@ -16,10 +16,10 @@ bb1:
  %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ]
  %tmp1 = shl i32 %indvar, 2
  %gep1 = getelementptr i8* %ptr1, i32 %tmp1
-  %tmp2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %gep1)
+  %tmp2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %gep1, i32 1)
  %tmp3 = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> %tmp2)
  %gep2 = getelementptr i8* %ptr2, i32 %tmp1
-  call void @llvm.arm.neon.vst1.v4f32(i8* %gep2, <4 x float> %tmp3)
+  call void @llvm.arm.neon.vst1.v4f32(i8* %gep2, <4 x float> %tmp3, i32 1)
  %indvar.next = add i32 %indvar, 1
  %cond = icmp eq i32 %indvar.next, 10
  br i1 %cond, label %bb2, label %bb1
@@ -31,8 +31,8 @@ bb2:
 ; CHECK-NOT: LCPI1_0:
 ; CHECK: .subsections_via_symbols

-declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly

-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind

 declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
--- a/test/CodeGen/Thumb2/machine-licm.ll
+++ b/test/CodeGen/Thumb2/machine-licm.ll
@@ -64,10 +64,10 @@ bb1:
  %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ]
  %tmp1 = shl i32 %indvar, 2
  %gep1 = getelementptr i8* %ptr1, i32 %tmp1
-  %tmp2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %gep1)
+  %tmp2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %gep1, i32 1)
  %tmp3 = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> %tmp2)
  %gep2 = getelementptr i8* %ptr2, i32 %tmp1
-  call void @llvm.arm.neon.vst1.v4f32(i8* %gep2, <4 x float> %tmp3)
+  call void @llvm.arm.neon.vst1.v4f32(i8* %gep2, <4 x float> %tmp3, i32 1)
  %indvar.next = add i32 %indvar, 1
  %cond = icmp eq i32 %indvar.next, 10
  br i1 %cond, label %bb2, label %bb1
@@ -79,8 +79,8 @@ bb2:
 ; CHECK: LCPI1_0:
 ; CHECK: .section

-declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly

-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind

 declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
--- a/test/CodeGen/Thumb2/thumb2-spill-q.ll
+++ b/test/CodeGen/Thumb2/thumb2-spill-q.ll
@@ -7,7 +7,7 @@
 %quux = type { i32 (...)**, %baz*, i32 }
 %quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo }

-declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly

 define void @aaa(%quuz* %this, i8* %block) {
 ; CHECK: aaa:
@@ -15,11 +15,11 @@ define void @aaa(%quuz* %this, i8* %block) {
 ; CHECK: vst1.64 {{.*}}[{{.*}}, :128]
 ; CHECK: vld1.64 {{.*}}[{{.*}}, :128]
 entry:
-  %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+  %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
  store float 6.300000e+01, float* undef, align 4
-  %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+  %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
  store float 0.000000e+00, float* undef, align 4
-  %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+  %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
  %val173 = load <4 x float>* undef               ; <<4 x float>> [#uses=1]
  br label %bb4