Add support for ARM's Advanced SIMD (NEON) instruction set.

This is still a work in progress, but most of the NEON instruction set
is supported.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73919 91177308-0d34-0410-b5e6-96231b3b80d8
Bob Wilson 2009-06-22 23:27:02 +00:00
parent 5de83afcdc
commit 5bafff36c7
102 changed files with 10302 additions and 127 deletions


@ -116,6 +116,7 @@ def llvm_v2i64_ty : LLVMType<v2i64>; // 2 x i64
def llvm_v2i32_ty : LLVMType<v2i32>; // 2 x i32
def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
def llvm_v2f32_ty : LLVMType<v2f32>; // 2 x float
def llvm_v4f32_ty : LLVMType<v4f32>; // 4 x float
def llvm_v2f64_ty : LLVMType<v2f64>; // 2 x double


@ -19,3 +19,298 @@ let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">,
Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
// Advanced SIMD (NEON)
let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
// The following classes do not correspond directly to GCC builtins.
class Neon_1Arg_Intrinsic
: Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
class Neon_1Arg_Float_Intrinsic
: Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
class Neon_1Arg_Narrow_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
class Neon_1Arg_Long_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMTruncatedElementVectorType<0>], [IntrNoMem]>;
class Neon_2Arg_Intrinsic
: Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
class Neon_2Arg_Float_Intrinsic
: Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
class Neon_2Arg_Narrow_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMExtendedElementVectorType<0>,
LLVMExtendedElementVectorType<0>],
[IntrNoMem]>;
class Neon_2Arg_Long_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMTruncatedElementVectorType<0>,
LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
class Neon_2Arg_Wide_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
class Neon_3Arg_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
class Neon_3Arg_Long_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>,
LLVMTruncatedElementVectorType<0>,
LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
class Neon_CvtFxToFP_Intrinsic
: Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
class Neon_CvtFPToFx_Intrinsic
: Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
}
// Arithmetic ops
let Properties = [IntrNoMem, Commutative] in {
// Vector Add.
def int_arm_neon_vhadds : Neon_2Arg_Intrinsic;
def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic;
def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqadds : Neon_2Arg_Intrinsic;
def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vaddhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vaddls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vaddlu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vaddws : Neon_2Arg_Wide_Intrinsic;
def int_arm_neon_vaddwu : Neon_2Arg_Wide_Intrinsic;
// Vector Multiply.
def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;
// Vector Multiply and Accumulate/Subtract.
def int_arm_neon_vmlals : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlalu : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlsls : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlslu : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;
// Vector Maximum.
def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic;
def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic;
def int_arm_neon_vmaxf : Neon_2Arg_Float_Intrinsic;
// Vector Minimum.
def int_arm_neon_vmins : Neon_2Arg_Intrinsic;
def int_arm_neon_vminu : Neon_2Arg_Intrinsic;
def int_arm_neon_vminf : Neon_2Arg_Float_Intrinsic;
// Vector Reciprocal Step.
def int_arm_neon_vrecps : Neon_2Arg_Float_Intrinsic;
// Vector Reciprocal Square Root Step.
def int_arm_neon_vrsqrts : Neon_2Arg_Float_Intrinsic;
}
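// For example, a signed saturating add of two <8 x i8> vectors is written
// with the element type appended to the overloaded intrinsic name, following
// the same mangling as the vaba/vabd tests later in this commit (sketch):
//   %r = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, <8 x i8> %b)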
// Vector Subtract.
def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic;
def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic;
def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic;
def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vsubls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vsublu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vsubws : Neon_2Arg_Wide_Intrinsic;
def int_arm_neon_vsubwu : Neon_2Arg_Wide_Intrinsic;
// Vector Absolute Compare.
let TargetPrefix = "arm" in {
def int_arm_neon_vacged : Intrinsic<[llvm_v2i32_ty],
[llvm_v2f32_ty, llvm_v2f32_ty],
[IntrNoMem]>;
def int_arm_neon_vacgeq : Intrinsic<[llvm_v4i32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
def int_arm_neon_vacgtd : Intrinsic<[llvm_v2i32_ty],
[llvm_v2f32_ty, llvm_v2f32_ty],
[IntrNoMem]>;
def int_arm_neon_vacgtq : Intrinsic<[llvm_v4i32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
}
// Vector Absolute Differences.
def int_arm_neon_vabds : Neon_2Arg_Intrinsic;
def int_arm_neon_vabdu : Neon_2Arg_Intrinsic;
def int_arm_neon_vabdf : Neon_2Arg_Float_Intrinsic;
def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic;
// Vector Absolute Difference and Accumulate.
def int_arm_neon_vabas : Neon_3Arg_Intrinsic;
def int_arm_neon_vabau : Neon_3Arg_Intrinsic;
def int_arm_neon_vabals : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic;
// Vector Pairwise Add.
def int_arm_neon_vpaddi : Neon_2Arg_Intrinsic;
def int_arm_neon_vpaddf : Neon_2Arg_Float_Intrinsic;
// Vector Pairwise Add Long.
// Note: This is different from the other "long" NEON intrinsics because
// the result vector has half as many elements as the source vector.
// The source and destination vector types must be specified separately.
let TargetPrefix = "arm" in {
def int_arm_neon_vpaddls : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
[IntrNoMem]>;
def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
[IntrNoMem]>;
}
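// For example, pairwise-add-long over an <8 x i8> vector produces a <4 x i16>
// result with half as many elements (a sketch, assuming the two-suffix name
// mangling used for doubly-overloaded intrinsics):
//   %sum = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a)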
// Vector Pairwise Add and Accumulate Long.
// Note: This is similar to vpaddl but the destination vector also appears
// as the first argument.
let TargetPrefix = "arm" in {
def int_arm_neon_vpadals : Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, llvm_anyint_ty],
[IntrNoMem]>;
def int_arm_neon_vpadalu : Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, llvm_anyint_ty],
[IntrNoMem]>;
}
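// A corresponding vpadal sketch, where the <4 x i16> accumulator is passed
// as the first operand (same mangling assumption as above):
//   %acc = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a,
//                                                           <8 x i8> %b)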
// Vector Pairwise Maximum and Minimum.
def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic;
def int_arm_neon_vpmaxu : Neon_2Arg_Intrinsic;
def int_arm_neon_vpmaxf : Neon_2Arg_Float_Intrinsic;
def int_arm_neon_vpmins : Neon_2Arg_Intrinsic;
def int_arm_neon_vpminu : Neon_2Arg_Intrinsic;
def int_arm_neon_vpminf : Neon_2Arg_Float_Intrinsic;
// Vector Shifts:
//
// The various saturating and rounding vector shift operations need to be
// represented by intrinsics in LLVM, and even the basic VSHL variable shift
// operation cannot be safely translated to LLVM's shift operators. VSHL can
// be used for both left and right shifts, or even combinations of the two,
// depending on the signs of the shift amounts. It also has well-defined
// behavior for shift amounts that LLVM leaves undefined. Only basic shifts
// by constants can be represented with LLVM's shift operators.
//
// The shift counts for these intrinsics are always vectors, even for constant
// shifts, where the constant is replicated. For consistency with VSHL (and
// other variable shift instructions), left shifts have positive shift counts
// and right shifts have negative shift counts. This convention is also used
// for constant right shift intrinsics, and to help preserve sanity, the
// intrinsic names use "shift" instead of either "shl" or "shr". Where
// applicable, signed and unsigned versions of the intrinsics are
// distinguished with "s" and "u" suffixes. A few NEON shift instructions,
// such as VQSHLU, take signed operands but produce unsigned results; these
// use a "su" suffix.
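//
// For example, an element-wise right shift by 3 is expressed with negative
// shift counts (a sketch; the overloaded-name suffix follows the convention
// used by the tests in this commit):
//
//   %shr = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %a,
//            <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)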
// Vector Shift.
def int_arm_neon_vshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vshiftls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vshiftlu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vshiftn : Neon_2Arg_Narrow_Intrinsic;
// Vector Rounding Shift.
def int_arm_neon_vrshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vrshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vrshiftn : Neon_2Arg_Narrow_Intrinsic;
// Vector Saturating Shift.
def int_arm_neon_vqshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vqshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqshiftsu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqshiftns : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqshiftnu : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqshiftnsu : Neon_2Arg_Narrow_Intrinsic;
// Vector Saturating Rounding Shift.
def int_arm_neon_vqrshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrshiftns : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqrshiftnu : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqrshiftnsu : Neon_2Arg_Narrow_Intrinsic;
// Vector Shift and Insert.
def int_arm_neon_vshiftins : Neon_3Arg_Intrinsic;
// Vector Absolute Value and Saturating Absolute Value.
def int_arm_neon_vabs : Neon_1Arg_Intrinsic;
def int_arm_neon_vabsf : Neon_1Arg_Float_Intrinsic;
def int_arm_neon_vqabs : Neon_1Arg_Intrinsic;
// Vector Saturating Negate.
def int_arm_neon_vqneg : Neon_1Arg_Intrinsic;
// Vector Count Leading Sign/Zero Bits.
def int_arm_neon_vcls : Neon_1Arg_Intrinsic;
def int_arm_neon_vclz : Neon_1Arg_Intrinsic;
// Vector Count One Bits.
def int_arm_neon_vcnt : Neon_1Arg_Intrinsic;
// Vector Reciprocal Estimate.
def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic;
def int_arm_neon_vrecpef : Neon_1Arg_Float_Intrinsic;
// Vector Reciprocal Square Root Estimate.
def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic;
def int_arm_neon_vrsqrtef : Neon_1Arg_Float_Intrinsic;
// Vector Conversions Between Floating-point and Fixed-point.
def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic;
def int_arm_neon_vcvtfp2fxu : Neon_CvtFPToFx_Intrinsic;
def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic;
def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic;
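// The i32 operand is the number of fraction bits in the fixed-point value.
// For example, converting <2 x float> to signed fixed-point with 3 fraction
// bits (a sketch, assuming two-suffix mangling for the two overloaded types):
//   %fx = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %a,
//                                                              i32 3)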
// Narrowing and Lengthening Vector Moves.
def int_arm_neon_vmovn : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vmovls : Neon_1Arg_Long_Intrinsic;
def int_arm_neon_vmovlu : Neon_1Arg_Long_Intrinsic;
let TargetPrefix = "arm" in {
// De-interleaving vector loads from N-element structures.
def int_arm_neon_vld3i : Intrinsic<[llvm_anyint_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
def int_arm_neon_vld3f : Intrinsic<[llvm_anyfloat_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
def int_arm_neon_vld4i : Intrinsic<[llvm_anyint_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
def int_arm_neon_vld4f : Intrinsic<[llvm_anyfloat_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
// Interleaving vector stores from N-element structures.
def int_arm_neon_vst3i : Intrinsic<[llvm_void_ty],
[llvm_anyint_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
def int_arm_neon_vst3f : Intrinsic<[llvm_void_ty],
[llvm_anyfloat_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
def int_arm_neon_vst4i : Intrinsic<[llvm_void_ty],
[llvm_anyint_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
def int_arm_neon_vst4f : Intrinsic<[llvm_void_ty],
[llvm_anyfloat_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
}


@ -24,19 +24,29 @@ def CC_ARM_APCS : CallingConv<[
CCIfType<[i8, i16], CCPromoteToType<i32>>,
// f64 is passed in pairs of GPRs, possibly split onto the stack
CCIfType<[f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
CCIfType<[i32], CCAssignToStack<4, 4>>,
CCIfType<[f64], CCAssignToStack<8, 4>>
CCIfType<[f64], CCAssignToStack<8, 4>>,
CCIfType<[v2f64], CCAssignToStack<16, 4>>
]>;
def RetCC_ARM_APCS : CallingConv<[
CCIfType<[f32], CCBitConvertToType<i32>>,
CCIfType<[f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
@ -59,7 +69,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[
CCAssignToReg<[R0, R1, R2, R3]>>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
CCIfType<[f64], CCAssignToStack<8, 8>>
CCIfType<[f64], CCAssignToStack<8, 8>>,
CCIfType<[v2f64], CCAssignToStack<16, 8>>
]>;
def RetCC_ARM_AAPCS_Common : CallingConv<[
@ -72,13 +83,21 @@ def RetCC_ARM_AAPCS_Common : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_ARM_AAPCS : CallingConv<[
CCIfType<[f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<CC_ARM_AAPCS_Common>
]>;
def RetCC_ARM_AAPCS : CallingConv<[
CCIfType<[f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
@ -88,6 +107,10 @@ def RetCC_ARM_AAPCS : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
@ -95,6 +118,10 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
]>;
def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,


@ -32,6 +32,9 @@
#include "llvm/Support/Debug.h"
using namespace llvm;
static const unsigned arm_dsubreg_0 = 5;
static const unsigned arm_dsubreg_1 = 6;
//===--------------------------------------------------------------------===//
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
/// instructions for SelectionDAG operations.
@ -918,6 +921,65 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
MVT::Other, Ops, 3);
}
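// Lower CONCAT_VECTORS by inserting the two 64-bit halves into the D
// sub-registers of an IMPLICIT_DEF'd 128-bit register, skipping undef halves.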
case ISD::CONCAT_VECTORS: {
MVT VT = Op.getValueType();
assert(VT.is128BitVector() && Op.getNumOperands() == 2 &&
"unexpected CONCAT_VECTORS");
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDNode *Result =
CurDAG->getTargetNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT);
if (N0.getOpcode() != ISD::UNDEF)
Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
SDValue(Result, 0), N0,
CurDAG->getTargetConstant(arm_dsubreg_0,
MVT::i32));
if (N1.getOpcode() != ISD::UNDEF)
Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
SDValue(Result, 0), N1,
CurDAG->getTargetConstant(arm_dsubreg_1,
MVT::i32));
return Result;
}
case ISD::VECTOR_SHUFFLE: {
MVT VT = Op.getValueType();
// Match 128-bit splat to VDUPLANEQ. (This could be done with a Pat in
// ARMInstrNEON.td but it is awkward because the shuffle mask needs to be
// transformed first into a lane number and then to both a subregister
// index and an adjusted lane number.) If the source operand is a
// SCALAR_TO_VECTOR, leave it so it will be matched later as a VDUP.
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
if (VT.is128BitVector() && SVOp->isSplat() &&
Op.getOperand(0).getOpcode() != ISD::SCALAR_TO_VECTOR &&
Op.getOperand(1).getOpcode() == ISD::UNDEF) {
unsigned LaneVal = SVOp->getSplatIndex();
MVT HalfVT;
unsigned Opc = 0;
switch (VT.getVectorElementType().getSimpleVT()) {
default: assert(false && "unhandled VDUP splat type");
case MVT::i8: Opc = ARM::VDUPLN8q; HalfVT = MVT::v8i8; break;
case MVT::i16: Opc = ARM::VDUPLN16q; HalfVT = MVT::v4i16; break;
case MVT::i32: Opc = ARM::VDUPLN32q; HalfVT = MVT::v2i32; break;
case MVT::f32: Opc = ARM::VDUPLNfq; HalfVT = MVT::v2f32; break;
}
// The source operand needs to be changed to a subreg of the original
// 128-bit operand, and the lane number needs to be adjusted accordingly.
unsigned NumElts = VT.getVectorNumElements() / 2;
unsigned SRVal = (LaneVal < NumElts ? arm_dsubreg_0 : arm_dsubreg_1);
SDValue SR = CurDAG->getTargetConstant(SRVal, MVT::i32);
SDValue NewLane = CurDAG->getTargetConstant(LaneVal % NumElts, MVT::i32);
SDNode *SubReg = CurDAG->getTargetNode(TargetInstrInfo::EXTRACT_SUBREG,
dl, HalfVT, N->getOperand(0), SR);
return CurDAG->SelectNodeTo(N, Opc, VT, SDValue(SubReg, 0), NewLane);
}
break;
}
}
return SelectCode(Op);

(File diff suppressed because it is too large.)


@ -67,10 +67,65 @@ namespace llvm {
EH_SJLJ_SETJMP, // SjLj exception handling setjmp
EH_SJLJ_LONGJMP, // SjLj exception handling longjmp
THREAD_POINTER
THREAD_POINTER,
VCEQ, // Vector compare equal.
VCGE, // Vector compare greater than or equal.
VCGEU, // Vector compare unsigned greater than or equal.
VCGT, // Vector compare greater than.
VCGTU, // Vector compare unsigned greater than.
VTST, // Vector test bits.
// Vector shift by immediate:
VSHL, // ...left
VSHRs, // ...right (signed)
VSHRu, // ...right (unsigned)
VSHLLs, // ...left long (signed)
VSHLLu, // ...left long (unsigned)
VSHLLi, // ...left long (with maximum shift count)
VSHRN, // ...right narrow
// Vector rounding shift by immediate:
VRSHRs, // ...right (signed)
VRSHRu, // ...right (unsigned)
VRSHRN, // ...right narrow
// Vector saturating shift by immediate:
VQSHLs, // ...left (signed)
VQSHLu, // ...left (unsigned)
VQSHLsu, // ...left (signed to unsigned)
VQSHRNs, // ...right narrow (signed)
VQSHRNu, // ...right narrow (unsigned)
VQSHRNsu, // ...right narrow (signed to unsigned)
// Vector saturating rounding shift by immediate:
VQRSHRNs, // ...right narrow (signed)
VQRSHRNu, // ...right narrow (unsigned)
VQRSHRNsu, // ...right narrow (signed to unsigned)
// Vector shift and insert:
VSLI, // ...left
VSRI, // ...right
// Vector get lane (VMOV scalar to ARM core register)
// (These are used for 8- and 16-bit element types only.)
VGETLANEu, // zero-extend vector extract element
VGETLANEs, // sign-extend vector extract element
// Vector duplicate lane (128-bit result only; 64-bit is a shuffle)
VDUPLANEQ // splat a lane from a 64-bit vector to a 128-bit vector
};
}
/// Define some predicates that are used for node matching.
namespace ARM {
/// getVMOVImm - If this is a build_vector of constants which can be
/// formed by using a VMOV instruction of the specified element size,
/// return the constant being splatted. The ByteSize field indicates the
/// number of bytes of each element (1, 2, 4, or 8).
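/// For example, an <8 x i8> splat of the constant 1 can be materialized by a
/// single VMOV immediate, so getVMOVImm with ByteSize 1 would return 1.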
SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
}
//===--------------------------------------------------------------------===//
// ARMTargetLowering - ARM Implementation of the TargetLowering interface
@ -151,6 +206,21 @@ namespace llvm {
///
unsigned ARMPCLabelIndex;
void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
void PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVector<SDValue, 8> &MemOpChains,
ISD::ArgFlagsTy Flags);
SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG, DebugLoc dl);
CCAssignFn *CCAssignFnForNode(unsigned CC, bool Return) const;
SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
const SDValue &StackPtr, const CCValAssign &VA,


@ -49,6 +49,11 @@ def VFPMiscFrm : Format<22>;
def ThumbFrm : Format<23>;
def NEONFrm : Format<24>;
def NEONGetLnFrm : Format<25>;
def NEONSetLnFrm : Format<26>;
def NEONDupFrm : Format<27>;
// Misc flag for data processing instructions that indicates whether
// the instruction has an Rn register operand.
class UnaryDP { bit isUnaryDataProc = 1; }
@ -737,6 +742,14 @@ class TIx2<dag outs, dag ins, string asm, list<dag> pattern>
class TJTI<dag outs, dag ins, string asm, list<dag> pattern>
: ThumbI<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>;
// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb];
}
class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb, HasV5T];
}
//===----------------------------------------------------------------------===//
@ -857,12 +870,102 @@ class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// ARM NEON Instruction templates.
//
// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb];
class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm,
string cstr, list<dag> pattern>
: InstARM<am, Size4Bytes, im, NEONFrm, cstr> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
let Pattern = pattern;
list<Predicate> Predicates = [HasNEON];
}
class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb, HasV5T];
class NI<dag oops, dag iops, string asm, list<dag> pattern>
: NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, "", pattern> {
}
class NDataI<dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, cstr, pattern> {
let Inst{31-25} = 0b1111001;
}
// NEON "one register and a modified immediate" format.
class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
bit op5, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{23} = op23;
let Inst{21-19} = op21_19;
let Inst{11-8} = op11_8;
let Inst{7} = op7;
let Inst{6} = op6;
let Inst{5} = op5;
let Inst{4} = op4;
}
// NEON 2 vector register format.
class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
bits<5> op11_7, bit op6, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{24-23} = op24_23;
let Inst{21-20} = op21_20;
let Inst{19-18} = op19_18;
let Inst{17-16} = op17_16;
let Inst{11-7} = op11_7;
let Inst{6} = op6;
let Inst{4} = op4;
}
// NEON 2 vector register with immediate.
class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op6, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{24} = op24;
let Inst{23} = op23;
let Inst{21-16} = op21_16;
let Inst{11-8} = op11_8;
let Inst{7} = op7;
let Inst{6} = op6;
let Inst{4} = op4;
}
// NEON 3 vector register format.
class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{24} = op24;
let Inst{23} = op23;
let Inst{21-20} = op21_20;
let Inst{11-8} = op11_8;
let Inst{6} = op6;
let Inst{4} = op4;
}
// NEON VMOVs between scalar and core registers.
class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, Format f, string opc, string asm,
list<dag> pattern>
: AI<oops, iops, f, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{11-8} = opcod2;
let Inst{6-5} = opcod3;
let Inst{4} = 1;
list<Predicate> Predicates = [HasNEON];
}
class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, string opc, string asm, list<dag> pattern>
: NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, opc, asm,
pattern>;
class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, string opc, string asm, list<dag> pattern>
: NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, opc, asm,
pattern>;
class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, string opc, string asm, list<dag> pattern>
: NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, opc, asm, pattern>;


@ -59,6 +59,8 @@ bool ARMInstrInfo::isMoveInstr(const MachineInstr &MI,
return false;
case ARM::FCPYS:
case ARM::FCPYD:
case ARM::VMOVD:
case ARM::VMOVQ:
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
return true;
@ -528,6 +530,8 @@ bool ARMInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
else if (DestRC == ARM::DPRRegisterClass)
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
.addReg(SrcReg));
else if (DestRC == ARM::QPRRegisterClass)
BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg);
else
return false;
@ -844,6 +848,10 @@ canFoldMemoryOperand(const MachineInstr *MI,
case ARM::FCPYS:
case ARM::FCPYD:
return true;
case ARM::VMOVD:
case ARM::VMOVQ:
return false; // FIXME
}
return false;


@ -114,6 +114,12 @@ namespace ARMII {
// Thumb format
ThumbFrm = 23 << FormShift,
// NEON format
NEONFrm = 24 << FormShift,
NEONGetLnFrm = 25 << FormShift,
NEONSetLnFrm = 26 << FormShift,
NEONDupFrm = 27 << FormShift,
//===------------------------------------------------------------------===//
// Field shifts - such shifts are used to set fields while generating
// machine instructions.


@ -93,6 +93,10 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">;
def HasV7 : Predicate<"Subtarget->hasV7Ops()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3()">;
def HasNEON : Predicate<"Subtarget->hasNEON()">;
def IsThumb : Predicate<"Subtarget->isThumb()">;
def HasThumb2 : Predicate<"Subtarget->hasThumb2()">;
def IsARM : Predicate<"!Subtarget->isThumb()">;
@ -1437,3 +1441,9 @@ include "ARMInstrThumb2.td"
//
include "ARMInstrVFP.td"
//===----------------------------------------------------------------------===//
// Advanced SIMD (NEON) Support
//
include "ARMInstrNEON.td"

(File diff suppressed because it is too large.)


@ -77,6 +77,34 @@ def D13 : ARMReg<13, "d13", [S26, S27]>;
def D14 : ARMReg<14, "d14", [S28, S29]>;
def D15 : ARMReg<15, "d15", [S30, S31]>;
// VFP3 defines 16 additional double registers
def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">;
def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">;
def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">;
def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">;
def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">;
def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">;
def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">;
def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">;
// Advanced SIMD (NEON) defines 16 quad-word aliases
def Q0 : ARMReg< 0, "q0", [D0, D1]>;
def Q1 : ARMReg< 1, "q1", [D2, D3]>;
def Q2 : ARMReg< 2, "q2", [D4, D5]>;
def Q3 : ARMReg< 3, "q3", [D6, D7]>;
def Q4 : ARMReg< 4, "q4", [D8, D9]>;
def Q5 : ARMReg< 5, "q5", [D10, D11]>;
def Q6 : ARMReg< 6, "q6", [D12, D13]>;
def Q7 : ARMReg< 7, "q7", [D14, D15]>;
def Q8 : ARMReg< 8, "q8", [D16, D17]>;
def Q9 : ARMReg< 9, "q9", [D18, D19]>;
def Q10 : ARMReg<10, "q10", [D20, D21]>;
def Q11 : ARMReg<11, "q11", [D22, D23]>;
def Q12 : ARMReg<12, "q12", [D24, D25]>;
def Q13 : ARMReg<13, "q13", [D26, D27]>;
def Q14 : ARMReg<14, "q14", [D28, D29]>;
def Q15 : ARMReg<15, "q15", [D30, D31]>;
// Current Program Status Register.
def CPSR : ARMReg<0, "cpsr">;
@ -207,14 +235,67 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
}];
}
// Scalar single precision floating point register class.
def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
// Scalar double precision floating point / generic 64-bit vector register
// class.
// ARM requires only word alignment for doubles, although double-word
// alignment gives better performance.
def DPR : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, D8,
D9, D10, D11, D12, D13, D14, D15]>;
def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
[D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15]> {
let SubRegClassList = [SPR, SPR];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// VFP2
static const unsigned ARM_DPR_VFP2[] = {
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
// VFP3
static const unsigned ARM_DPR_VFP3[] = {
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15,
ARM::D16, ARM::D17, ARM::D18, ARM::D19,
ARM::D20, ARM::D21, ARM::D22, ARM::D23,
ARM::D24, ARM::D25, ARM::D26, ARM::D27,
ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
DPRClass::iterator
DPRClass::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
if (Subtarget.hasVFP3())
return ARM_DPR_VFP3;
return ARM_DPR_VFP2;
}
DPRClass::iterator
DPRClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
if (Subtarget.hasVFP3())
return ARM_DPR_VFP3 + (sizeof(ARM_DPR_VFP3)/sizeof(unsigned));
else
return ARM_DPR_VFP2 + (sizeof(ARM_DPR_VFP2)/sizeof(unsigned));
}
}];
}
// Generic 128-bit vector register class.
def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
let SubRegClassList = [SPR, SPR, SPR, SPR, DPR, DPR];
}
// Condition code registers.
def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
@ -224,12 +305,40 @@ def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
// sub registers for each register.
//
def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S0, S2, S4, S6, S8, S10, S12, S14,
S16, S18, S20, S22, S24, S26, S28, S30]>;
def arm_ssubreg_0 : PatLeaf<(i32 1)>;
def arm_ssubreg_1 : PatLeaf<(i32 2)>;
def arm_ssubreg_2 : PatLeaf<(i32 3)>;
def arm_ssubreg_3 : PatLeaf<(i32 4)>;
def arm_dsubreg_0 : PatLeaf<(i32 5)>;
def arm_dsubreg_1 : PatLeaf<(i32 6)>;
def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S1, S3, S5, S7, S9, S11, S13, S15,
S17, S19, S21, S23, S25, S27, S29, S31]>;
// S sub-registers of D registers.
def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S0, S2, S4, S6, S8, S10, S12, S14,
S16, S18, S20, S22, S24, S26, S28, S30]>;
def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S1, S3, S5, S7, S9, S11, S13, S15,
S17, S19, S21, S23, S25, S27, S29, S31]>;
// S sub-registers of Q registers.
def : SubRegSet<1, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S0, S4, S8, S12, S16, S20, S24, S28]>;
def : SubRegSet<2, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S1, S5, S9, S13, S17, S21, S25, S29]>;
def : SubRegSet<3, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S2, S6, S10, S14, S18, S22, S26, S30]>;
def : SubRegSet<4, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S3, S7, S11, S15, S19, S23, S27, S31]>;
// D sub-registers of Q registers.
def : SubRegSet<5, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
[D0, D2, D4, D6, D8, D10, D12, D14,
D16, D18, D20, D22, D24, D26, D28, D30]>;
def : SubRegSet<6, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
[D1, D3, D5, D7, D9, D11, D13, D15,
D17, D19, D21, D23, D25, D27, D29, D31]>;


@ -285,12 +285,22 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
const char *Modifier) {
const MachineOperand &MO = MI->getOperand(opNum);
switch (MO.getType()) {
case MachineOperand::MO_Register:
if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
else
case MachineOperand::MO_Register: {
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
if (Modifier && strcmp(Modifier, "dregpair") == 0) {
unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
O << '{'
<< TRI->getAsmName(DRegLo) << "-" << TRI->getAsmName(DRegHi)
<< '}';
} else {
O << TRI->getAsmName(Reg);
}
} else
assert(0 && "not implemented");
break;
}
case MachineOperand::MO_Immediate: {
if (!Modifier || strcmp(Modifier, "no_hash") != 0)
O << "#";


@ -552,3 +552,23 @@ __Z11no_overflowjj:
//===---------------------------------------------------------------------===//
Some of the NEON intrinsics may be appropriate for more general use, either
as target-independent intrinsics or perhaps elsewhere in the ARM backend.
Some of them may also be lowered to target-independent SDNodes, and perhaps
some new SDNodes could be added.
For example, maximum, minimum, and absolute value operations are well-defined
and standard operations, both for vector and scalar types.
The current NEON-specific intrinsics for count leading zeros and count one
bits could perhaps be replaced by the target-independent ctlz and ctpop
intrinsics. It may also make sense to add a target-independent "ctls"
intrinsic for "count leading sign bits". Likewise, the backend could use
the target-independent SDNodes for these operations.
ARMv6 has scalar saturating and halving adds and subtracts. The same
intrinsics could possibly be used for both NEON's vector implementations of
those operations and the ARMv6 scalar versions.
//===---------------------------------------------------------------------===//


@ -0,0 +1,62 @@
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic -mattr=+v6,+vfp2
@"\01LC" = external constant [15 x i8] ; <[15 x i8]*> [#uses=1]
declare i32 @printf(i8* nocapture, ...) nounwind
define i32 @main() nounwind {
entry:
br label %bb.i1.i
bb.i1.i: ; preds = %Cos.exit.i.i, %entry
br label %bb.i.i.i
bb.i.i.i: ; preds = %bb.i.i.i, %bb.i1.i
br i1 undef, label %Cos.exit.i.i, label %bb.i.i.i
Cos.exit.i.i: ; preds = %bb.i.i.i
br i1 undef, label %bb2.i.i, label %bb.i1.i
bb2.i.i: ; preds = %Cos.exit.i.i
br label %bb3.i.i
bb3.i.i: ; preds = %bb5.i.i, %bb2.i.i
br label %bb4.i.i
bb4.i.i: ; preds = %bb4.i.i, %bb3.i.i
br i1 undef, label %bb5.i.i, label %bb4.i.i
bb5.i.i: ; preds = %bb4.i.i
br i1 undef, label %bb.i, label %bb3.i.i
bb.i: ; preds = %bb.i, %bb5.i.i
br i1 undef, label %bb1.outer2.i.i.outer, label %bb.i
bb1.outer2.i.i.outer: ; preds = %Fft.exit.i, %bb5.i12.i, %bb.i
br label %bb1.outer2.i.i
bb1.outer2.i.i: ; preds = %bb2.i9.i, %bb1.outer2.i.i.outer
br label %bb1.i.i
bb1.i.i: ; preds = %bb1.i.i, %bb1.outer2.i.i
br i1 undef, label %bb2.i9.i, label %bb1.i.i
bb2.i9.i: ; preds = %bb1.i.i
br i1 undef, label %bb4.i11.i, label %bb1.outer2.i.i
bb4.i11.i: ; preds = %bb4.i11.i, %bb2.i9.i
br i1 undef, label %bb5.i12.i, label %bb4.i11.i
bb5.i12.i: ; preds = %bb4.i11.i
br i1 undef, label %bb7.i.i, label %bb1.outer2.i.i.outer
bb7.i.i: ; preds = %bb7.i.i, %bb5.i12.i
br i1 undef, label %Fft.exit.i, label %bb7.i.i
Fft.exit.i: ; preds = %bb7.i.i
br i1 undef, label %bb5.i, label %bb1.outer2.i.i.outer
bb5.i: ; preds = %Fft.exit.i
%0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([15 x i8]* @"\01LC", i32 0, i32 0), double undef, double undef) nounwind ; <i32> [#uses=0]
unreachable
}


@ -0,0 +1,7 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vadd
define <8 x i8> @t_i8x8(<8 x i8> %a, <8 x i8> %b) nounwind {
entry:
%0 = add <8 x i8> %a, %b
ret <8 x i8> %0
}


@ -0,0 +1,22 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fldd | count 4
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fstd
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd
define void @t1(<2 x i32>* %r, <4 x i16>* %a, <4 x i16>* %b) nounwind {
entry:
%0 = load <4 x i16>* %a, align 8 ; <<4 x i16>> [#uses=1]
%1 = load <4 x i16>* %b, align 8 ; <<4 x i16>> [#uses=1]
%2 = add <4 x i16> %0, %1 ; <<4 x i16>> [#uses=1]
%3 = bitcast <4 x i16> %2 to <2 x i32> ; <<2 x i32>> [#uses=1]
store <2 x i32> %3, <2 x i32>* %r, align 8
ret void
}
define <2 x i32> @t2(<4 x i16>* %a, <4 x i16>* %b) nounwind readonly {
entry:
%0 = load <4 x i16>* %a, align 8 ; <<4 x i16>> [#uses=1]
%1 = load <4 x i16>* %b, align 8 ; <<4 x i16>> [#uses=1]
%2 = sub <4 x i16> %0, %1 ; <<4 x i16>> [#uses=1]
%3 = bitcast <4 x i16> %2 to <2 x i32> ; <<2 x i32>> [#uses=1]
ret <2 x i32> %3
}


@ -0,0 +1,23 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vldmia | count 4
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vstmia | count 1
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd | count 2
define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
%1 = load <2 x i64>* %b, align 16 ; <<2 x i64>> [#uses=1]
%2 = add <2 x i64> %0, %1 ; <<2 x i64>> [#uses=1]
%3 = bitcast <2 x i64> %2 to <4 x i32> ; <<4 x i32>> [#uses=1]
store <4 x i32> %3, <4 x i32>* %r, align 16
ret void
}
define <4 x i32> @t2(<2 x i64>* %a, <2 x i64>* %b) nounwind readonly {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
%1 = load <2 x i64>* %b, align 16 ; <<2 x i64>> [#uses=1]
%2 = sub <2 x i64> %0, %1 ; <<2 x i64>> [#uses=1]
%3 = bitcast <2 x i64> %2 to <4 x i32> ; <<4 x i32>> [#uses=1]
ret <4 x i32> %3
}

test/CodeGen/ARM/vaba.ll (new file)

@ -0,0 +1,119 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaba\\.s8} %t | count 2
; RUN: grep {vaba\\.s16} %t | count 2
; RUN: grep {vaba\\.s32} %t | count 2
; RUN: grep {vaba\\.u8} %t | count 2
; RUN: grep {vaba\\.u16} %t | count 2
; RUN: grep {vaba\\.u32} %t | count 2
define <8 x i8> @vabas8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i8> %tmp4
}
define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i16> %tmp4
}
define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i32> %tmp4
}
define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i8> %tmp4
}
define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i16> %tmp4
}
define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i32> %tmp4
}
define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = call <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
ret <16 x i8> %tmp4
}
define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
ret <4 x i32> %tmp4
}
define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = call <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
ret <16 x i8> %tmp4
}
define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabaQu32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
ret <4 x i32> %tmp4
}
declare <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vabal.ll (new file)

@ -0,0 +1,63 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabal\\.s8} %t | count 1
; RUN: grep {vabal\\.s16} %t | count 1
; RUN: grep {vabal\\.s32} %t | count 1
; RUN: grep {vabal\\.u8} %t | count 1
; RUN: grep {vabal\\.u16} %t | count 1
; RUN: grep {vabal\\.u32} %t | count 1
define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vabd.ll (new file)

@ -0,0 +1,126 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabd\\.s8} %t | count 2
; RUN: grep {vabd\\.s16} %t | count 2
; RUN: grep {vabd\\.s32} %t | count 2
; RUN: grep {vabd\\.u8} %t | count 2
; RUN: grep {vabd\\.u16} %t | count 2
; RUN: grep {vabd\\.u32} %t | count 2
; RUN: grep {vabd\\.f32} %t | count 2
define <8 x i8> @vabds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vabds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vabds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vabdu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vabdu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vabdu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vabdf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vabdQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vabdQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vabdQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vabdQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vabdQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vabdl.ll (new file)

@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabdl\\.s8} %t | count 1
; RUN: grep {vabdl\\.s16} %t | count 1
; RUN: grep {vabdl\\.s32} %t | count 1
; RUN: grep {vabdl\\.u8} %t | count 1
; RUN: grep {vabdl\\.u16} %t | count 1
; RUN: grep {vabdl\\.u32} %t | count 1
define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vabs.ll (new file)

@ -0,0 +1,64 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabs\\.s8} %t | count 2
; RUN: grep {vabs\\.s16} %t | count 2
; RUN: grep {vabs\\.s32} %t | count 2
; RUN: grep {vabs\\.f32} %t | count 2
define <8 x i8> @vabss8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vabss16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vabss32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <2 x float> @vabsf32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <16 x i8> @vabsQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vabsQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vabsQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <4 x float> @vabsQf32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float>) nounwind readnone

test/CodeGen/ARM/vacge.ll Normal file
@@ -0,0 +1,19 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vacge\\.f32} %t | count 2
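; VACGE is an absolute compare: each lane of the result is all ones when
; |operand1| >= |operand2| and all zeros otherwise, which is why these
; tests return integer vectors from floating-point inputs.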
define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vacgt.ll Normal file
@@ -0,0 +1,19 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vacgt\\.f32} %t | count 2
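; VACGT is the strict form of the absolute compare: a result lane is all
; ones when |operand1| > |operand2|.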
define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vadd.ll Normal file
@@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vadd\\.i8} %t | count 2
; RUN: grep {vadd\\.i16} %t | count 2
; RUN: grep {vadd\\.i32} %t | count 2
; RUN: grep {vadd\\.i64} %t | count 2
; RUN: grep {vadd\\.f32} %t | count 2
define <8 x i8> @vaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = add <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = add <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = add <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vaddi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = add <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <2 x float> @vaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = add <2 x float> %tmp1, %tmp2
ret <2 x float> %tmp3
}
define <16 x i8> @vaddQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = add <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vaddQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = add <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = add <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = add <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <4 x float> @vaddQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = add <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}

test/CodeGen/ARM/vaddhn.ll Normal file
@@ -0,0 +1,29 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaddhn\\.i16} %t | count 1
; RUN: grep {vaddhn\\.i32} %t | count 1
; RUN: grep {vaddhn\\.i64} %t | count 1
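; VADDHN adds two quadword vectors and keeps only the high half of each
; element, narrowing to a doubleword result; e.g. for v8i16 inputs each
; i8 result lane is (a + b) >> 8.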
define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vaddl.ll Normal file
@@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaddl\\.s8} %t | count 1
; RUN: grep {vaddl\\.s16} %t | count 1
; RUN: grep {vaddl\\.s32} %t | count 1
; RUN: grep {vaddl\\.u8} %t | count 1
; RUN: grep {vaddl\\.u16} %t | count 1
; RUN: grep {vaddl\\.u32} %t | count 1
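; VADDL sign- or zero-extends both doubleword operands to twice the
; element width before adding, so <8 x i8> inputs yield an <8 x i16> sum.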
define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vaddw.ll Normal file
@@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaddw\\.s8} %t | count 1
; RUN: grep {vaddw\\.s16} %t | count 1
; RUN: grep {vaddw\\.s32} %t | count 1
; RUN: grep {vaddw\\.u8} %t | count 1
; RUN: grep {vaddw\\.u16} %t | count 1
; RUN: grep {vaddw\\.u32} %t | count 1
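; VADDW is the "wide" variant: only the second operand is extended, and it
; is added to a first operand that already has the wider element type.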
define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vand.ll Normal file
@@ -0,0 +1,59 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vand %t | count 8
; Note: function names do not include "vand" to allow simple grep for opcodes
define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = and <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = and <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = and <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = and <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = and <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = and <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = and <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = and <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

test/CodeGen/ARM/vbic.ll Normal file
@@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vbic %t | count 8
; Note: function names do not include "vbic" to allow simple grep for opcodes
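; Each function computes A & ~B; the xor-with-all-ones followed by and is
; expected to be matched by the instruction selector to a single VBIC.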
define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = and <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = and <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
%tmp4 = and <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
%tmp4 = and <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = and <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = and <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp4 = and <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
%tmp4 = and <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}

test/CodeGen/ARM/vbsl.ll Normal file
@@ -0,0 +1,91 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vbsl %t | count 8
; Note: function names do not include "vbsl" to allow simple grep for opcodes
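; Each function computes (A & B) | (~A & C), a bitwise select with A as the
; mask, which is expected to match a single VBSL.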
define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = and <8 x i8> %tmp1, %tmp2
%tmp5 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp6 = and <8 x i8> %tmp5, %tmp3
%tmp7 = or <8 x i8> %tmp4, %tmp6
ret <8 x i8> %tmp7
}
define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = and <4 x i16> %tmp1, %tmp2
%tmp5 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp6 = and <4 x i16> %tmp5, %tmp3
%tmp7 = or <4 x i16> %tmp4, %tmp6
ret <4 x i16> %tmp7
}
define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = and <2 x i32> %tmp1, %tmp2
%tmp5 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
%tmp6 = and <2 x i32> %tmp5, %tmp3
%tmp7 = or <2 x i32> %tmp4, %tmp6
ret <2 x i32> %tmp7
}
define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = load <1 x i64>* %C
%tmp4 = and <1 x i64> %tmp1, %tmp2
%tmp5 = xor <1 x i64> %tmp1, < i64 -1 >
%tmp6 = and <1 x i64> %tmp5, %tmp3
%tmp7 = or <1 x i64> %tmp4, %tmp6
ret <1 x i64> %tmp7
}
define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = and <16 x i8> %tmp1, %tmp2
%tmp5 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp6 = and <16 x i8> %tmp5, %tmp3
%tmp7 = or <16 x i8> %tmp4, %tmp6
ret <16 x i8> %tmp7
}
define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = and <8 x i16> %tmp1, %tmp2
%tmp5 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp6 = and <8 x i16> %tmp5, %tmp3
%tmp7 = or <8 x i16> %tmp4, %tmp6
ret <8 x i16> %tmp7
}
define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = and <4 x i32> %tmp1, %tmp2
%tmp5 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp6 = and <4 x i32> %tmp5, %tmp3
%tmp7 = or <4 x i32> %tmp4, %tmp6
ret <4 x i32> %tmp7
}
define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = and <2 x i64> %tmp1, %tmp2
%tmp5 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
%tmp6 = and <2 x i64> %tmp5, %tmp3
%tmp7 = or <2 x i64> %tmp4, %tmp6
ret <2 x i64> %tmp7
}

test/CodeGen/ARM/vceq.ll Normal file
@@ -0,0 +1,61 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vceq\\.i8} %t | count 2
; RUN: grep {vceq\\.i16} %t | count 2
; RUN: grep {vceq\\.i32} %t | count 2
; RUN: grep {vceq\\.f32} %t | count 2
define <8 x i8> @vceqi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp eq <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vceqi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp eq <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vceqi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp eq <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x i32> @vceqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp oeq <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vceqQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp eq <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vceqQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp eq <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vceqQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp eq <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x i32> @vceqQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = vfcmp oeq <4 x float> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

test/CodeGen/ARM/vcge.ll Normal file
@@ -0,0 +1,106 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcge\\.s8} %t | count 2
; RUN: grep {vcge\\.s16} %t | count 2
; RUN: grep {vcge\\.s32} %t | count 2
; RUN: grep {vcge\\.u8} %t | count 2
; RUN: grep {vcge\\.u16} %t | count 2
; RUN: grep {vcge\\.u32} %t | count 2
; RUN: grep {vcge\\.f32} %t | count 2
define <8 x i8> @vcges8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp sge <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcges16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp sge <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcges32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp sge <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <8 x i8> @vcgeu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp uge <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcgeu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp uge <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcgeu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp uge <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x i32> @vcgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp oge <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vcgeQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp sge <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgeQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp sge <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgeQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp sge <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <16 x i8> @vcgeQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp uge <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgeQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp uge <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgeQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp uge <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x i32> @vcgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = vfcmp oge <4 x float> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

test/CodeGen/ARM/vcgt.ll Normal file
@@ -0,0 +1,106 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcgt\\.s8} %t | count 2
; RUN: grep {vcgt\\.s16} %t | count 2
; RUN: grep {vcgt\\.s32} %t | count 2
; RUN: grep {vcgt\\.u8} %t | count 2
; RUN: grep {vcgt\\.u16} %t | count 2
; RUN: grep {vcgt\\.u32} %t | count 2
; RUN: grep {vcgt\\.f32} %t | count 2
define <8 x i8> @vcgts8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp sgt <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcgts16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp sgt <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcgts32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp sgt <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <8 x i8> @vcgtu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp ugt <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcgtu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp ugt <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcgtu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp ugt <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x i32> @vcgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ogt <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vcgtQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp sgt <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgtQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp sgt <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgtQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp sgt <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <16 x i8> @vcgtQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp ugt <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgtQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp ugt <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgtQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp ugt <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x i32> @vcgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = vfcmp ogt <4 x float> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

test/CodeGen/ARM/vcls.ll Normal file
@@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcls\\.s8} %t | count 2
; RUN: grep {vcls\\.s16} %t | count 2
; RUN: grep {vcls\\.s32} %t | count 2
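; VCLS counts, for each lane, the number of consecutive bits below the sign
; bit that match it (leading sign bits, not counting the sign bit itself).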
define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone

test/CodeGen/ARM/vclz.ll Normal file
@@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vclz\\.i8} %t | count 2
; RUN: grep {vclz\\.i16} %t | count 2
; RUN: grep {vclz\\.i32} %t | count 2
define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone

test/CodeGen/ARM/vcnt.ll Normal file
@@ -0,0 +1,17 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcnt\\.8} %t | count 2
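; VCNT is a per-byte population count and is only defined for 8-bit
; element types, so there are no i16/i32 variants here.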
define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone

test/CodeGen/ARM/vcvt.ll Normal file
@@ -0,0 +1,53 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = fptosi <2 x float> %tmp1 to <2 x i32>
ret <2 x i32> %tmp2
}
define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = fptoui <2 x float> %tmp1 to <2 x i32>
ret <2 x i32> %tmp2
}
define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = sitofp <2 x i32> %tmp1 to <2 x float>
ret <2 x float> %tmp2
}
define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = uitofp <2 x i32> %tmp1 to <2 x float>
ret <2 x float> %tmp2
}
define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = fptosi <4 x float> %tmp1 to <4 x i32>
ret <4 x i32> %tmp2
}
define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = fptoui <4 x float> %tmp1 to <4 x i32>
ret <4 x i32> %tmp2
}
define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = sitofp <4 x i32> %tmp1 to <4 x float>
ret <4 x float> %tmp2
}
define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = uitofp <4 x i32> %tmp1 to <4 x float>
ret <4 x float> %tmp2
}

@@ -0,0 +1,64 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
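; These intrinsics take the number of fraction bits as a trailing i32
; immediate (1 in these tests), selecting the fixed-point forms of VCVT.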
define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1)
ret <2 x i32> %tmp2
}
define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1)
ret <2 x i32> %tmp2
}
define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
ret <2 x float> %tmp2
}
define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
ret <2 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1)
ret <4 x i32> %tmp2
}
define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1)
ret <4 x i32> %tmp2
}
define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
ret <4 x float> %tmp2
}
define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
ret <4 x float> %tmp2
}
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone

test/CodeGen/ARM/vdup.ll Normal file
@@ -0,0 +1,134 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vdup.8 %t | count 4
; RUN: grep vdup.16 %t | count 4
; RUN: grep vdup.32 %t | count 8
define <8 x i8> @v_dup8(i8 %A) nounwind {
%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
ret <8 x i8> %tmp8
}
define <4 x i16> @v_dup16(i16 %A) nounwind {
%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_dup32(i32 %A) nounwind {
%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
ret <2 x i32> %tmp2
}
define <2 x float> @v_dupfloat(float %A) nounwind {
%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
ret <2 x float> %tmp2
}
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
ret <16 x i8> %tmp16
}
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
ret <8 x i16> %tmp8
}
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
ret <4 x i32> %tmp4
}
define <4 x float> @v_dupQfloat(float %A) nounwind {
%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
ret <4 x float> %tmp4
}
; Check to make sure it works with shuffles, too.
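; An insertelement into lane 0 followed by a shufflevector with an
; all-zeros mask is the canonical splat idiom and should also select VDUP.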
define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %tmp2
}
define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp2
}
define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
ret <2 x i32> %tmp2
}
define <2 x float> @v_shuffledupfloat(float %A) nounwind {
%tmp1 = insertelement <2 x float> undef, float %A, i32 0
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
ret <2 x float> %tmp2
}
define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp2
}
define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %tmp2
}
define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %tmp2
}
define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
%tmp1 = insertelement <4 x float> undef, float %A, i32 0
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %tmp2
}

@@ -0,0 +1,52 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vdup.8 %t | count 2
; RUN: grep vdup.16 %t | count 2
; RUN: grep vdup.32 %t | count 4
define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x i32> %tmp2
}
define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x float> %tmp2
}
define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i32> %tmp2
}
define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x float> %tmp2
}

test/CodeGen/ARM/veor.ll Normal file
@@ -0,0 +1,59 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep veor %t | count 8
; Note: function names do not include "veor" to allow simple grep for opcodes
define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

test/CodeGen/ARM/vfcmp.ll Normal file
@@ -0,0 +1,96 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vceq\\.f32} %t | count 1
; RUN: grep {vcgt\\.f32} %t | count 9
; RUN: grep {vcge\\.f32} %t | count 5
; RUN: grep vorr %t | count 4
; RUN: grep vmvn %t | count 7
; This tests vfcmp operations that do not map directly to NEON instructions.
; une is implemented with VCEQ/VMVN
define <2 x i32> @vcunef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp une <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
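; For the une function above we expect a sequence roughly like this
; (register assignments are illustrative only):
;   vceq.f32 d16, d17, d16
;   vmvn     d16, d16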
; olt is implemented with VCGT
define <2 x i32> @vcoltf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp olt <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ole is implemented with VCGE
define <2 x i32> @vcolef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ole <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; uge is implemented with VCGT/VMVN
define <2 x i32> @vcugef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp uge <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ule is implemented with VCGT/VMVN
define <2 x i32> @vculef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ule <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ugt is implemented with VCGE/VMVN
define <2 x i32> @vcugtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ugt <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ult is implemented with VCGE/VMVN
define <2 x i32> @vcultf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ult <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ueq is implemented with VCGT/VCGT/VORR/VMVN
define <2 x i32> @vcueqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ueq <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; one is implemented with VCGT/VCGT/VORR
define <2 x i32> @vconef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp one <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; uno is implemented with VCGT/VCGE/VORR/VMVN
define <2 x i32> @vcunof32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp uno <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ord is implemented with VCGT/VCGE/VORR
define <2 x i32> @vcordf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ord <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}

test/CodeGen/ARM/vget_lane.ll Normal file
@@ -0,0 +1,78 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmov\\.s8} %t | count 2
; RUN: grep {vmov\\.s16} %t | count 2
; RUN: grep {vmov\\.u8} %t | count 2
; RUN: grep {vmov\\.u16} %t | count 2
; RUN: grep {vmov\\.32} %t | count 2
define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = extractelement <8 x i8> %tmp1, i32 1
%tmp3 = sext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vget_lanes16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = extractelement <4 x i16> %tmp1, i32 1
%tmp3 = sext i16 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vget_laneu8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = extractelement <8 x i8> %tmp1, i32 1
%tmp3 = zext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vget_laneu16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = extractelement <4 x i16> %tmp1, i32 1
%tmp3 = zext i16 %tmp2 to i32
ret i32 %tmp3
}
; Do a vector add to keep the extraction from being done directly from memory.
define i32 @vget_lanei32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = add <2 x i32> %tmp1, %tmp1
%tmp3 = extractelement <2 x i32> %tmp2, i32 1
ret i32 %tmp3
}
define i32 @vgetQ_lanes8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = extractelement <16 x i8> %tmp1, i32 1
%tmp3 = sext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vgetQ_lanes16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = extractelement <8 x i16> %tmp1, i32 1
%tmp3 = sext i16 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vgetQ_laneu8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = extractelement <16 x i8> %tmp1, i32 1
%tmp3 = zext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vgetQ_laneu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = extractelement <8 x i16> %tmp1, i32 1
%tmp3 = zext i16 %tmp2 to i32
ret i32 %tmp3
}
; Do a vector add to keep the extraction from being done directly from memory.
define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = add <4 x i32> %tmp1, %tmp1
%tmp3 = extractelement <4 x i32> %tmp2, i32 1
ret i32 %tmp3
}

test/CodeGen/ARM/vhadd.ll Normal file
@@ -0,0 +1,107 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vhadd\\.s8} %t | count 2
; RUN: grep {vhadd\\.s16} %t | count 2
; RUN: grep {vhadd\\.s32} %t | count 2
; RUN: grep {vhadd\\.u8} %t | count 2
; RUN: grep {vhadd\\.u16} %t | count 2
; RUN: grep {vhadd\\.u32} %t | count 2
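; VHADD is a halving add: each result lane is (a + b) >> 1, with the sum
; computed at full precision so the carry bit is not lost, and the shift
; signed or unsigned to match the element type.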
define <8 x i8> @vhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <16 x i8> @vhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vhsub.ll Normal file
@@ -0,0 +1,107 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vhsub\\.s8} %t | count 2
; RUN: grep {vhsub\\.s16} %t | count 2
; RUN: grep {vhsub\\.s32} %t | count 2
; RUN: grep {vhsub\\.u8} %t | count 2
; RUN: grep {vhsub\\.u16} %t | count 2
; RUN: grep {vhsub\\.u32} %t | count 2
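; VHSUB is the matching halving subtract: each result lane is (a - b) >> 1,
; with the difference computed at full precision before the shift.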
define <8 x i8> @vhsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vhsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <16 x i8> @vhsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vhsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vicmp.ll Normal file
@@ -0,0 +1,85 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vceq\\.i8} %t | count 2
; RUN: grep {vceq\\.i16} %t | count 2
; RUN: grep {vceq\\.i32} %t | count 2
; RUN: grep vmvn %t | count 6
; RUN: grep {vcgt\\.s8} %t | count 1
; RUN: grep {vcge\\.s16} %t | count 1
; RUN: grep {vcgt\\.u16} %t | count 1
; RUN: grep {vcge\\.u32} %t | count 1
; This tests vicmp operations that do not map directly to NEON instructions.
; Not-equal (ne) operations are implemented by VCEQ/VMVN. Less-than (lt/ult)
; and less-than-or-equal (le/ule) are implemented by swapping the arguments
; to VCGT and VCGE. Test all the operand types for not-equal but only sample
; the other operations.
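; For example, "vicmp ne" on <8 x i8> should lower to a sequence like the
; following (registers illustrative only):
;   vceq.i8 d16, d17, d16
;   vmvn    d16, d16
; and "vicmp slt %a, %b" becomes vcgt.s8 with %b and %a swapped.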
define <8 x i8> @vcnei8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp ne <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcnei16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp ne <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcnei32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp ne <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vcneQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp ne <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcneQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp ne <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcneQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp ne <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <16 x i8> @vcltQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp slt <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <4 x i16> @vcles16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp sle <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <4 x i16> @vcltu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp ult <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <4 x i32> @vcleQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp ule <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

test/CodeGen/ARM/vmax.ll Normal file
@@ -0,0 +1,126 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmax\\.s8} %t | count 2
; RUN: grep {vmax\\.s16} %t | count 2
; RUN: grep {vmax\\.s32} %t | count 2
; RUN: grep {vmax\\.u8} %t | count 2
; RUN: grep {vmax\\.u16} %t | count 2
; RUN: grep {vmax\\.u32} %t | count 2
; RUN: grep {vmax\\.f32} %t | count 2
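; There is no max operation in IR, so both the integer and floating-point
; variants go through target intrinsics.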
define <8 x i8> @vmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vmaxQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vmaxQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmaxQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vmaxQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vmaxQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmaxQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vmaxQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vmin.ll (new file)
@ -0,0 +1,126 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmin\\.s8} %t | count 2
; RUN: grep {vmin\\.s16} %t | count 2
; RUN: grep {vmin\\.s32} %t | count 2
; RUN: grep {vmin\\.u8} %t | count 2
; RUN: grep {vmin\\.u16} %t | count 2
; RUN: grep {vmin\\.u32} %t | count 2
; RUN: grep {vmin\\.f32} %t | count 2
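; As with vmax, IR has no min operation, so all variants use intrinsics.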
define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vminQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vminQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vminQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vminQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vminQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vmla.ll (new file)
@ -0,0 +1,77 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmla\\.i8} %t | count 2
; RUN: grep {vmla\\.i16} %t | count 2
; RUN: grep {vmla\\.i32} %t | count 2
; RUN: grep {vmla\\.f32} %t | count 2
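; Multiply-accumulate: each mul feeding an add below should be folded into a
; single vmla, as checked by the grep counts.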
define <8 x i8> @vmlai8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = mul <8 x i8> %tmp2, %tmp3
%tmp5 = add <8 x i8> %tmp1, %tmp4
ret <8 x i8> %tmp5
}
define <4 x i16> @vmlai16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = mul <4 x i16> %tmp2, %tmp3
%tmp5 = add <4 x i16> %tmp1, %tmp4
ret <4 x i16> %tmp5
}
define <2 x i32> @vmlai32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = mul <2 x i32> %tmp2, %tmp3
%tmp5 = add <2 x i32> %tmp1, %tmp4
ret <2 x i32> %tmp5
}
define <2 x float> @vmlaf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = load <2 x float>* %C
%tmp4 = mul <2 x float> %tmp2, %tmp3
%tmp5 = add <2 x float> %tmp1, %tmp4
ret <2 x float> %tmp5
}
define <16 x i8> @vmlaQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = mul <16 x i8> %tmp2, %tmp3
%tmp5 = add <16 x i8> %tmp1, %tmp4
ret <16 x i8> %tmp5
}
define <8 x i16> @vmlaQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = mul <8 x i16> %tmp2, %tmp3
%tmp5 = add <8 x i16> %tmp1, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vmlaQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = mul <4 x i32> %tmp2, %tmp3
%tmp5 = add <4 x i32> %tmp1, %tmp4
ret <4 x i32> %tmp5
}
define <4 x float> @vmlaQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = load <4 x float>* %C
%tmp4 = mul <4 x float> %tmp2, %tmp3
%tmp5 = add <4 x float> %tmp1, %tmp4
ret <4 x float> %tmp5
}

test/CodeGen/ARM/vmlal.ll (new file)
@ -0,0 +1,63 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmlal\\.s8} %t | count 1
; RUN: grep {vmlal\\.s16} %t | count 1
; RUN: grep {vmlal\\.s32} %t | count 1
; RUN: grep {vmlal\\.u8} %t | count 1
; RUN: grep {vmlal\\.u16} %t | count 1
; RUN: grep {vmlal\\.u32} %t | count 1
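; vmlal multiplies the narrow source vectors to double-width elements and
; accumulates the products into the wide destination operand.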
define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vmls.ll (new file)
@ -0,0 +1,77 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmls\\.i8} %t | count 2
; RUN: grep {vmls\\.i16} %t | count 2
; RUN: grep {vmls\\.i32} %t | count 2
; RUN: grep {vmls\\.f32} %t | count 2
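; Same pattern as vmla but with the product subtracted: a mul feeding a sub
; should fold into a single vmls.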
define <8 x i8> @vmlsi8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = mul <8 x i8> %tmp2, %tmp3
%tmp5 = sub <8 x i8> %tmp1, %tmp4
ret <8 x i8> %tmp5
}
define <4 x i16> @vmlsi16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = mul <4 x i16> %tmp2, %tmp3
%tmp5 = sub <4 x i16> %tmp1, %tmp4
ret <4 x i16> %tmp5
}
define <2 x i32> @vmlsi32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = mul <2 x i32> %tmp2, %tmp3
%tmp5 = sub <2 x i32> %tmp1, %tmp4
ret <2 x i32> %tmp5
}
define <2 x float> @vmlsf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = load <2 x float>* %C
%tmp4 = mul <2 x float> %tmp2, %tmp3
%tmp5 = sub <2 x float> %tmp1, %tmp4
ret <2 x float> %tmp5
}
define <16 x i8> @vmlsQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = mul <16 x i8> %tmp2, %tmp3
%tmp5 = sub <16 x i8> %tmp1, %tmp4
ret <16 x i8> %tmp5
}
define <8 x i16> @vmlsQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = mul <8 x i16> %tmp2, %tmp3
%tmp5 = sub <8 x i16> %tmp1, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vmlsQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = mul <4 x i32> %tmp2, %tmp3
%tmp5 = sub <4 x i32> %tmp1, %tmp4
ret <4 x i32> %tmp5
}
define <4 x float> @vmlsQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = load <4 x float>* %C
%tmp4 = mul <4 x float> %tmp2, %tmp3
%tmp5 = sub <4 x float> %tmp1, %tmp4
ret <4 x float> %tmp5
}

test/CodeGen/ARM/vmlsl.ll (new file)
@ -0,0 +1,63 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmlsl\\.s8} %t | count 1
; RUN: grep {vmlsl\\.s16} %t | count 1
; RUN: grep {vmlsl\\.s32} %t | count 1
; RUN: grep {vmlsl\\.u8} %t | count 1
; RUN: grep {vmlsl\\.u16} %t | count 1
; RUN: grep {vmlsl\\.u32} %t | count 1
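; Widening multiply-subtract: the double-width products are subtracted from
; the wide accumulator operand.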
define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vmov.ll (new file)
@ -0,0 +1,101 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vmov.i8 %t | count 2
; RUN: grep vmov.i16 %t | count 4
; RUN: grep vmov.i32 %t | count 12
; RUN: grep vmov.i64 %t | count 2
; Note: function names do not include "vmov" to allow simple grep for opcodes
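; The constants below exercise the NEON modified-immediate encodings: an
; 8-bit value placed in any byte lane (i8/i16/i32), the 0x__FF and 0x__FFFF
; forms (i32 only), and i64 immediates whose bytes are each 0x00 or 0xFF.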
define <8 x i8> @v_movi8() nounwind {
ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <4 x i16> @v_movi16a() nounwind {
ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 >
}
; 0x1000 = 4096
define <4 x i16> @v_movi16b() nounwind {
ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <2 x i32> @v_movi32a() nounwind {
ret <2 x i32> < i32 32, i32 32 >
}
; 0x2000 = 8192
define <2 x i32> @v_movi32b() nounwind {
ret <2 x i32> < i32 8192, i32 8192 >
}
; 0x200000 = 2097152
define <2 x i32> @v_movi32c() nounwind {
ret <2 x i32> < i32 2097152, i32 2097152 >
}
; 0x20000000 = 536870912
define <2 x i32> @v_movi32d() nounwind {
ret <2 x i32> < i32 536870912, i32 536870912 >
}
; 0x20ff = 8447
define <2 x i32> @v_movi32e() nounwind {
ret <2 x i32> < i32 8447, i32 8447 >
}
; 0x20ffff = 2162687
define <2 x i32> @v_movi32f() nounwind {
ret <2 x i32> < i32 2162687, i32 2162687 >
}
; 0xff0000ff0000ffff = 18374687574888349695
define <1 x i64> @v_movi64() nounwind {
ret <1 x i64> < i64 18374687574888349695 >
}
define <16 x i8> @v_movQi8() nounwind {
ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <8 x i16> @v_movQi16a() nounwind {
ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
}
; 0x1000 = 4096
define <8 x i16> @v_movQi16b() nounwind {
ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <4 x i32> @v_movQi32a() nounwind {
ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 >
}
; 0x2000 = 8192
define <4 x i32> @v_movQi32b() nounwind {
ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 >
}
; 0x200000 = 2097152
define <4 x i32> @v_movQi32c() nounwind {
ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 >
}
; 0x20000000 = 536870912
define <4 x i32> @v_movQi32d() nounwind {
ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 >
}
; 0x20ff = 8447
define <4 x i32> @v_movQi32e() nounwind {
ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 >
}
; 0x20ffff = 2162687
define <4 x i32> @v_movQi32f() nounwind {
ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 >
}
; 0xff0000ff0000ffff = 18374687574888349695
define <2 x i64> @v_movQi64() nounwind {
ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
}

test/CodeGen/ARM/vmovl.ll (new file)
@ -0,0 +1,51 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmovl\\.s8} %t | count 1
; RUN: grep {vmovl\\.s16} %t | count 1
; RUN: grep {vmovl\\.s32} %t | count 1
; RUN: grep {vmovl\\.u8} %t | count 1
; RUN: grep {vmovl\\.u16} %t | count 1
; RUN: grep {vmovl\\.u32} %t | count 1
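; vmovl widens each element to double width, sign-extending for the .s
; variants and zero-extending for the .u variants.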
define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1)
ret <2 x i64> %tmp2
}
define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1)
ret <2 x i64> %tmp2
}
declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone

test/CodeGen/ARM/vmovn.ll (new file)
@ -0,0 +1,26 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmovn\\.i16} %t | count 1
; RUN: grep {vmovn\\.i32} %t | count 1
; RUN: grep {vmovn\\.i64} %t | count 1
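; vmovn narrows each element to half width, keeping the least significant half.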
define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vmovni32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vmovni64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64>) nounwind readnone

test/CodeGen/ARM/vmul.ll (new file)
@ -0,0 +1,79 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmul\\.i8} %t | count 2
; RUN: grep {vmul\\.i16} %t | count 2
; RUN: grep {vmul\\.i32} %t | count 2
; RUN: grep {vmul\\.f32} %t | count 2
; RUN: grep {vmul\\.p8} %t | count 2
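; Plain mul covers the integer and float cases; the .p8 polynomial
; (carry-less) multiply has no IR equivalent and goes through an intrinsic.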
define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = mul <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vmuli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = mul <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vmuli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = mul <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x float> @vmulf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = mul <2 x float> %tmp1, %tmp2
ret <2 x float> %tmp3
}
define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <16 x i8> @vmulQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = mul <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vmulQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = mul <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vmulQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = mul <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x float> @vmulQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = mul <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}
define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

test/CodeGen/ARM/vmull.ll (new file)
@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmull\\.s8} %t | count 1
; RUN: grep {vmull\\.s16} %t | count 1
; RUN: grep {vmull\\.s32} %t | count 1
; RUN: grep {vmull\\.u8} %t | count 1
; RUN: grep {vmull\\.u16} %t | count 1
; RUN: grep {vmull\\.u32} %t | count 1
; RUN: grep {vmull\\.p8} %t | count 1
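; Widening multiply: double-width products of the narrow operands; the .p8
; form is a polynomial multiply.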
define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone

test/CodeGen/ARM/vmvn.ll (new file)
@ -0,0 +1,51 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vmvn %t | count 8
; Note: function names do not include "vmvn" to allow simple grep for opcodes
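; XOR with an all-ones vector is the canonical IR spelling of NOT and should
; select to a single vmvn.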
define <8 x i8> @v_mvni8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @v_mvni16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @v_mvni32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
ret <2 x i32> %tmp2
}
define <1 x i64> @v_mvni64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = xor <1 x i64> %tmp1, < i64 -1 >
ret <1 x i64> %tmp2
}
define <16 x i8> @v_mvnQi8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @v_mvnQi16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @v_mvnQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
ret <4 x i32> %tmp2
}
define <2 x i64> @v_mvnQi64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
ret <2 x i64> %tmp2
}

test/CodeGen/ARM/vneg.ll (new file)
@ -0,0 +1,53 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vneg\\.s8} %t | count 2
; RUN: grep {vneg\\.s16} %t | count 2
; RUN: grep {vneg\\.s32} %t | count 2
; RUN: grep {vneg\\.f32} %t | count 2
define <8 x i8> @vnegs8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = sub <8 x i8> zeroinitializer, %tmp1
ret <8 x i8> %tmp2
}
define <4 x i16> @vnegs16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = sub <4 x i16> zeroinitializer, %tmp1
ret <4 x i16> %tmp2
}
define <2 x i32> @vnegs32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = sub <2 x i32> zeroinitializer, %tmp1
ret <2 x i32> %tmp2
}
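; Floating-point negation is written as subtraction from -0.0, the IR idiom
; that preserves the sign of zero and can map directly to vneg.f32.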
define <2 x float> @vnegf32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = sub <2 x float> < float -0.000000e+00, float -0.000000e+00 >, %tmp1
ret <2 x float> %tmp2
}
define <16 x i8> @vnegQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = sub <16 x i8> zeroinitializer, %tmp1
ret <16 x i8> %tmp2
}
define <8 x i16> @vnegQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = sub <8 x i16> zeroinitializer, %tmp1
ret <8 x i16> %tmp2
}
define <4 x i32> @vnegQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = sub <4 x i32> zeroinitializer, %tmp1
ret <4 x i32> %tmp2
}
define <4 x float> @vnegQf32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = sub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %tmp1
ret <4 x float> %tmp2
}

test/CodeGen/ARM/vorn.ll (new file)
@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vorn %t | count 8
; Note: function names do not include "vorn" to allow simple grep for opcodes
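; OR with the complement of the second operand should fold the vmvn into a
; single vorn.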
define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = or <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = or <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
%tmp4 = or <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
%tmp4 = or <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = or <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = or <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp4 = or <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
%tmp4 = or <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}

test/CodeGen/ARM/vorr.ll (new file)
@ -0,0 +1,59 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vorr %t | count 8
; Note: function names do not include "vorr" to allow simple grep for opcodes
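; Plain IR 'or' should select directly to vorr.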
define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = or <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = or <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = or <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = or <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = or <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = or <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = or <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = or <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

test/CodeGen/ARM/vpadal.ll (new file)
@ -0,0 +1,107 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vpadal\\.s8} %t | count 2
; RUN: grep {vpadal\\.s16} %t | count 2
; RUN: grep {vpadal\\.s32} %t | count 2
; RUN: grep {vpadal\\.u8} %t | count 2
; RUN: grep {vpadal\\.u16} %t | count 2
; RUN: grep {vpadal\\.u32} %t | count 2
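; vpadal pairwise-adds adjacent elements of the second operand and
; accumulates the double-width sums into the first, which is why these
; intrinsics are overloaded on two different vector types.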
define <4 x i16> @vpadals8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %tmp1, <8 x i8> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpadals16(<2 x i32>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %tmp1, <4 x i16> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vpadals32(<1 x i64>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %tmp1, <2 x i32> %tmp2)
ret <1 x i64> %tmp3
}
define <4 x i16> @vpadalu8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %tmp1, <8 x i8> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpadalu16(<2 x i32>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %tmp1, <4 x i16> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vpadalu32(<1 x i64>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %tmp1, <2 x i32> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i16> @vpadalQs8(<8 x i16>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %tmp1, <16 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vpadalQs16(<4 x i32>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %tmp1, <8 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vpadalQs32(<2 x i64>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %tmp1, <4 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vpadalQu8(<8 x i16>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %tmp1, <16 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vpadalQu16(<4 x i32>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %tmp1, <8 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vpadalQu32(<2 x i64>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %tmp1, <4 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16>, <8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64>, <2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16>, <8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16>, <16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32>, <8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64>, <4 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16>, <16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32>, <8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vpadd.ll (new file)
@ -0,0 +1,39 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vpadd\\.i8} %t | count 1
; RUN: grep {vpadd\\.i16} %t | count 1
; RUN: grep {vpadd\\.i32} %t | count 1
; RUN: grep {vpadd\\.f32} %t | count 1
define <8 x i8> @vpaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpaddi.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpaddi.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpaddi.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vpaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vpaddf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vpaddi.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpaddi.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddi.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vpaddf.v2f32(<2 x float>, <2 x float>) nounwind readnone

test/CodeGen/ARM/vpaddl.ll (new file)
@ -0,0 +1,95 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vpaddl\\.s8} %t | count 2
; RUN: grep {vpaddl\\.s16} %t | count 2
; RUN: grep {vpaddl\\.s32} %t | count 2
; RUN: grep {vpaddl\\.u8} %t | count 2
; RUN: grep {vpaddl\\.u16} %t | count 2
; RUN: grep {vpaddl\\.u32} %t | count 2
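; vpaddl pairwise-adds adjacent elements and yields a vector with half as
; many elements of twice the width.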
define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp2
}
define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp2
}
define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp2
}
define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp2
}
define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp2
}
define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp2
}
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

test/CodeGen/ARM/vpmax.ll (new file)
@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vpmax\\.s8} %t | count 1
; RUN: grep {vpmax\\.s16} %t | count 1
; RUN: grep {vpmax\\.s32} %t | count 1
; RUN: grep {vpmax\\.u8} %t | count 1
; RUN: grep {vpmax\\.u16} %t | count 1
; RUN: grep {vpmax\\.u32} %t | count 1
; RUN: grep {vpmax\\.f32} %t | count 1
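; vpmax reduces adjacent pairs from both source vectors to their maxima;
; only the 64-bit (D register) forms exist.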
define <8 x i8> @vpmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vpmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone

test/CodeGen/ARM/vpmin.ll (new file)
@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vpmin\\.s8} %t | count 1
; RUN: grep {vpmin\\.s16} %t | count 1
; RUN: grep {vpmin\\.s32} %t | count 1
; RUN: grep {vpmin\\.u8} %t | count 1
; RUN: grep {vpmin\\.u16} %t | count 1
; RUN: grep {vpmin\\.u32} %t | count 1
; RUN: grep {vpmin\\.f32} %t | count 1
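; Pairwise minimum; like vpmax, available only on 64-bit vectors.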
define <8 x i8> @vpmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vpminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vpminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float>, <2 x float>) nounwind readnone

test/CodeGen/ARM/vqabs.ll (new file)
@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqabs\\.s8} %t | count 2
; RUN: grep {vqabs\\.s16} %t | count 2
; RUN: grep {vqabs\\.s32} %t | count 2
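; Saturating absolute value: the most negative value saturates to the most
; positive instead of wrapping.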
define <8 x i8> @vqabss8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqabss16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqabss32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vqabsQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqabsQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqabsQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) nounwind readnone

test/CodeGen/ARM/vqadd.ll (new file)
@ -0,0 +1,141 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqadd\\.s8} %t | count 2
; RUN: grep {vqadd\\.s16} %t | count 2
; RUN: grep {vqadd\\.s32} %t | count 2
; RUN: grep {vqadd\\.s64} %t | count 2
; RUN: grep {vqadd\\.u8} %t | count 2
; RUN: grep {vqadd\\.u16} %t | count 2
; RUN: grep {vqadd\\.u32} %t | count 2
; RUN: grep {vqadd\\.u64} %t | count 2
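; Saturating addition: results clamp to the signed or unsigned range of the
; element type instead of wrapping.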
define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@@ -0,0 +1,22 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqdmlal\\.s16} %t | count 1
; RUN: grep {vqdmlal\\.s32} %t | count 1
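; Signed saturating doubling multiply-accumulate long: i16 lanes widen to i32,
; i32 lanes widen to i64.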
define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@@ -0,0 +1,22 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqdmlsl\\.s16} %t | count 1
; RUN: grep {vqdmlsl\\.s32} %t | count 1
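; Signed saturating doubling multiply-subtract long.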
define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@@ -0,0 +1,73 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqdmulh\\.s16} %t | count 2
; RUN: grep {vqdmulh\\.s32} %t | count 2
; RUN: grep {vqrdmulh\\.s16} %t | count 2
; RUN: grep {vqrdmulh\\.s32} %t | count 2
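; Signed saturating doubling multiply, returning the high half of the result;
; vqrdmulh is the rounding variant.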
define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

@@ -0,0 +1,20 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqdmull\\.s16} %t | count 1
; RUN: grep {vqdmull\\.s32} %t | count 1
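; Signed saturating doubling multiply long: result elements are twice the
; width of the operands.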
define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

@@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqmovn\\.s16} %t | count 1
; RUN: grep {vqmovn\\.s32} %t | count 1
; RUN: grep {vqmovn\\.s64} %t | count 1
; RUN: grep {vqmovn\\.u16} %t | count 1
; RUN: grep {vqmovn\\.u32} %t | count 1
; RUN: grep {vqmovn\\.u64} %t | count 1
; RUN: grep {vqmovun\\.s16} %t | count 1
; RUN: grep {vqmovun\\.s32} %t | count 1
; RUN: grep {vqmovun\\.s64} %t | count 1
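; Saturating narrowing moves: signed (vqmovns), unsigned (vqmovnu), and
; signed-to-unsigned (vqmovnsu, which selects the vqmovun instruction).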
define <8 x i8> @vqmovns16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovns32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovns64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqmovnu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovnu32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovnu64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqmovuns16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovuns32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovuns64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) nounwind readnone

test/CodeGen/ARM/vqneg.ll (new file, 48 lines)

@@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqneg\\.s8} %t | count 2
; RUN: grep {vqneg\\.s16} %t | count 2
; RUN: grep {vqneg\\.s32} %t | count 2
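; Signed saturating negate on 64-bit and 128-bit vectors.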
define <8 x i8> @vqnegs8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqnegs16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqnegs32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vqnegQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqnegQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqnegQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) nounwind readnone

test/CodeGen/ARM/vqrshl.ll (new file, 141 lines)

@@ -0,0 +1,141 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqrshl\\.s8} %t | count 2
; RUN: grep {vqrshl\\.s16} %t | count 2
; RUN: grep {vqrshl\\.s32} %t | count 2
; RUN: grep {vqrshl\\.s64} %t | count 2
; RUN: grep {vqrshl\\.u8} %t | count 2
; RUN: grep {vqrshl\\.u16} %t | count 2
; RUN: grep {vqrshl\\.u32} %t | count 2
; RUN: grep {vqrshl\\.u64} %t | count 2
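; Saturating rounding shift left, with per-element shift amounts taken from
; the second vector operand.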
define <8 x i8> @vqrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vqrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vqrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vqrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqrshrn\\.s16} %t | count 1
; RUN: grep {vqrshrn\\.s32} %t | count 1
; RUN: grep {vqrshrn\\.s64} %t | count 1
; RUN: grep {vqrshrn\\.u16} %t | count 1
; RUN: grep {vqrshrn\\.u32} %t | count 1
; RUN: grep {vqrshrn\\.u64} %t | count 1
; RUN: grep {vqrshrun\\.s16} %t | count 1
; RUN: grep {vqrshrun\\.s32} %t | count 1
; RUN: grep {vqrshrun\\.s64} %t | count 1
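; Saturating rounding shift right and narrow; the negative constant vectors
; encode the right-shift amount.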
define <8 x i8> @vqrshrns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshrns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshrns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqrshrnu8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshrnu16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshrnu32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqrshruns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshruns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshruns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vqshl.ll (new file, 307 lines)

@@ -0,0 +1,307 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqshl\\.s8} %t | count 4
; RUN: grep {vqshl\\.s16} %t | count 4
; RUN: grep {vqshl\\.s32} %t | count 4
; RUN: grep {vqshl\\.s64} %t | count 4
; RUN: grep {vqshl\\.u8} %t | count 4
; RUN: grep {vqshl\\.u16} %t | count 4
; RUN: grep {vqshl\\.u32} %t | count 4
; RUN: grep {vqshl\\.u64} %t | count 4
; RUN: grep {vqshl\\.s8.*#7} %t | count 2
; RUN: grep {vqshl\\.s16.*#15} %t | count 2
; RUN: grep {vqshl\\.s32.*#31} %t | count 2
; RUN: grep {vqshl\\.s64.*#63} %t | count 2
; RUN: grep {vqshl\\.u8.*#7} %t | count 2
; RUN: grep {vqshl\\.u16.*#15} %t | count 2
; RUN: grep {vqshl\\.u32.*#31} %t | count 2
; RUN: grep {vqshl\\.u64.*#63} %t | count 2
; RUN: grep {vqshlu\\.s8} %t | count 2
; RUN: grep {vqshlu\\.s16} %t | count 2
; RUN: grep {vqshlu\\.s32} %t | count 2
; RUN: grep {vqshlu\\.s64} %t | count 2
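; Saturating shift left, with shift amounts taken from a second vector
; operand; constant-shift and signed-to-unsigned cases are tested below
; (see the comment before the _n functions).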
define <8 x i8> @vqshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vqshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vqshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vqshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
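; The constant shift vectors below select the immediate encodings:
; vqshl #n for vqshifts/vqshiftu, and vqshlu #n for the signed-to-unsigned
; vqshiftsu intrinsics.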
define <8 x i8> @vqshls_n8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshls_n16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshls_n32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vqshls_n64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vqshlu_n8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshlu_n16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshlu_n32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vqshlu_n64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vqshlsu_n8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshlsu_n16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshlsu_n32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vqshlsu_n64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vqshlQs_n8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqshlQs_n16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqshlQs_n32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vqshlQs_n64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vqshlQu_n8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqshlQu_n16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqshlQu_n32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vqshlQu_n64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vqshlQsu_n8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqshlQsu_n16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqshlQsu_n32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vqshlQsu_n64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqshrn\\.s16} %t | count 1
; RUN: grep {vqshrn\\.s32} %t | count 1
; RUN: grep {vqshrn\\.s64} %t | count 1
; RUN: grep {vqshrn\\.u16} %t | count 1
; RUN: grep {vqshrn\\.u32} %t | count 1
; RUN: grep {vqshrn\\.u64} %t | count 1
; RUN: grep {vqshrun\\.s16} %t | count 1
; RUN: grep {vqshrun\\.s32} %t | count 1
; RUN: grep {vqshrun\\.s64} %t | count 1
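; Saturating shift right and narrow (the non-rounding counterpart of vqrshrn),
; again with negative constants encoding the right-shift amount.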
define <8 x i8> @vqshrns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshrns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshrns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqshrnu8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshrnu16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshrnu32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqshruns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshruns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshruns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vqsub.ll (new file, 141 lines)

@@ -0,0 +1,141 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqsub\\.s8} %t | count 2
; RUN: grep {vqsub\\.s16} %t | count 2
; RUN: grep {vqsub\\.s32} %t | count 2
; RUN: grep {vqsub\\.s64} %t | count 2
; RUN: grep {vqsub\\.u8} %t | count 2
; RUN: grep {vqsub\\.u16} %t | count 2
; RUN: grep {vqsub\\.u32} %t | count 2
; RUN: grep {vqsub\\.u64} %t | count 2
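; Saturating integer subtract, signed and unsigned.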
define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@@ -0,0 +1,29 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vraddhn\\.i16} %t | count 1
; RUN: grep {vraddhn\\.i32} %t | count 1
; RUN: grep {vraddhn\\.i64} %t | count 1
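; Rounding add and narrow, returning the high half of each sum.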
define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vraddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vraddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

@@ -0,0 +1,33 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrecpe\\.u32} %t | count 2
; RUN: grep {vrecpe\\.f32} %t | count 2
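; Reciprocal estimate, in both integer (u32) and floating-point (f32) forms.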
define <2 x i32> @vrecpei32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <4 x i32> @vrecpeQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x float> @vrecpef32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float>) nounwind readnone

@@ -0,0 +1,19 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrecps\\.f32} %t | count 2
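; Reciprocal step, used together with vrecpe for Newton-Raphson refinement.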
define <2 x float> @vrecpsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <4 x float> @vrecpsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vrhadd.ll (new file, 107 lines)

@@ -0,0 +1,107 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrhadd\\.s8} %t | count 2
; RUN: grep {vrhadd\\.s16} %t | count 2
; RUN: grep {vrhadd\\.s32} %t | count 2
; RUN: grep {vrhadd\\.u8} %t | count 2
; RUN: grep {vrhadd\\.u16} %t | count 2
; RUN: grep {vrhadd\\.u32} %t | count 2
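; Rounding halving add, signed and unsigned.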
define <8 x i8> @vrhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vrhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <16 x i8> @vrhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vrhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vrshl.ll (new file, 245 lines)

@@ -0,0 +1,245 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrshl\\.s8} %t | count 2
; RUN: grep {vrshl\\.s16} %t | count 2
; RUN: grep {vrshl\\.s32} %t | count 2
; RUN: grep {vrshl\\.s64} %t | count 2
; RUN: grep {vrshl\\.u8} %t | count 2
; RUN: grep {vrshl\\.u16} %t | count 2
; RUN: grep {vrshl\\.u32} %t | count 2
; RUN: grep {vrshl\\.u64} %t | count 2
; RUN: grep {vrshr\\.s8} %t | count 2
; RUN: grep {vrshr\\.s16} %t | count 2
; RUN: grep {vrshr\\.s32} %t | count 2
; RUN: grep {vrshr\\.s64} %t | count 2
; RUN: grep {vrshr\\.u8} %t | count 2
; RUN: grep {vrshr\\.u16} %t | count 2
; RUN: grep {vrshr\\.u32} %t | count 2
; RUN: grep {vrshr\\.u64} %t | count 2
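; Rounding shift left by per-element shift amounts, signed and unsigned;
; the constant-shift cases follow the register forms below.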
define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
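; Negative constant shift counts below select the vrshr immediate forms.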
define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone


@ -0,0 +1,26 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrshrn\\.i16} %t | count 1
; RUN: grep {vrshrn\\.i32} %t | count 1
; RUN: grep {vrshrn\\.i64} %t | count 1
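; The shift intrinsics take the shift amount as a vector constant; a right
; shift by N is written as a shift by -N, so vrshiftn with all-(-8) amounts
; is expected to select vrshrn.i16, and likewise for the wider types.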
define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshrns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshrns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone


@ -0,0 +1,33 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrsqrte\\.u32} %t | count 2
; RUN: grep {vrsqrte\\.f32} %t | count 2
define <2 x i32> @vrsqrtei32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <4 x i32> @vrsqrteQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float>) nounwind readnone


@ -0,0 +1,19 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrsqrts\\.f32} %t | count 2
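; VRSQRTS computes (3 - a*b)/2, the Newton-Raphson step used to refine a
; reciprocal square root estimate produced by VRSQRTE.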
define <2 x float> @vrsqrtsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <4 x float> @vrsqrtsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone


@ -0,0 +1,29 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrsubhn\\.i16} %t | count 1
; RUN: grep {vrsubhn\\.i32} %t | count 1
; RUN: grep {vrsubhn\\.i64} %t | count 1
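; vrsubhn is the rounding variant of vsubhn: subtract, round, and narrow
; each result to the high half of the element.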
define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone


@ -0,0 +1,40 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmov\\.8} %t | count 2
; RUN: grep {vmov\\.16} %t | count 2
; RUN: grep {vmov\\.32} %t | count 2
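; insertelement with a constant lane index is expected to select a single
; vmov lane move of the matching element size.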
define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1
ret <8 x i8> %tmp2
}
define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1
ret <4 x i16> %tmp2
}
define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1
ret <2 x i32> %tmp2
}
define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1
ret <16 x i8> %tmp2
}
define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1
ret <8 x i16> %tmp2
}
define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
ret <4 x i32> %tmp2
}

test/CodeGen/ARM/vshift.ll

@ -0,0 +1,337 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vshl\\.s8} %t | count 2
; RUN: grep {vshl\\.s16} %t | count 2
; RUN: grep {vshl\\.s32} %t | count 2
; RUN: grep {vshl\\.s64} %t | count 2
; RUN: grep {vshl\\.u8} %t | count 4
; RUN: grep {vshl\\.u16} %t | count 4
; RUN: grep {vshl\\.u32} %t | count 4
; RUN: grep {vshl\\.u64} %t | count 4
; RUN: grep {vshl\\.i8} %t | count 2
; RUN: grep {vshl\\.i16} %t | count 2
; RUN: grep {vshl\\.i32} %t | count 2
; RUN: grep {vshl\\.i64} %t | count 2
; RUN: grep {vshr\\.u8} %t | count 2
; RUN: grep {vshr\\.u16} %t | count 2
; RUN: grep {vshr\\.u32} %t | count 2
; RUN: grep {vshr\\.u64} %t | count 2
; RUN: grep {vshr\\.s8} %t | count 2
; RUN: grep {vshr\\.s16} %t | count 2
; RUN: grep {vshr\\.s32} %t | count 2
; RUN: grep {vshr\\.s64} %t | count 2
; RUN: grep {vneg\\.s8} %t | count 4
; RUN: grep {vneg\\.s16} %t | count 4
; RUN: grep {vneg\\.s32} %t | count 4
; RUN: grep {vsub\\.i64} %t | count 4
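; NEON has no right-shift-by-register instructions, so variable lshr/ashr
; are expected to be lowered to a left shift by the negated amount; the
; vneg (and vsub.i64 for the 64-bit case) counts above check for that.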
define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = shl <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shl <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = shl <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = shl <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <8 x i8> @vshli8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = shl <8 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vshli16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = shl <4 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vshli32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = shl <2 x i32> %tmp1, < i32 31, i32 31 >
ret <2 x i32> %tmp2
}
define <1 x i64> @vshli64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = shl <1 x i64> %tmp1, < i64 63 >
ret <1 x i64> %tmp2
}
define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = shl <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shl <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = shl <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = shl <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = shl <16 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = shl <8 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = shl <4 x i32> %tmp1, < i32 31, i32 31, i32 31, i32 31 >
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = shl <2 x i64> %tmp1, < i64 63, i64 63 >
ret <2 x i64> %tmp2
}
define <8 x i8> @vlshru8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = lshr <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vlshru16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = lshr <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vlshru32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = lshr <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vlshru64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = lshr <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <8 x i8> @vlshri8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = lshr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vlshri16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = lshr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vlshri32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = lshr <2 x i32> %tmp1, < i32 32, i32 32 >
ret <2 x i32> %tmp2
}
define <1 x i64> @vlshri64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = lshr <1 x i64> %tmp1, < i64 64 >
ret <1 x i64> %tmp2
}
define <16 x i8> @vlshrQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = lshr <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vlshrQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = lshr <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vlshrQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = lshr <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vlshrQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = lshr <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <16 x i8> @vlshrQi8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = lshr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vlshrQi16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = lshr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vlshrQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = lshr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 >
ret <4 x i32> %tmp2
}
define <2 x i64> @vlshrQi64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = lshr <2 x i64> %tmp1, < i64 64, i64 64 >
ret <2 x i64> %tmp2
}
define <8 x i8> @vashrs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = ashr <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vashrs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = ashr <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vashrs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = ashr <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vashrs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = ashr <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <8 x i8> @vashri8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = ashr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vashri16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = ashr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vashri32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = ashr <2 x i32> %tmp1, < i32 32, i32 32 >
ret <2 x i32> %tmp2
}
define <1 x i64> @vashri64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = ashr <1 x i64> %tmp1, < i64 64 >
ret <1 x i64> %tmp2
}
define <16 x i8> @vashrQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = ashr <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vashrQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = ashr <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vashrQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = ashr <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vashrQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = ashr <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <16 x i8> @vashrQi8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = ashr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vashrQi16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = ashr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vashrQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = ashr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 >
ret <4 x i32> %tmp2
}
define <2 x i64> @vashrQi64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = ashr <2 x i64> %tmp1, < i64 64, i64 64 >
ret <2 x i64> %tmp2
}


@ -0,0 +1,131 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vsli\\.8} %t | count 2
; RUN: grep {vsli\\.16} %t | count 2
; RUN: grep {vsli\\.32} %t | count 2
; RUN: grep {vsli\\.64} %t | count 2
; RUN: grep {vsri\\.8} %t | count 2
; RUN: grep {vsri\\.16} %t | count 2
; RUN: grep {vsri\\.32} %t | count 2
; RUN: grep {vsri\\.64} %t | count 2
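; The vshiftins intrinsic encodes the direction in the sign of the shift
; amount: positive amounts should select vsli, negative amounts vsri.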
define <8 x i8> @vsli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp3
}
define <4 x i16> @vsli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp3
}
define <2 x i32> @vsli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp3
}
define <1 x i64> @vsli64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp3
}
define <16 x i8> @vsliQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp3
}
define <8 x i16> @vsliQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsliQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsliQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp3
}
define <8 x i8> @vsri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp3
}
define <4 x i16> @vsri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp3
}
define <2 x i32> @vsri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp3
}
define <1 x i64> @vsri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp3
}
define <16 x i8> @vsriQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp3
}
define <8 x i16> @vsriQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsriQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsriQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vshl.ll

@ -0,0 +1,302 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vshl\\.s8} %t | count 2
; RUN: grep {vshl\\.s16} %t | count 2
; RUN: grep {vshl\\.s32} %t | count 2
; RUN: grep {vshl\\.s64} %t | count 2
; RUN: grep {vshl\\.u8} %t | count 2
; RUN: grep {vshl\\.u16} %t | count 2
; RUN: grep {vshl\\.u32} %t | count 2
; RUN: grep {vshl\\.u64} %t | count 2
; RUN: grep {vshl\\.i8} %t | count 2
; RUN: grep {vshl\\.i16} %t | count 2
; RUN: grep {vshl\\.i32} %t | count 2
; RUN: grep {vshl\\.i64} %t | count 2
; RUN: grep {vshr\\.s8} %t | count 2
; RUN: grep {vshr\\.s16} %t | count 2
; RUN: grep {vshr\\.s32} %t | count 2
; RUN: grep {vshr\\.s64} %t | count 2
; RUN: grep {vshr\\.u8} %t | count 2
; RUN: grep {vshr\\.u16} %t | count 2
; RUN: grep {vshr\\.u32} %t | count 2
; RUN: grep {vshr\\.u64} %t | count 2
define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
; For left shifts by immediates, the signedness is irrelevant.
; Test a mix of both signed and unsigned intrinsics.
define <8 x i8> @vshli8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshli16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshli32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vshli64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
}
; Right shift by immediate:
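; A right shift by N is written as a shift amount of -N, so the vshifts
; calls below should select vshr.s* and the vshiftu calls vshr.u*.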
define <8 x i8> @vshrs8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshrs16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshrs32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vshrs64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vshru8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshru16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshru32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vshru64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vshrQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vshrQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshrQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshrQs64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vshrQu8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vshrQu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshrQu32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshrQu64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vshll.ll

@ -0,0 +1,74 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vshll\\.s8} %t | count 1
; RUN: grep {vshll\\.s16} %t | count 1
; RUN: grep {vshll\\.s32} %t | count 1
; RUN: grep {vshll\\.u8} %t | count 1
; RUN: grep {vshll\\.u16} %t | count 1
; RUN: grep {vshll\\.u32} %t | count 1
; RUN: grep {vshll\\.i8} %t | count 1
; RUN: grep {vshll\\.i16} %t | count 1
; RUN: grep {vshll\\.i32} %t | count 1
define <8 x i16> @vshlls8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlls16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlls32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i64> %tmp2
}
define <8 x i16> @vshllu8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshllu16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshllu32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i64> %tmp2
}
; The following tests use the maximum shift count, so the signedness is
; irrelevant. Test both signed and unsigned versions.
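; (A shift-left-long by exactly the element width has a distinct encoding,
; so these should match the vshll.i8/.i16/.i32 patterns counted above.)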
define <8 x i16> @vshlli8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlli16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 16, i16 16, i16 16, i16 16 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlli32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 32, i32 32 >)
ret <2 x i64> %tmp2
}
declare <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vshrn.ll

@ -0,0 +1,26 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vshrn\\.i16} %t | count 1
; RUN: grep {vshrn\\.i32} %t | count 1
; RUN: grep {vshrn\\.i64} %t | count 1
define <8 x i8> @vshrns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshrns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

293
test/CodeGen/ARM/vsra.ll Normal file
View File

@ -0,0 +1,293 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vsra\\.s8} %t | count 2
; RUN: grep {vsra\\.s16} %t | count 2
; RUN: grep {vsra\\.s32} %t | count 2
; RUN: grep {vsra\\.s64} %t | count 2
; RUN: grep {vsra\\.u8} %t | count 2
; RUN: grep {vsra\\.u16} %t | count 2
; RUN: grep {vsra\\.u32} %t | count 2
; RUN: grep {vsra\\.u64} %t | count 2
; RUN: grep {vrsra\\.s8} %t | count 2
; RUN: grep {vrsra\\.s16} %t | count 2
; RUN: grep {vrsra\\.s32} %t | count 2
; RUN: grep {vrsra\\.s64} %t | count 2
; RUN: grep {vrsra\\.u8} %t | count 2
; RUN: grep {vrsra\\.u16} %t | count 2
; RUN: grep {vrsra\\.u32} %t | count 2
; RUN: grep {vrsra\\.u64} %t | count 2
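; A vector shift whose result feeds an add should be combined into a single
; shift-and-accumulate: plain IR shifts give vsra, and the rounding shift
; intrinsics give vrsra.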
define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = ashr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
%tmp4 = add <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = ashr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
%tmp4 = add <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = ashr <2 x i32> %tmp2, < i32 32, i32 32 >
%tmp4 = add <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @vsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = ashr <1 x i64> %tmp2, < i64 64 >
%tmp4 = add <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = ashr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
%tmp4 = add <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = ashr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = ashr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = ashr <2 x i64> %tmp2, < i64 64, i64 64 >
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = lshr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
%tmp4 = add <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = lshr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
%tmp4 = add <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = lshr <2 x i32> %tmp2, < i32 32, i32 32 >
%tmp4 = add <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @vsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = lshr <1 x i64> %tmp2, < i64 64 >
%tmp4 = add <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = lshr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
%tmp4 = add <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = lshr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = lshr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = lshr <2 x i64> %tmp2, < i64 64, i64 64 >
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
define <8 x i8> @vrsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
%tmp4 = add <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @vrsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
%tmp4 = add <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @vrsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
%tmp4 = add <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @vrsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
%tmp4 = add <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <8 x i8> @vrsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
%tmp4 = add <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @vrsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
%tmp4 = add <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @vrsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
%tmp4 = add <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @vrsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
%tmp4 = add <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @vrsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
%tmp4 = add <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @vrsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vrsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vrsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
define <16 x i8> @vrsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
%tmp4 = add <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @vrsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vrsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vrsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

76
test/CodeGen/ARM/vsub.ll Normal file
View File

@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vsub\\.i8} %t | count 2
; RUN: grep {vsub\\.i16} %t | count 2
; RUN: grep {vsub\\.i32} %t | count 2
; RUN: grep {vsub\\.i64} %t | count 2
; RUN: grep {vsub\\.f32} %t | count 2
define <8 x i8> @vsubi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = sub <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vsubi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = sub <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vsubi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = sub <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vsubi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = sub <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <2 x float> @vsubf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = sub <2 x float> %tmp1, %tmp2
ret <2 x float> %tmp3
}
define <16 x i8> @vsubQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = sub <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vsubQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = sub <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vsubQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = sub <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vsubQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = sub <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <4 x float> @vsubQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = sub <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}


@ -0,0 +1,29 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vsubhn\\.i16} %t | count 1
; RUN: grep {vsubhn\\.i32} %t | count 1
; RUN: grep {vsubhn\\.i64} %t | count 1
define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vsubl.ll

@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vsubl\\.s8} %t | count 1
; RUN: grep {vsubl\\.s16} %t | count 1
; RUN: grep {vsubl\\.s32} %t | count 1
; RUN: grep {vsubl\\.u8} %t | count 1
; RUN: grep {vsubl\\.u16} %t | count 1
; RUN: grep {vsubl\\.u32} %t | count 1
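; The vsubls/vsublu intrinsics take narrow operands and return double-width
; results, matching the widening vsubl.s*/vsubl.u* instructions.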
define <8 x i16> @vsubls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

Some files were not shown because too many files have changed in this diff.