Add support for ARM's Advanced SIMD (NEON) instruction set.

This is still a work in progress, but most of the NEON instruction set
is supported.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73919 91177308-0d34-0410-b5e6-96231b3b80d8
Bob Wilson 2009-06-22 23:27:02 +00:00
parent 5de83afcdc
commit 5bafff36c7
102 changed files with 10302 additions and 127 deletions


@ -116,6 +116,7 @@ def llvm_v2i64_ty : LLVMType<v2i64>; // 2 x i64
def llvm_v2i32_ty : LLVMType<v2i32>; // 2 x i32
def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
def llvm_v2f32_ty : LLVMType<v2f32>; // 2 x float
def llvm_v4f32_ty : LLVMType<v4f32>; // 4 x float
def llvm_v2f64_ty : LLVMType<v2f64>; // 2 x double


@ -19,3 +19,298 @@ let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">,
Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
// Advanced SIMD (NEON)
let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
// The following classes do not correspond directly to GCC builtins.
class Neon_1Arg_Intrinsic
: Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
class Neon_1Arg_Float_Intrinsic
: Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
class Neon_1Arg_Narrow_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
class Neon_1Arg_Long_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMTruncatedElementVectorType<0>], [IntrNoMem]>;
class Neon_2Arg_Intrinsic
: Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
class Neon_2Arg_Float_Intrinsic
: Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
class Neon_2Arg_Narrow_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMExtendedElementVectorType<0>,
LLVMExtendedElementVectorType<0>],
[IntrNoMem]>;
class Neon_2Arg_Long_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMTruncatedElementVectorType<0>,
LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
class Neon_2Arg_Wide_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
class Neon_3Arg_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
class Neon_3Arg_Long_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>,
LLVMTruncatedElementVectorType<0>,
LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
class Neon_CvtFxToFP_Intrinsic
: Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
class Neon_CvtFPToFx_Intrinsic
: Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
}
// Arithmetic ops
let Properties = [IntrNoMem, Commutative] in {
// Vector Add.
def int_arm_neon_vhadds : Neon_2Arg_Intrinsic;
def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic;
def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqadds : Neon_2Arg_Intrinsic;
def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vaddhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vaddls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vaddlu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vaddws : Neon_2Arg_Wide_Intrinsic;
def int_arm_neon_vaddwu : Neon_2Arg_Wide_Intrinsic;
// Vector Multiply.
def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;
// Vector Multiply and Accumulate/Subtract.
def int_arm_neon_vmlals : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlalu : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlsls : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlslu : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;
// Vector Maximum.
def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic;
def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic;
def int_arm_neon_vmaxf : Neon_2Arg_Float_Intrinsic;
// Vector Minimum.
def int_arm_neon_vmins : Neon_2Arg_Intrinsic;
def int_arm_neon_vminu : Neon_2Arg_Intrinsic;
def int_arm_neon_vminf : Neon_2Arg_Float_Intrinsic;
// Vector Reciprocal Step.
def int_arm_neon_vrecps : Neon_2Arg_Float_Intrinsic;
// Vector Reciprocal Square Root Step.
def int_arm_neon_vrsqrts : Neon_2Arg_Float_Intrinsic;
}
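// For example, a signed saturating add of two <8 x i8> vectors is written
// with the element type appended to the overloaded intrinsic name, following
// the same mangling as the vaba/vabd tests later in this commit (sketch):
//   %r = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, <8 x i8> %b)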
// Vector Subtract.
def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic;
def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic;
def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic;
def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vsubls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vsublu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vsubws : Neon_2Arg_Wide_Intrinsic;
def int_arm_neon_vsubwu : Neon_2Arg_Wide_Intrinsic;
// Vector Absolute Compare.
let TargetPrefix = "arm" in {
def int_arm_neon_vacged : Intrinsic<[llvm_v2i32_ty],
[llvm_v2f32_ty, llvm_v2f32_ty],
[IntrNoMem]>;
def int_arm_neon_vacgeq : Intrinsic<[llvm_v4i32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
def int_arm_neon_vacgtd : Intrinsic<[llvm_v2i32_ty],
[llvm_v2f32_ty, llvm_v2f32_ty],
[IntrNoMem]>;
def int_arm_neon_vacgtq : Intrinsic<[llvm_v4i32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
}
// Vector Absolute Differences.
def int_arm_neon_vabds : Neon_2Arg_Intrinsic;
def int_arm_neon_vabdu : Neon_2Arg_Intrinsic;
def int_arm_neon_vabdf : Neon_2Arg_Float_Intrinsic;
def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic;
// Vector Absolute Difference and Accumulate.
def int_arm_neon_vabas : Neon_3Arg_Intrinsic;
def int_arm_neon_vabau : Neon_3Arg_Intrinsic;
def int_arm_neon_vabals : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic;
// Vector Pairwise Add.
def int_arm_neon_vpaddi : Neon_2Arg_Intrinsic;
def int_arm_neon_vpaddf : Neon_2Arg_Float_Intrinsic;
// Vector Pairwise Add Long.
// Note: This is different from the other "long" NEON intrinsics because
// the result vector has half as many elements as the source vector.
// The source and destination vector types must be specified separately.
let TargetPrefix = "arm" in {
def int_arm_neon_vpaddls : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
[IntrNoMem]>;
def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
[IntrNoMem]>;
}
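// For example, pairwise-add-long over an <8 x i8> vector produces a <4 x i16>
// result with half as many elements (a sketch, assuming the two-suffix name
// mangling used for doubly-overloaded intrinsics):
//   %sum = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a)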
// Vector Pairwise Add and Accumulate Long.
// Note: This is similar to vpaddl but the destination vector also appears
// as the first argument.
let TargetPrefix = "arm" in {
def int_arm_neon_vpadals : Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, llvm_anyint_ty],
[IntrNoMem]>;
def int_arm_neon_vpadalu : Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, llvm_anyint_ty],
[IntrNoMem]>;
}
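// A corresponding vpadal sketch, where the <4 x i16> accumulator is passed
// as the first operand (same mangling assumption as above):
//   %acc = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a,
//                                                           <8 x i8> %b)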
// Vector Pairwise Maximum and Minimum.
def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic;
def int_arm_neon_vpmaxu : Neon_2Arg_Intrinsic;
def int_arm_neon_vpmaxf : Neon_2Arg_Float_Intrinsic;
def int_arm_neon_vpmins : Neon_2Arg_Intrinsic;
def int_arm_neon_vpminu : Neon_2Arg_Intrinsic;
def int_arm_neon_vpminf : Neon_2Arg_Float_Intrinsic;
// Vector Shifts:
//
// The various saturating and rounding vector shift operations need to be
// represented by intrinsics in LLVM, and even the basic VSHL variable shift
// operation cannot be safely translated to LLVM's shift operators. VSHL can
// be used for both left and right shifts, or even combinations of the two,
// depending on the signs of the shift amounts. It also has well-defined
// behavior for shift amounts that LLVM leaves undefined. Only basic shifts
// by constants can be represented with LLVM's shift operators.
//
// The shift counts for these intrinsics are always vectors, even for constant
// shifts, where the constant is replicated. For consistency with VSHL (and
// other variable shift instructions), left shifts have positive shift counts
// and right shifts have negative shift counts. This convention is also used
// for constant right shift intrinsics, and to help preserve sanity, the
// intrinsic names use "shift" instead of either "shl" or "shr". Where
// applicable, signed and unsigned versions of the intrinsics are
// distinguished with "s" and "u" suffixes. A few NEON shift instructions,
// such as VQSHLU, take signed operands but produce unsigned results; these
// use a "su" suffix.
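//
// For example, an element-wise right shift by 3 is expressed with negative
// shift counts (a sketch; the overloaded-name suffix follows the convention
// used by the tests in this commit):
//
//   %shr = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %a,
//            <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)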
// Vector Shift.
def int_arm_neon_vshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vshiftls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vshiftlu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vshiftn : Neon_2Arg_Narrow_Intrinsic;
// Vector Rounding Shift.
def int_arm_neon_vrshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vrshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vrshiftn : Neon_2Arg_Narrow_Intrinsic;
// Vector Saturating Shift.
def int_arm_neon_vqshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vqshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqshiftsu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqshiftns : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqshiftnu : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqshiftnsu : Neon_2Arg_Narrow_Intrinsic;
// Vector Saturating Rounding Shift.
def int_arm_neon_vqrshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrshiftns : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqrshiftnu : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqrshiftnsu : Neon_2Arg_Narrow_Intrinsic;
// Vector Shift and Insert.
def int_arm_neon_vshiftins : Neon_3Arg_Intrinsic;
// Vector Absolute Value and Saturating Absolute Value.
def int_arm_neon_vabs : Neon_1Arg_Intrinsic;
def int_arm_neon_vabsf : Neon_1Arg_Float_Intrinsic;
def int_arm_neon_vqabs : Neon_1Arg_Intrinsic;
// Vector Saturating Negate.
def int_arm_neon_vqneg : Neon_1Arg_Intrinsic;
// Vector Count Leading Sign/Zero Bits.
def int_arm_neon_vcls : Neon_1Arg_Intrinsic;
def int_arm_neon_vclz : Neon_1Arg_Intrinsic;
// Vector Count One Bits.
def int_arm_neon_vcnt : Neon_1Arg_Intrinsic;
// Vector Reciprocal Estimate.
def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic;
def int_arm_neon_vrecpef : Neon_1Arg_Float_Intrinsic;
// Vector Reciprocal Square Root Estimate.
def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic;
def int_arm_neon_vrsqrtef : Neon_1Arg_Float_Intrinsic;
// Vector Conversions Between Floating-point and Fixed-point.
def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic;
def int_arm_neon_vcvtfp2fxu : Neon_CvtFPToFx_Intrinsic;
def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic;
def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic;
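// The i32 operand is the number of fraction bits in the fixed-point value.
// For example, converting <2 x float> to signed fixed-point with 3 fraction
// bits (a sketch, assuming two-suffix mangling for the two overloaded types):
//   %fx = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %a,
//                                                              i32 3)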
// Narrowing and Lengthening Vector Moves.
def int_arm_neon_vmovn : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vmovls : Neon_1Arg_Long_Intrinsic;
def int_arm_neon_vmovlu : Neon_1Arg_Long_Intrinsic;
let TargetPrefix = "arm" in {
// De-interleaving vector loads from N-element structures.
def int_arm_neon_vld3i : Intrinsic<[llvm_anyint_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
def int_arm_neon_vld3f : Intrinsic<[llvm_anyfloat_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
def int_arm_neon_vld4i : Intrinsic<[llvm_anyint_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
def int_arm_neon_vld4f : Intrinsic<[llvm_anyfloat_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
// Interleaving vector stores from N-element structures.
def int_arm_neon_vst3i : Intrinsic<[llvm_void_ty],
[llvm_anyint_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
def int_arm_neon_vst3f : Intrinsic<[llvm_void_ty],
[llvm_anyfloat_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
def int_arm_neon_vst4i : Intrinsic<[llvm_void_ty],
[llvm_anyint_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
def int_arm_neon_vst4f : Intrinsic<[llvm_void_ty],
[llvm_anyfloat_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
}


@ -24,19 +24,29 @@ def CC_ARM_APCS : CallingConv<[
CCIfType<[i8, i16], CCPromoteToType<i32>>,
// f64 is passed in pairs of GPRs, possibly split onto the stack
CCIfType<[f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
CCIfType<[i32], CCAssignToStack<4, 4>>,
CCIfType<[f64], CCAssignToStack<8, 4>>
CCIfType<[f64], CCAssignToStack<8, 4>>,
CCIfType<[v2f64], CCAssignToStack<16, 4>>
]>;
def RetCC_ARM_APCS : CallingConv<[
CCIfType<[f32], CCBitConvertToType<i32>>,
CCIfType<[f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
@ -59,7 +69,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[
CCAssignToReg<[R0, R1, R2, R3]>>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
CCIfType<[f64], CCAssignToStack<8, 8>>
CCIfType<[f64], CCAssignToStack<8, 8>>,
CCIfType<[v2f64], CCAssignToStack<16, 8>>
]>;
def RetCC_ARM_AAPCS_Common : CallingConv<[
@ -72,13 +83,21 @@ def RetCC_ARM_AAPCS_Common : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_ARM_AAPCS : CallingConv<[
CCIfType<[f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<CC_ARM_AAPCS_Common>
]>;
def RetCC_ARM_AAPCS : CallingConv<[
CCIfType<[f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
@ -88,6 +107,10 @@ def RetCC_ARM_AAPCS : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
@ -95,6 +118,10 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
]>;
def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,


@ -32,6 +32,9 @@
#include "llvm/Support/Debug.h"
using namespace llvm;
static const unsigned arm_dsubreg_0 = 5;
static const unsigned arm_dsubreg_1 = 6;
//===--------------------------------------------------------------------===//
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
/// instructions for SelectionDAG operations.
@ -918,6 +921,65 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
MVT::Other, Ops, 3);
}
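// Lower CONCAT_VECTORS by inserting the two 64-bit halves into the D
// sub-registers of an IMPLICIT_DEF'd 128-bit register, skipping undef halves.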
case ISD::CONCAT_VECTORS: {
MVT VT = Op.getValueType();
assert(VT.is128BitVector() && Op.getNumOperands() == 2 &&
"unexpected CONCAT_VECTORS");
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDNode *Result =
CurDAG->getTargetNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT);
if (N0.getOpcode() != ISD::UNDEF)
Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
SDValue(Result, 0), N0,
CurDAG->getTargetConstant(arm_dsubreg_0,
MVT::i32));
if (N1.getOpcode() != ISD::UNDEF)
Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
SDValue(Result, 0), N1,
CurDAG->getTargetConstant(arm_dsubreg_1,
MVT::i32));
return Result;
}
case ISD::VECTOR_SHUFFLE: {
MVT VT = Op.getValueType();
// Match 128-bit splat to VDUPLANEQ. (This could be done with a Pat in
// ARMInstrNEON.td but it is awkward because the shuffle mask needs to be
// transformed first into a lane number and then to both a subregister
// index and an adjusted lane number.) If the source operand is a
// SCALAR_TO_VECTOR, leave it so it will be matched later as a VDUP.
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
if (VT.is128BitVector() && SVOp->isSplat() &&
Op.getOperand(0).getOpcode() != ISD::SCALAR_TO_VECTOR &&
Op.getOperand(1).getOpcode() == ISD::UNDEF) {
unsigned LaneVal = SVOp->getSplatIndex();
MVT HalfVT;
unsigned Opc = 0;
switch (VT.getVectorElementType().getSimpleVT()) {
default: assert(false && "unhandled VDUP splat type");
case MVT::i8: Opc = ARM::VDUPLN8q; HalfVT = MVT::v8i8; break;
case MVT::i16: Opc = ARM::VDUPLN16q; HalfVT = MVT::v4i16; break;
case MVT::i32: Opc = ARM::VDUPLN32q; HalfVT = MVT::v2i32; break;
case MVT::f32: Opc = ARM::VDUPLNfq; HalfVT = MVT::v2f32; break;
}
// The source operand needs to be changed to a subreg of the original
// 128-bit operand, and the lane number needs to be adjusted accordingly.
unsigned NumElts = VT.getVectorNumElements() / 2;
unsigned SRVal = (LaneVal < NumElts ? arm_dsubreg_0 : arm_dsubreg_1);
SDValue SR = CurDAG->getTargetConstant(SRVal, MVT::i32);
SDValue NewLane = CurDAG->getTargetConstant(LaneVal % NumElts, MVT::i32);
SDNode *SubReg = CurDAG->getTargetNode(TargetInstrInfo::EXTRACT_SUBREG,
dl, HalfVT, N->getOperand(0), SR);
return CurDAG->SelectNodeTo(N, Opc, VT, SDValue(SubReg, 0), NewLane);
}
break;
}
}
return SelectCode(Op);

(File diff suppressed because it is too large.)


@ -67,10 +67,65 @@ namespace llvm {
EH_SJLJ_SETJMP, // SjLj exception handling setjmp
EH_SJLJ_LONGJMP, // SjLj exception handling longjmp
THREAD_POINTER
THREAD_POINTER,
VCEQ, // Vector compare equal.
VCGE, // Vector compare greater than or equal.
VCGEU, // Vector compare unsigned greater than or equal.
VCGT, // Vector compare greater than.
VCGTU, // Vector compare unsigned greater than.
VTST, // Vector test bits.
// Vector shift by immediate:
VSHL, // ...left
VSHRs, // ...right (signed)
VSHRu, // ...right (unsigned)
VSHLLs, // ...left long (signed)
VSHLLu, // ...left long (unsigned)
VSHLLi, // ...left long (with maximum shift count)
VSHRN, // ...right narrow
// Vector rounding shift by immediate:
VRSHRs, // ...right (signed)
VRSHRu, // ...right (unsigned)
VRSHRN, // ...right narrow
// Vector saturating shift by immediate:
VQSHLs, // ...left (signed)
VQSHLu, // ...left (unsigned)
VQSHLsu, // ...left (signed to unsigned)
VQSHRNs, // ...right narrow (signed)
VQSHRNu, // ...right narrow (unsigned)
VQSHRNsu, // ...right narrow (signed to unsigned)
// Vector saturating rounding shift by immediate:
VQRSHRNs, // ...right narrow (signed)
VQRSHRNu, // ...right narrow (unsigned)
VQRSHRNsu, // ...right narrow (signed to unsigned)
// Vector shift and insert:
VSLI, // ...left
VSRI, // ...right
// Vector get lane (VMOV scalar to ARM core register)
// (These are used for 8- and 16-bit element types only.)
VGETLANEu, // zero-extend vector extract element
VGETLANEs, // sign-extend vector extract element
// Vector duplicate lane (128-bit result only; 64-bit is a shuffle)
VDUPLANEQ // splat a lane from a 64-bit vector to a 128-bit vector
};
}
/// Define some predicates that are used for node matching.
namespace ARM {
/// getVMOVImm - If this is a build_vector of constants which can be
/// formed by using a VMOV instruction of the specified element size,
/// return the constant being splatted. The ByteSize field indicates the
/// number of bytes of each element (1, 2, 4, or 8).
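/// For example, an <8 x i8> splat of the constant 1 can be materialized by a
/// single VMOV immediate, so getVMOVImm with ByteSize 1 would return 1.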
SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
}
//===--------------------------------------------------------------------===//
// ARMTargetLowering - ARM Implementation of the TargetLowering interface
@ -151,6 +206,21 @@ namespace llvm {
///
unsigned ARMPCLabelIndex;
void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
void PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVector<SDValue, 8> &MemOpChains,
ISD::ArgFlagsTy Flags);
SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG, DebugLoc dl);
CCAssignFn *CCAssignFnForNode(unsigned CC, bool Return) const;
SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
const SDValue &StackPtr, const CCValAssign &VA,


@ -49,6 +49,11 @@ def VFPMiscFrm : Format<22>;
def ThumbFrm : Format<23>;
def NEONFrm : Format<24>;
def NEONGetLnFrm : Format<25>;
def NEONSetLnFrm : Format<26>;
def NEONDupFrm : Format<27>;
// Misc flag for data processing instructions that indicates whether
// the instruction has an Rn register operand.
class UnaryDP { bit isUnaryDataProc = 1; }
@ -737,6 +742,14 @@ class TIx2<dag outs, dag ins, string asm, list<dag> pattern>
class TJTI<dag outs, dag ins, string asm, list<dag> pattern>
: ThumbI<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>;
// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb];
}
class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb, HasV5T];
}
//===----------------------------------------------------------------------===//
@ -857,12 +870,102 @@ class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// ARM NEON Instruction templates.
//
// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb];
class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm,
string cstr, list<dag> pattern>
: InstARM<am, Size4Bytes, im, NEONFrm, cstr> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
let Pattern = pattern;
list<Predicate> Predicates = [HasNEON];
}
class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb, HasV5T];
class NI<dag oops, dag iops, string asm, list<dag> pattern>
: NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, "", pattern> {
}
class NDataI<dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, cstr, pattern> {
let Inst{31-25} = 0b1111001;
}
// NEON "one register and a modified immediate" format.
class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
bit op5, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{23} = op23;
let Inst{21-19} = op21_19;
let Inst{11-8} = op11_8;
let Inst{7} = op7;
let Inst{6} = op6;
let Inst{5} = op5;
let Inst{4} = op4;
}
// NEON 2 vector register format.
class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
bits<5> op11_7, bit op6, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{24-23} = op24_23;
let Inst{21-20} = op21_20;
let Inst{19-18} = op19_18;
let Inst{17-16} = op17_16;
let Inst{11-7} = op11_7;
let Inst{6} = op6;
let Inst{4} = op4;
}
// NEON 2 vector register with immediate.
class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op6, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{24} = op24;
let Inst{23} = op23;
let Inst{21-16} = op21_16;
let Inst{11-8} = op11_8;
let Inst{7} = op7;
let Inst{6} = op6;
let Inst{4} = op4;
}
// NEON 3 vector register format.
class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{24} = op24;
let Inst{23} = op23;
let Inst{21-20} = op21_20;
let Inst{11-8} = op11_8;
let Inst{6} = op6;
let Inst{4} = op4;
}
// NEON VMOVs between scalar and core registers.
class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, Format f, string opc, string asm,
list<dag> pattern>
: AI<oops, iops, f, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{11-8} = opcod2;
let Inst{6-5} = opcod3;
let Inst{4} = 1;
list<Predicate> Predicates = [HasNEON];
}
class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, string opc, string asm, list<dag> pattern>
: NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, opc, asm,
pattern>;
class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, string opc, string asm, list<dag> pattern>
: NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, opc, asm,
pattern>;
class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, string opc, string asm, list<dag> pattern>
: NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, opc, asm, pattern>;


@ -59,6 +59,8 @@ bool ARMInstrInfo::isMoveInstr(const MachineInstr &MI,
return false;
case ARM::FCPYS:
case ARM::FCPYD:
case ARM::VMOVD:
case ARM::VMOVQ:
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
return true;
@ -528,6 +530,8 @@ bool ARMInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
else if (DestRC == ARM::DPRRegisterClass)
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
.addReg(SrcReg));
else if (DestRC == ARM::QPRRegisterClass)
BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg);
else
return false;
@ -844,6 +848,10 @@ canFoldMemoryOperand(const MachineInstr *MI,
case ARM::FCPYS:
case ARM::FCPYD:
return true;
case ARM::VMOVD:
case ARM::VMOVQ:
return false; // FIXME
}
return false;


@ -114,6 +114,12 @@ namespace ARMII {
// Thumb format
ThumbFrm = 23 << FormShift,
// NEON format
NEONFrm = 24 << FormShift,
NEONGetLnFrm = 25 << FormShift,
NEONSetLnFrm = 26 << FormShift,
NEONDupFrm = 27 << FormShift,
//===------------------------------------------------------------------===//
// Field shifts - such shifts are used to set fields while generating
// machine instructions.


@ -93,6 +93,10 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">;
def HasV7 : Predicate<"Subtarget->hasV7Ops()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3()">;
def HasNEON : Predicate<"Subtarget->hasNEON()">;
def IsThumb : Predicate<"Subtarget->isThumb()">;
def HasThumb2 : Predicate<"Subtarget->hasThumb2()">;
def IsARM : Predicate<"!Subtarget->isThumb()">;
@ -1437,3 +1441,9 @@ include "ARMInstrThumb2.td"
//
include "ARMInstrVFP.td"
//===----------------------------------------------------------------------===//
// Advanced SIMD (NEON) Support
//
include "ARMInstrNEON.td"

(File diff suppressed because it is too large.)


@ -77,6 +77,34 @@ def D13 : ARMReg<13, "d13", [S26, S27]>;
def D14 : ARMReg<14, "d14", [S28, S29]>;
def D15 : ARMReg<15, "d15", [S30, S31]>;
// VFP3 defines 16 additional double registers
def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">;
def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">;
def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">;
def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">;
def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">;
def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">;
def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">;
def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">;
// Advanced SIMD (NEON) defines 16 quad-word aliases
def Q0 : ARMReg< 0, "q0", [D0, D1]>;
def Q1 : ARMReg< 1, "q1", [D2, D3]>;
def Q2 : ARMReg< 2, "q2", [D4, D5]>;
def Q3 : ARMReg< 3, "q3", [D6, D7]>;
def Q4 : ARMReg< 4, "q4", [D8, D9]>;
def Q5 : ARMReg< 5, "q5", [D10, D11]>;
def Q6 : ARMReg< 6, "q6", [D12, D13]>;
def Q7 : ARMReg< 7, "q7", [D14, D15]>;
def Q8 : ARMReg< 8, "q8", [D16, D17]>;
def Q9 : ARMReg< 9, "q9", [D18, D19]>;
def Q10 : ARMReg<10, "q10", [D20, D21]>;
def Q11 : ARMReg<11, "q11", [D22, D23]>;
def Q12 : ARMReg<12, "q12", [D24, D25]>;
def Q13 : ARMReg<13, "q13", [D26, D27]>;
def Q14 : ARMReg<14, "q14", [D28, D29]>;
def Q15 : ARMReg<15, "q15", [D30, D31]>;
// Current Program Status Register.
def CPSR : ARMReg<0, "cpsr">;
@ -207,14 +235,67 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
}];
}
// Scalar single precision floating point register class.
def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
// Scalar double precision floating point / generic 64-bit vector register
// class.
// ARM requires only word alignment for doubles, although double-word
// alignment gives better performance.
def DPR : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, D8,
D9, D10, D11, D12, D13, D14, D15]>;
def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
[D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15]> {
let SubRegClassList = [SPR, SPR];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// VFP2
static const unsigned ARM_DPR_VFP2[] = {
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
// VFP3
static const unsigned ARM_DPR_VFP3[] = {
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15,
ARM::D16, ARM::D17, ARM::D18, ARM::D19,
ARM::D20, ARM::D21, ARM::D22, ARM::D23,
ARM::D24, ARM::D25, ARM::D26, ARM::D27,
ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
DPRClass::iterator
DPRClass::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
if (Subtarget.hasVFP3())
return ARM_DPR_VFP3;
return ARM_DPR_VFP2;
}
DPRClass::iterator
DPRClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
if (Subtarget.hasVFP3())
return ARM_DPR_VFP3 + (sizeof(ARM_DPR_VFP3)/sizeof(unsigned));
else
return ARM_DPR_VFP2 + (sizeof(ARM_DPR_VFP2)/sizeof(unsigned));
}
}];
}
// Generic 128-bit vector register class.
def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
let SubRegClassList = [SPR, SPR, SPR, SPR, DPR, DPR];
}
// Condition code registers.
def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
@ -224,12 +305,40 @@ def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
// sub registers for each register.
//
def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S0, S2, S4, S6, S8, S10, S12, S14,
S16, S18, S20, S22, S24, S26, S28, S30]>;
def arm_ssubreg_0 : PatLeaf<(i32 1)>;
def arm_ssubreg_1 : PatLeaf<(i32 2)>;
def arm_ssubreg_2 : PatLeaf<(i32 3)>;
def arm_ssubreg_3 : PatLeaf<(i32 4)>;
def arm_dsubreg_0 : PatLeaf<(i32 5)>;
def arm_dsubreg_1 : PatLeaf<(i32 6)>;
def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S1, S3, S5, S7, S9, S11, S13, S15,
S17, S19, S21, S23, S25, S27, S29, S31]>;
// S sub-registers of D registers.
def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S0, S2, S4, S6, S8, S10, S12, S14,
S16, S18, S20, S22, S24, S26, S28, S30]>;
def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S1, S3, S5, S7, S9, S11, S13, S15,
S17, S19, S21, S23, S25, S27, S29, S31]>;
// S sub-registers of Q registers.
def : SubRegSet<1, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S0, S4, S8, S12, S16, S20, S24, S28]>;
def : SubRegSet<2, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S1, S5, S9, S13, S17, S21, S25, S29]>;
def : SubRegSet<3, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S2, S6, S10, S14, S18, S22, S26, S30]>;
def : SubRegSet<4, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S3, S7, S11, S15, S19, S23, S27, S31]>;
// D sub-registers of Q registers.
def : SubRegSet<5, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
[D0, D2, D4, D6, D8, D10, D12, D14,
D16, D18, D20, D22, D24, D26, D28, D30]>;
def : SubRegSet<6, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
[D1, D3, D5, D7, D9, D11, D13, D15,
D17, D19, D21, D23, D25, D27, D29, D31]>;


@ -285,12 +285,22 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
const char *Modifier) {
const MachineOperand &MO = MI->getOperand(opNum);
switch (MO.getType()) {
case MachineOperand::MO_Register:
if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
else
case MachineOperand::MO_Register: {
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
if (Modifier && strcmp(Modifier, "dregpair") == 0) {
unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
O << '{'
<< TRI->getAsmName(DRegLo) << "-" << TRI->getAsmName(DRegHi)
<< '}';
} else {
O << TRI->getAsmName(Reg);
}
} else
assert(0 && "not implemented");
break;
}
case MachineOperand::MO_Immediate: {
if (!Modifier || strcmp(Modifier, "no_hash") != 0)
O << "#";


@ -552,3 +552,23 @@ __Z11no_overflowjj:
//===---------------------------------------------------------------------===//
Some of the NEON intrinsics may be appropriate for more general use, either
as target-independent intrinsics or perhaps elsewhere in the ARM backend.
Some of them may also be lowered to target-independent SDNodes, and perhaps
some new SDNodes could be added.
For example, maximum, minimum, and absolute value operations are well-defined
and standard operations, both for vector and scalar types.
The current NEON-specific intrinsics for count leading zeros and count one
bits could perhaps be replaced by the target-independent ctlz and ctpop
intrinsics. It may also make sense to add a target-independent "ctls"
intrinsic for "count leading sign bits". Likewise, the backend could use
the target-independent SDNodes for these operations.
ARMv6 has scalar saturating and halving adds and subtracts. The same
intrinsics could possibly be used for both NEON's vector implementations of
those operations and the ARMv6 scalar versions.
//===---------------------------------------------------------------------===//


@ -0,0 +1,62 @@
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic -mattr=+v6,+vfp2
@"\01LC" = external constant [15 x i8] ; <[15 x i8]*> [#uses=1]
declare i32 @printf(i8* nocapture, ...) nounwind
define i32 @main() nounwind {
entry:
br label %bb.i1.i
bb.i1.i: ; preds = %Cos.exit.i.i, %entry
br label %bb.i.i.i
bb.i.i.i: ; preds = %bb.i.i.i, %bb.i1.i
br i1 undef, label %Cos.exit.i.i, label %bb.i.i.i
Cos.exit.i.i: ; preds = %bb.i.i.i
br i1 undef, label %bb2.i.i, label %bb.i1.i
bb2.i.i: ; preds = %Cos.exit.i.i
br label %bb3.i.i
bb3.i.i: ; preds = %bb5.i.i, %bb2.i.i
br label %bb4.i.i
bb4.i.i: ; preds = %bb4.i.i, %bb3.i.i
br i1 undef, label %bb5.i.i, label %bb4.i.i
bb5.i.i: ; preds = %bb4.i.i
br i1 undef, label %bb.i, label %bb3.i.i
bb.i: ; preds = %bb.i, %bb5.i.i
br i1 undef, label %bb1.outer2.i.i.outer, label %bb.i
bb1.outer2.i.i.outer: ; preds = %Fft.exit.i, %bb5.i12.i, %bb.i
br label %bb1.outer2.i.i
bb1.outer2.i.i: ; preds = %bb2.i9.i, %bb1.outer2.i.i.outer
br label %bb1.i.i
bb1.i.i: ; preds = %bb1.i.i, %bb1.outer2.i.i
br i1 undef, label %bb2.i9.i, label %bb1.i.i
bb2.i9.i: ; preds = %bb1.i.i
br i1 undef, label %bb4.i11.i, label %bb1.outer2.i.i
bb4.i11.i: ; preds = %bb4.i11.i, %bb2.i9.i
br i1 undef, label %bb5.i12.i, label %bb4.i11.i
bb5.i12.i: ; preds = %bb4.i11.i
br i1 undef, label %bb7.i.i, label %bb1.outer2.i.i.outer
bb7.i.i: ; preds = %bb7.i.i, %bb5.i12.i
br i1 undef, label %Fft.exit.i, label %bb7.i.i
Fft.exit.i: ; preds = %bb7.i.i
br i1 undef, label %bb5.i, label %bb1.outer2.i.i.outer
bb5.i: ; preds = %Fft.exit.i
%0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([15 x i8]* @"\01LC", i32 0, i32 0), double undef, double undef) nounwind ; <i32> [#uses=0]
unreachable
}


@ -0,0 +1,7 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vadd
define <8 x i8> @t_i8x8(<8 x i8> %a, <8 x i8> %b) nounwind {
entry:
%0 = add <8 x i8> %a, %b
ret <8 x i8> %0
}


@ -0,0 +1,22 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fldd | count 4
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fstd
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd
define void @t1(<2 x i32>* %r, <4 x i16>* %a, <4 x i16>* %b) nounwind {
entry:
%0 = load <4 x i16>* %a, align 8 ; <<4 x i16>> [#uses=1]
%1 = load <4 x i16>* %b, align 8 ; <<4 x i16>> [#uses=1]
%2 = add <4 x i16> %0, %1 ; <<4 x i16>> [#uses=1]
%3 = bitcast <4 x i16> %2 to <2 x i32> ; <<2 x i32>> [#uses=1]
store <2 x i32> %3, <2 x i32>* %r, align 8
ret void
}
define <2 x i32> @t2(<4 x i16>* %a, <4 x i16>* %b) nounwind readonly {
entry:
%0 = load <4 x i16>* %a, align 8 ; <<4 x i16>> [#uses=1]
%1 = load <4 x i16>* %b, align 8 ; <<4 x i16>> [#uses=1]
%2 = sub <4 x i16> %0, %1 ; <<4 x i16>> [#uses=1]
%3 = bitcast <4 x i16> %2 to <2 x i32> ; <<2 x i32>> [#uses=1]
ret <2 x i32> %3
}


@ -0,0 +1,23 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vldmia | count 4
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vstmia | count 1
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd | count 2
define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
%1 = load <2 x i64>* %b, align 16 ; <<2 x i64>> [#uses=1]
%2 = add <2 x i64> %0, %1 ; <<2 x i64>> [#uses=1]
%3 = bitcast <2 x i64> %2 to <4 x i32> ; <<4 x i32>> [#uses=1]
store <4 x i32> %3, <4 x i32>* %r, align 16
ret void
}
define <4 x i32> @t2(<2 x i64>* %a, <2 x i64>* %b) nounwind readonly {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
%1 = load <2 x i64>* %b, align 16 ; <<2 x i64>> [#uses=1]
%2 = sub <2 x i64> %0, %1 ; <<2 x i64>> [#uses=1]
%3 = bitcast <2 x i64> %2 to <4 x i32> ; <<4 x i32>> [#uses=1]
ret <4 x i32> %3
}

test/CodeGen/ARM/vaba.ll (new file)

@ -0,0 +1,119 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaba\\.s8} %t | count 2
; RUN: grep {vaba\\.s16} %t | count 2
; RUN: grep {vaba\\.s32} %t | count 2
; RUN: grep {vaba\\.u8} %t | count 2
; RUN: grep {vaba\\.u16} %t | count 2
; RUN: grep {vaba\\.u32} %t | count 2
define <8 x i8> @vabas8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i8> %tmp4
}
define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i16> %tmp4
}
define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i32> %tmp4
}
define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i8> %tmp4
}
define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i16> %tmp4
}
define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i32> %tmp4
}
define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = call <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
ret <16 x i8> %tmp4
}
define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
ret <4 x i32> %tmp4
}
define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = call <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
ret <16 x i8> %tmp4
}
define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabaQu32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
ret <4 x i32> %tmp4
}
declare <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vabal.ll (new file)

@ -0,0 +1,63 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabal\\.s8} %t | count 1
; RUN: grep {vabal\\.s16} %t | count 1
; RUN: grep {vabal\\.s32} %t | count 1
; RUN: grep {vabal\\.u8} %t | count 1
; RUN: grep {vabal\\.u16} %t | count 1
; RUN: grep {vabal\\.u32} %t | count 1
define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vabd.ll (new file)

@ -0,0 +1,126 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabd\\.s8} %t | count 2
; RUN: grep {vabd\\.s16} %t | count 2
; RUN: grep {vabd\\.s32} %t | count 2
; RUN: grep {vabd\\.u8} %t | count 2
; RUN: grep {vabd\\.u16} %t | count 2
; RUN: grep {vabd\\.u32} %t | count 2
; RUN: grep {vabd\\.f32} %t | count 2
define <8 x i8> @vabds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vabds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vabds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vabdu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vabdu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vabdu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vabdf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vabdQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vabdQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vabdQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vabdQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vabdQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vabdl.ll (new file)

@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabdl\\.s8} %t | count 1
; RUN: grep {vabdl\\.s16} %t | count 1
; RUN: grep {vabdl\\.s32} %t | count 1
; RUN: grep {vabdl\\.u8} %t | count 1
; RUN: grep {vabdl\\.u16} %t | count 1
; RUN: grep {vabdl\\.u32} %t | count 1
define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vabs.ll (new file)

@ -0,0 +1,64 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabs\\.s8} %t | count 2
; RUN: grep {vabs\\.s16} %t | count 2
; RUN: grep {vabs\\.s32} %t | count 2
; RUN: grep {vabs\\.f32} %t | count 2
define <8 x i8> @vabss8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vabss16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vabss32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <2 x float> @vabsf32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <16 x i8> @vabsQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vabsQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vabsQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <4 x float> @vabsQf32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float>) nounwind readnone

test/CodeGen/ARM/vacge.ll Normal file
@@ -0,0 +1,19 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vacge\\.f32} %t | count 2
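; VACGE is an absolute compare: each lane of the result is all ones when
; |operand1| >= |operand2| and all zeros otherwise, which is why these
; tests return integer vectors from floating-point inputs.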
define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vacgt.ll Normal file
@@ -0,0 +1,19 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vacgt\\.f32} %t | count 2
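; VACGT is the strict form of the absolute compare: a result lane is all
; ones when |operand1| > |operand2|.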
define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vadd.ll Normal file
@@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vadd\\.i8} %t | count 2
; RUN: grep {vadd\\.i16} %t | count 2
; RUN: grep {vadd\\.i32} %t | count 2
; RUN: grep {vadd\\.i64} %t | count 2
; RUN: grep {vadd\\.f32} %t | count 2
define <8 x i8> @vaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = add <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = add <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = add <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vaddi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = add <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <2 x float> @vaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = add <2 x float> %tmp1, %tmp2
ret <2 x float> %tmp3
}
define <16 x i8> @vaddQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = add <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vaddQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = add <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = add <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = add <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <4 x float> @vaddQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = add <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}

test/CodeGen/ARM/vaddhn.ll Normal file
@@ -0,0 +1,29 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaddhn\\.i16} %t | count 1
; RUN: grep {vaddhn\\.i32} %t | count 1
; RUN: grep {vaddhn\\.i64} %t | count 1
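; VADDHN adds two quadword vectors and keeps only the high half of each
; element, narrowing to a doubleword result; e.g. for v8i16 inputs each
; i8 result lane is (a + b) >> 8.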
define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vaddl.ll Normal file
@@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaddl\\.s8} %t | count 1
; RUN: grep {vaddl\\.s16} %t | count 1
; RUN: grep {vaddl\\.s32} %t | count 1
; RUN: grep {vaddl\\.u8} %t | count 1
; RUN: grep {vaddl\\.u16} %t | count 1
; RUN: grep {vaddl\\.u32} %t | count 1
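; VADDL sign- or zero-extends both doubleword operands to twice the
; element width before adding, so <8 x i8> inputs yield an <8 x i16> sum.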
define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vaddw.ll Normal file
@@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaddw\\.s8} %t | count 1
; RUN: grep {vaddw\\.s16} %t | count 1
; RUN: grep {vaddw\\.s32} %t | count 1
; RUN: grep {vaddw\\.u8} %t | count 1
; RUN: grep {vaddw\\.u16} %t | count 1
; RUN: grep {vaddw\\.u32} %t | count 1
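; VADDW is the "wide" variant: only the second operand is extended, and it
; is added to a first operand that already has the wider element type.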
define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vand.ll Normal file
@@ -0,0 +1,59 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vand %t | count 8
; Note: function names do not include "vand" to allow simple grep for opcodes
define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = and <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = and <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = and <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = and <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = and <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = and <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = and <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = and <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

test/CodeGen/ARM/vbic.ll Normal file
@@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vbic %t | count 8
; Note: function names do not include "vbic" to allow simple grep for opcodes
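; Each function computes A & ~B; the xor-with-all-ones followed by and is
; expected to be matched by the instruction selector to a single VBIC.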
define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = and <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = and <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
%tmp4 = and <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
%tmp4 = and <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = and <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = and <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp4 = and <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
%tmp4 = and <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}

test/CodeGen/ARM/vbsl.ll Normal file
@@ -0,0 +1,91 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vbsl %t | count 8
; Note: function names do not include "vbsl" to allow simple grep for opcodes
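; Each function computes (A & B) | (~A & C), a bitwise select with A as the
; mask, which is expected to match a single VBSL.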
define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = and <8 x i8> %tmp1, %tmp2
%tmp5 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp6 = and <8 x i8> %tmp5, %tmp3
%tmp7 = or <8 x i8> %tmp4, %tmp6
ret <8 x i8> %tmp7
}
define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = and <4 x i16> %tmp1, %tmp2
%tmp5 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp6 = and <4 x i16> %tmp5, %tmp3
%tmp7 = or <4 x i16> %tmp4, %tmp6
ret <4 x i16> %tmp7
}
define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = and <2 x i32> %tmp1, %tmp2
%tmp5 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
%tmp6 = and <2 x i32> %tmp5, %tmp3
%tmp7 = or <2 x i32> %tmp4, %tmp6
ret <2 x i32> %tmp7
}
define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = load <1 x i64>* %C
%tmp4 = and <1 x i64> %tmp1, %tmp2
%tmp5 = xor <1 x i64> %tmp1, < i64 -1 >
%tmp6 = and <1 x i64> %tmp5, %tmp3
%tmp7 = or <1 x i64> %tmp4, %tmp6
ret <1 x i64> %tmp7
}
define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = and <16 x i8> %tmp1, %tmp2
%tmp5 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp6 = and <16 x i8> %tmp5, %tmp3
%tmp7 = or <16 x i8> %tmp4, %tmp6
ret <16 x i8> %tmp7
}
define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = and <8 x i16> %tmp1, %tmp2
%tmp5 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp6 = and <8 x i16> %tmp5, %tmp3
%tmp7 = or <8 x i16> %tmp4, %tmp6
ret <8 x i16> %tmp7
}
define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = and <4 x i32> %tmp1, %tmp2
%tmp5 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp6 = and <4 x i32> %tmp5, %tmp3
%tmp7 = or <4 x i32> %tmp4, %tmp6
ret <4 x i32> %tmp7
}
define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = and <2 x i64> %tmp1, %tmp2
%tmp5 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
%tmp6 = and <2 x i64> %tmp5, %tmp3
%tmp7 = or <2 x i64> %tmp4, %tmp6
ret <2 x i64> %tmp7
}

test/CodeGen/ARM/vceq.ll Normal file
@@ -0,0 +1,61 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vceq\\.i8} %t | count 2
; RUN: grep {vceq\\.i16} %t | count 2
; RUN: grep {vceq\\.i32} %t | count 2
; RUN: grep {vceq\\.f32} %t | count 2
define <8 x i8> @vceqi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp eq <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vceqi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp eq <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vceqi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp eq <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x i32> @vceqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp oeq <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vceqQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp eq <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vceqQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp eq <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vceqQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp eq <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x i32> @vceqQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = vfcmp oeq <4 x float> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

test/CodeGen/ARM/vcge.ll Normal file
@@ -0,0 +1,106 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcge\\.s8} %t | count 2
; RUN: grep {vcge\\.s16} %t | count 2
; RUN: grep {vcge\\.s32} %t | count 2
; RUN: grep {vcge\\.u8} %t | count 2
; RUN: grep {vcge\\.u16} %t | count 2
; RUN: grep {vcge\\.u32} %t | count 2
; RUN: grep {vcge\\.f32} %t | count 2
define <8 x i8> @vcges8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp sge <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcges16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp sge <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcges32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp sge <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <8 x i8> @vcgeu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp uge <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcgeu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp uge <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcgeu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp uge <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x i32> @vcgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp oge <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vcgeQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp sge <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgeQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp sge <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgeQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp sge <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <16 x i8> @vcgeQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp uge <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgeQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp uge <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgeQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp uge <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x i32> @vcgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = vfcmp oge <4 x float> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

test/CodeGen/ARM/vcgt.ll Normal file
@@ -0,0 +1,106 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcgt\\.s8} %t | count 2
; RUN: grep {vcgt\\.s16} %t | count 2
; RUN: grep {vcgt\\.s32} %t | count 2
; RUN: grep {vcgt\\.u8} %t | count 2
; RUN: grep {vcgt\\.u16} %t | count 2
; RUN: grep {vcgt\\.u32} %t | count 2
; RUN: grep {vcgt\\.f32} %t | count 2
define <8 x i8> @vcgts8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp sgt <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcgts16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp sgt <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcgts32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp sgt <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <8 x i8> @vcgtu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp ugt <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcgtu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp ugt <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcgtu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp ugt <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x i32> @vcgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ogt <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vcgtQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp sgt <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgtQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp sgt <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgtQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp sgt <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <16 x i8> @vcgtQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp ugt <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgtQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp ugt <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgtQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp ugt <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x i32> @vcgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = vfcmp ogt <4 x float> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

test/CodeGen/ARM/vcls.ll Normal file
@@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcls\\.s8} %t | count 2
; RUN: grep {vcls\\.s16} %t | count 2
; RUN: grep {vcls\\.s32} %t | count 2
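; VCLS counts, for each lane, the number of consecutive bits below the sign
; bit that match it (leading sign bits, not counting the sign bit itself).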
define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone

test/CodeGen/ARM/vclz.ll Normal file
@@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vclz\\.i8} %t | count 2
; RUN: grep {vclz\\.i16} %t | count 2
; RUN: grep {vclz\\.i32} %t | count 2
define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone

test/CodeGen/ARM/vcnt.ll Normal file
@@ -0,0 +1,17 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcnt\\.8} %t | count 2
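; VCNT is a per-byte population count and is only defined for 8-bit
; element types, so there are no i16/i32 variants here.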
define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone

test/CodeGen/ARM/vcvt.ll Normal file
@@ -0,0 +1,53 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = fptosi <2 x float> %tmp1 to <2 x i32>
ret <2 x i32> %tmp2
}
define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = fptoui <2 x float> %tmp1 to <2 x i32>
ret <2 x i32> %tmp2
}
define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = sitofp <2 x i32> %tmp1 to <2 x float>
ret <2 x float> %tmp2
}
define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = uitofp <2 x i32> %tmp1 to <2 x float>
ret <2 x float> %tmp2
}
define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = fptosi <4 x float> %tmp1 to <4 x i32>
ret <4 x i32> %tmp2
}
define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = fptoui <4 x float> %tmp1 to <4 x i32>
ret <4 x i32> %tmp2
}
define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = sitofp <4 x i32> %tmp1 to <4 x float>
ret <4 x float> %tmp2
}
define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = uitofp <4 x i32> %tmp1 to <4 x float>
ret <4 x float> %tmp2
}

@@ -0,0 +1,64 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
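; These intrinsics take the number of fraction bits as a trailing i32
; immediate (1 in these tests), selecting the fixed-point forms of VCVT.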
define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1)
ret <2 x i32> %tmp2
}
define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1)
ret <2 x i32> %tmp2
}
define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
ret <2 x float> %tmp2
}
define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
ret <2 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1)
ret <4 x i32> %tmp2
}
define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1)
ret <4 x i32> %tmp2
}
define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
ret <4 x float> %tmp2
}
define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
ret <4 x float> %tmp2
}
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone

test/CodeGen/ARM/vdup.ll Normal file
@@ -0,0 +1,134 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vdup.8 %t | count 4
; RUN: grep vdup.16 %t | count 4
; RUN: grep vdup.32 %t | count 8
define <8 x i8> @v_dup8(i8 %A) nounwind {
%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
ret <8 x i8> %tmp8
}
define <4 x i16> @v_dup16(i16 %A) nounwind {
%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_dup32(i32 %A) nounwind {
%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
ret <2 x i32> %tmp2
}
define <2 x float> @v_dupfloat(float %A) nounwind {
%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
ret <2 x float> %tmp2
}
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
ret <16 x i8> %tmp16
}
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
ret <8 x i16> %tmp8
}
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
ret <4 x i32> %tmp4
}
define <4 x float> @v_dupQfloat(float %A) nounwind {
%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
ret <4 x float> %tmp4
}
; Check to make sure it works with shuffles, too.
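; An insertelement into lane 0 followed by a shufflevector with an
; all-zeros mask is the canonical splat idiom and should also select VDUP.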
define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %tmp2
}
define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp2
}
define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
ret <2 x i32> %tmp2
}
define <2 x float> @v_shuffledupfloat(float %A) nounwind {
%tmp1 = insertelement <2 x float> undef, float %A, i32 0
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
ret <2 x float> %tmp2
}
define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp2
}
define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %tmp2
}
define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %tmp2
}
define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
%tmp1 = insertelement <4 x float> undef, float %A, i32 0
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %tmp2
}

@@ -0,0 +1,52 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vdup.8 %t | count 2
; RUN: grep vdup.16 %t | count 2
; RUN: grep vdup.32 %t | count 4
define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x i32> %tmp2
}
define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x float> %tmp2
}
define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i32> %tmp2
}
define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x float> %tmp2
}

test/CodeGen/ARM/veor.ll Normal file
@@ -0,0 +1,59 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep veor %t | count 8
; Note: function names do not include "veor" to allow simple grep for opcodes
define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

test/CodeGen/ARM/vfcmp.ll Normal file
@@ -0,0 +1,96 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vceq\\.f32} %t | count 1
; RUN: grep {vcgt\\.f32} %t | count 9
; RUN: grep {vcge\\.f32} %t | count 5
; RUN: grep vorr %t | count 4
; RUN: grep vmvn %t | count 7
; This tests vfcmp operations that do not map directly to NEON instructions.
; une is implemented with VCEQ/VMVN
define <2 x i32> @vcunef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp une <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
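; For the une function above we expect a sequence roughly like this
; (register assignments are illustrative only):
;   vceq.f32 d16, d17, d16
;   vmvn     d16, d16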
; olt is implemented with VCGT
define <2 x i32> @vcoltf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp olt <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ole is implemented with VCGE
define <2 x i32> @vcolef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ole <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; uge is implemented with VCGT/VMVN
define <2 x i32> @vcugef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp uge <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ule is implemented with VCGT/VMVN
define <2 x i32> @vculef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ule <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ugt is implemented with VCGE/VMVN
define <2 x i32> @vcugtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ugt <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ult is implemented with VCGE/VMVN
define <2 x i32> @vcultf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ult <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ueq is implemented with VCGT/VCGT/VORR/VMVN
define <2 x i32> @vcueqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ueq <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; one is implemented with VCGT/VCGT/VORR
define <2 x i32> @vconef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp one <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; uno is implemented with VCGT/VCGE/VORR/VMVN
define <2 x i32> @vcunof32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp uno <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ord is implemented with VCGT/VCGE/VORR
define <2 x i32> @vcordf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ord <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}

test/CodeGen/ARM/vget_lane.ll Normal file
@@ -0,0 +1,78 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmov\\.s8} %t | count 2
; RUN: grep {vmov\\.s16} %t | count 2
; RUN: grep {vmov\\.u8} %t | count 2
; RUN: grep {vmov\\.u16} %t | count 2
; RUN: grep {vmov\\.32} %t | count 2
define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = extractelement <8 x i8> %tmp1, i32 1
%tmp3 = sext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vget_lanes16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = extractelement <4 x i16> %tmp1, i32 1
%tmp3 = sext i16 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vget_laneu8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = extractelement <8 x i8> %tmp1, i32 1
%tmp3 = zext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vget_laneu16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = extractelement <4 x i16> %tmp1, i32 1
%tmp3 = zext i16 %tmp2 to i32
ret i32 %tmp3
}
; Do a vector add to keep the extraction from being done directly from memory.
define i32 @vget_lanei32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = add <2 x i32> %tmp1, %tmp1
%tmp3 = extractelement <2 x i32> %tmp2, i32 1
ret i32 %tmp3
}
define i32 @vgetQ_lanes8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = extractelement <16 x i8> %tmp1, i32 1
%tmp3 = sext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vgetQ_lanes16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = extractelement <8 x i16> %tmp1, i32 1
%tmp3 = sext i16 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vgetQ_laneu8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = extractelement <16 x i8> %tmp1, i32 1
%tmp3 = zext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vgetQ_laneu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = extractelement <8 x i16> %tmp1, i32 1
%tmp3 = zext i16 %tmp2 to i32
ret i32 %tmp3
}
; Do a vector add to keep the extraction from being done directly from memory.
define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = add <4 x i32> %tmp1, %tmp1
%tmp3 = extractelement <4 x i32> %tmp2, i32 1
ret i32 %tmp3
}

test/CodeGen/ARM/vhadd.ll Normal file
@@ -0,0 +1,107 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vhadd\\.s8} %t | count 2
; RUN: grep {vhadd\\.s16} %t | count 2
; RUN: grep {vhadd\\.s32} %t | count 2
; RUN: grep {vhadd\\.u8} %t | count 2
; RUN: grep {vhadd\\.u16} %t | count 2
; RUN: grep {vhadd\\.u32} %t | count 2
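; VHADD is a halving add: each result lane is (a + b) >> 1, with the sum
; computed at full precision so the carry bit is not lost, and the shift
; signed or unsigned to match the element type.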
define <8 x i8> @vhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <16 x i8> @vhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vhsub.ll Normal file
@@ -0,0 +1,107 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vhsub\\.s8} %t | count 2
; RUN: grep {vhsub\\.s16} %t | count 2
; RUN: grep {vhsub\\.s32} %t | count 2
; RUN: grep {vhsub\\.u8} %t | count 2
; RUN: grep {vhsub\\.u16} %t | count 2
; RUN: grep {vhsub\\.u32} %t | count 2
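; VHSUB is the matching halving subtract: each result lane is (a - b) >> 1,
; with the difference computed at full precision before the shift.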
define <8 x i8> @vhsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vhsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <16 x i8> @vhsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vhsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vicmp.ll Normal file
@@ -0,0 +1,85 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vceq\\.i8} %t | count 2
; RUN: grep {vceq\\.i16} %t | count 2
; RUN: grep {vceq\\.i32} %t | count 2
; RUN: grep vmvn %t | count 6
; RUN: grep {vcgt\\.s8} %t | count 1
; RUN: grep {vcge\\.s16} %t | count 1
; RUN: grep {vcgt\\.u16} %t | count 1
; RUN: grep {vcge\\.u32} %t | count 1
; This tests vicmp operations that do not map directly to NEON instructions.
; Not-equal (ne) operations are implemented by VCEQ/VMVN. Less-than (lt/ult)
; and less-than-or-equal (le/ule) are implemented by swapping the arguments
; to VCGT and VCGE. Test all the operand types for not-equal but only sample
; the other operations.
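; For example, "vicmp ne" on <8 x i8> should lower to a sequence like the
; following (registers illustrative only):
;   vceq.i8 d16, d17, d16
;   vmvn    d16, d16
; and "vicmp slt %a, %b" becomes vcgt.s8 with %b and %a swapped.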
define <8 x i8> @vcnei8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp ne <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcnei16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp ne <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcnei32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp ne <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vcneQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp ne <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcneQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp ne <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcneQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp ne <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <16 x i8> @vcltQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp slt <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <4 x i16> @vcles16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp sle <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <4 x i16> @vcltu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp ult <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <4 x i32> @vcleQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp ule <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

test/CodeGen/ARM/vmax.ll Normal file
@@ -0,0 +1,126 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmax\\.s8} %t | count 2
; RUN: grep {vmax\\.s16} %t | count 2
; RUN: grep {vmax\\.s32} %t | count 2
; RUN: grep {vmax\\.u8} %t | count 2
; RUN: grep {vmax\\.u16} %t | count 2
; RUN: grep {vmax\\.u32} %t | count 2
; RUN: grep {vmax\\.f32} %t | count 2
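; There is no max operation in IR, so both the integer and floating-point
; variants go through target intrinsics.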
define <8 x i8> @vmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vmaxQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vmaxQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmaxQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vmaxQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vmaxQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmaxQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vmaxQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vmin.ll (new file)
@ -0,0 +1,126 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmin\\.s8} %t | count 2
; RUN: grep {vmin\\.s16} %t | count 2
; RUN: grep {vmin\\.s32} %t | count 2
; RUN: grep {vmin\\.u8} %t | count 2
; RUN: grep {vmin\\.u16} %t | count 2
; RUN: grep {vmin\\.u32} %t | count 2
; RUN: grep {vmin\\.f32} %t | count 2
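; As with vmax, IR has no min operation, so all variants use intrinsics.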
define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vminQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vminQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vminQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vminQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vminQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vmla.ll (new file)
@ -0,0 +1,77 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmla\\.i8} %t | count 2
; RUN: grep {vmla\\.i16} %t | count 2
; RUN: grep {vmla\\.i32} %t | count 2
; RUN: grep {vmla\\.f32} %t | count 2
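; Multiply-accumulate: each mul feeding an add below should be folded into a
; single vmla, as checked by the grep counts.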
define <8 x i8> @vmlai8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = mul <8 x i8> %tmp2, %tmp3
%tmp5 = add <8 x i8> %tmp1, %tmp4
ret <8 x i8> %tmp5
}
define <4 x i16> @vmlai16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = mul <4 x i16> %tmp2, %tmp3
%tmp5 = add <4 x i16> %tmp1, %tmp4
ret <4 x i16> %tmp5
}
define <2 x i32> @vmlai32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = mul <2 x i32> %tmp2, %tmp3
%tmp5 = add <2 x i32> %tmp1, %tmp4
ret <2 x i32> %tmp5
}
define <2 x float> @vmlaf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = load <2 x float>* %C
%tmp4 = mul <2 x float> %tmp2, %tmp3
%tmp5 = add <2 x float> %tmp1, %tmp4
ret <2 x float> %tmp5
}
define <16 x i8> @vmlaQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = mul <16 x i8> %tmp2, %tmp3
%tmp5 = add <16 x i8> %tmp1, %tmp4
ret <16 x i8> %tmp5
}
define <8 x i16> @vmlaQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = mul <8 x i16> %tmp2, %tmp3
%tmp5 = add <8 x i16> %tmp1, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vmlaQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = mul <4 x i32> %tmp2, %tmp3
%tmp5 = add <4 x i32> %tmp1, %tmp4
ret <4 x i32> %tmp5
}
define <4 x float> @vmlaQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = load <4 x float>* %C
%tmp4 = mul <4 x float> %tmp2, %tmp3
%tmp5 = add <4 x float> %tmp1, %tmp4
ret <4 x float> %tmp5
}

test/CodeGen/ARM/vmlal.ll (new file)
@ -0,0 +1,63 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmlal\\.s8} %t | count 1
; RUN: grep {vmlal\\.s16} %t | count 1
; RUN: grep {vmlal\\.s32} %t | count 1
; RUN: grep {vmlal\\.u8} %t | count 1
; RUN: grep {vmlal\\.u16} %t | count 1
; RUN: grep {vmlal\\.u32} %t | count 1
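; vmlal multiplies the narrow source vectors to double-width elements and
; accumulates the products into the wide destination operand.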
define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vmls.ll (new file)
@ -0,0 +1,77 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmls\\.i8} %t | count 2
; RUN: grep {vmls\\.i16} %t | count 2
; RUN: grep {vmls\\.i32} %t | count 2
; RUN: grep {vmls\\.f32} %t | count 2
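; Same pattern as vmla but with the product subtracted: a mul feeding a sub
; should fold into a single vmls.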
define <8 x i8> @vmlsi8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = mul <8 x i8> %tmp2, %tmp3
%tmp5 = sub <8 x i8> %tmp1, %tmp4
ret <8 x i8> %tmp5
}
define <4 x i16> @vmlsi16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = mul <4 x i16> %tmp2, %tmp3
%tmp5 = sub <4 x i16> %tmp1, %tmp4
ret <4 x i16> %tmp5
}
define <2 x i32> @vmlsi32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = mul <2 x i32> %tmp2, %tmp3
%tmp5 = sub <2 x i32> %tmp1, %tmp4
ret <2 x i32> %tmp5
}
define <2 x float> @vmlsf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = load <2 x float>* %C
%tmp4 = mul <2 x float> %tmp2, %tmp3
%tmp5 = sub <2 x float> %tmp1, %tmp4
ret <2 x float> %tmp5
}
define <16 x i8> @vmlsQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = mul <16 x i8> %tmp2, %tmp3
%tmp5 = sub <16 x i8> %tmp1, %tmp4
ret <16 x i8> %tmp5
}
define <8 x i16> @vmlsQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = mul <8 x i16> %tmp2, %tmp3
%tmp5 = sub <8 x i16> %tmp1, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vmlsQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = mul <4 x i32> %tmp2, %tmp3
%tmp5 = sub <4 x i32> %tmp1, %tmp4
ret <4 x i32> %tmp5
}
define <4 x float> @vmlsQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = load <4 x float>* %C
%tmp4 = mul <4 x float> %tmp2, %tmp3
%tmp5 = sub <4 x float> %tmp1, %tmp4
ret <4 x float> %tmp5
}

test/CodeGen/ARM/vmlsl.ll (new file)
@ -0,0 +1,63 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmlsl\\.s8} %t | count 1
; RUN: grep {vmlsl\\.s16} %t | count 1
; RUN: grep {vmlsl\\.s32} %t | count 1
; RUN: grep {vmlsl\\.u8} %t | count 1
; RUN: grep {vmlsl\\.u16} %t | count 1
; RUN: grep {vmlsl\\.u32} %t | count 1
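; Widening multiply-subtract: the double-width products are subtracted from
; the wide accumulator operand.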
define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vmov.ll (new file)
@ -0,0 +1,101 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vmov.i8 %t | count 2
; RUN: grep vmov.i16 %t | count 4
; RUN: grep vmov.i32 %t | count 12
; RUN: grep vmov.i64 %t | count 2
; Note: function names do not include "vmov" to allow simple grep for opcodes
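; The constants below exercise the NEON modified-immediate encodings: an
; 8-bit value placed in any byte lane (i8/i16/i32), the 0x__FF and 0x__FFFF
; forms (i32 only), and i64 immediates whose bytes are each 0x00 or 0xFF.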
define <8 x i8> @v_movi8() nounwind {
ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <4 x i16> @v_movi16a() nounwind {
ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 >
}
; 0x1000 = 4096
define <4 x i16> @v_movi16b() nounwind {
ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <2 x i32> @v_movi32a() nounwind {
ret <2 x i32> < i32 32, i32 32 >
}
; 0x2000 = 8192
define <2 x i32> @v_movi32b() nounwind {
ret <2 x i32> < i32 8192, i32 8192 >
}
; 0x200000 = 2097152
define <2 x i32> @v_movi32c() nounwind {
ret <2 x i32> < i32 2097152, i32 2097152 >
}
; 0x20000000 = 536870912
define <2 x i32> @v_movi32d() nounwind {
ret <2 x i32> < i32 536870912, i32 536870912 >
}
; 0x20ff = 8447
define <2 x i32> @v_movi32e() nounwind {
ret <2 x i32> < i32 8447, i32 8447 >
}
; 0x20ffff = 2162687
define <2 x i32> @v_movi32f() nounwind {
ret <2 x i32> < i32 2162687, i32 2162687 >
}
; 0xff0000ff0000ffff = 18374687574888349695
define <1 x i64> @v_movi64() nounwind {
ret <1 x i64> < i64 18374687574888349695 >
}
define <16 x i8> @v_movQi8() nounwind {
ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <8 x i16> @v_movQi16a() nounwind {
ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
}
; 0x1000 = 4096
define <8 x i16> @v_movQi16b() nounwind {
ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <4 x i32> @v_movQi32a() nounwind {
ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 >
}
; 0x2000 = 8192
define <4 x i32> @v_movQi32b() nounwind {
ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 >
}
; 0x200000 = 2097152
define <4 x i32> @v_movQi32c() nounwind {
ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 >
}
; 0x20000000 = 536870912
define <4 x i32> @v_movQi32d() nounwind {
ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 >
}
; 0x20ff = 8447
define <4 x i32> @v_movQi32e() nounwind {
ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 >
}
; 0x20ffff = 2162687
define <4 x i32> @v_movQi32f() nounwind {
ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 >
}
; 0xff0000ff0000ffff = 18374687574888349695
define <2 x i64> @v_movQi64() nounwind {
ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
}

test/CodeGen/ARM/vmovl.ll (new file)
@ -0,0 +1,51 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmovl\\.s8} %t | count 1
; RUN: grep {vmovl\\.s16} %t | count 1
; RUN: grep {vmovl\\.s32} %t | count 1
; RUN: grep {vmovl\\.u8} %t | count 1
; RUN: grep {vmovl\\.u16} %t | count 1
; RUN: grep {vmovl\\.u32} %t | count 1
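; vmovl widens each element to double width, sign-extending for the .s
; variants and zero-extending for the .u variants.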
define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1)
ret <2 x i64> %tmp2
}
define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1)
ret <2 x i64> %tmp2
}
declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone

test/CodeGen/ARM/vmovn.ll (new file)
@ -0,0 +1,26 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmovn\\.i16} %t | count 1
; RUN: grep {vmovn\\.i32} %t | count 1
; RUN: grep {vmovn\\.i64} %t | count 1
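; vmovn narrows each element to half width, keeping the least significant half.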
define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vmovni32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vmovni64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64>) nounwind readnone

test/CodeGen/ARM/vmul.ll (new file)
@ -0,0 +1,79 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmul\\.i8} %t | count 2
; RUN: grep {vmul\\.i16} %t | count 2
; RUN: grep {vmul\\.i32} %t | count 2
; RUN: grep {vmul\\.f32} %t | count 2
; RUN: grep {vmul\\.p8} %t | count 2
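; Plain mul covers the integer and float cases; the .p8 polynomial
; (carry-less) multiply has no IR equivalent and goes through an intrinsic.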
define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = mul <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vmuli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = mul <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vmuli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = mul <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x float> @vmulf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = mul <2 x float> %tmp1, %tmp2
ret <2 x float> %tmp3
}
define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <16 x i8> @vmulQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = mul <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vmulQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = mul <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vmulQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = mul <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x float> @vmulQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = mul <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}
define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

test/CodeGen/ARM/vmull.ll (new file)
@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmull\\.s8} %t | count 1
; RUN: grep {vmull\\.s16} %t | count 1
; RUN: grep {vmull\\.s32} %t | count 1
; RUN: grep {vmull\\.u8} %t | count 1
; RUN: grep {vmull\\.u16} %t | count 1
; RUN: grep {vmull\\.u32} %t | count 1
; RUN: grep {vmull\\.p8} %t | count 1
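; Widening multiply: double-width products of the narrow operands; the .p8
; form is a polynomial multiply.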
define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone

test/CodeGen/ARM/vmvn.ll (new file)
@ -0,0 +1,51 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vmvn %t | count 8
; Note: function names do not include "vmvn" to allow simple grep for opcodes
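; XOR with an all-ones vector is the canonical IR spelling of NOT and should
; select to a single vmvn.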
define <8 x i8> @v_mvni8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @v_mvni16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @v_mvni32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
ret <2 x i32> %tmp2
}
define <1 x i64> @v_mvni64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = xor <1 x i64> %tmp1, < i64 -1 >
ret <1 x i64> %tmp2
}
define <16 x i8> @v_mvnQi8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @v_mvnQi16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @v_mvnQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
ret <4 x i32> %tmp2
}
define <2 x i64> @v_mvnQi64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
ret <2 x i64> %tmp2
}

test/CodeGen/ARM/vneg.ll (new file)
@ -0,0 +1,53 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vneg\\.s8} %t | count 2
; RUN: grep {vneg\\.s16} %t | count 2
; RUN: grep {vneg\\.s32} %t | count 2
; RUN: grep {vneg\\.f32} %t | count 2
define <8 x i8> @vnegs8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = sub <8 x i8> zeroinitializer, %tmp1
ret <8 x i8> %tmp2
}
define <4 x i16> @vnegs16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = sub <4 x i16> zeroinitializer, %tmp1
ret <4 x i16> %tmp2
}
define <2 x i32> @vnegs32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = sub <2 x i32> zeroinitializer, %tmp1
ret <2 x i32> %tmp2
}
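; Floating-point negation is written as subtraction from -0.0, the IR idiom
; that preserves the sign of zero and can map directly to vneg.f32.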
define <2 x float> @vnegf32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = sub <2 x float> < float -0.000000e+00, float -0.000000e+00 >, %tmp1
ret <2 x float> %tmp2
}
define <16 x i8> @vnegQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = sub <16 x i8> zeroinitializer, %tmp1
ret <16 x i8> %tmp2
}
define <8 x i16> @vnegQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = sub <8 x i16> zeroinitializer, %tmp1
ret <8 x i16> %tmp2
}
define <4 x i32> @vnegQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = sub <4 x i32> zeroinitializer, %tmp1
ret <4 x i32> %tmp2
}
define <4 x float> @vnegQf32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = sub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %tmp1
ret <4 x float> %tmp2
}

test/CodeGen/ARM/vorn.ll (new file)
@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vorn %t | count 8
; Note: function names do not include "vorn" to allow simple grep for opcodes
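; OR with the complement of the second operand should fold the vmvn into a
; single vorn.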
define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = or <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = or <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
%tmp4 = or <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
%tmp4 = or <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = or <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = or <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp4 = or <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
%tmp4 = or <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}

test/CodeGen/ARM/vorr.ll (new file)
@ -0,0 +1,59 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vorr %t | count 8
; Note: function names do not include "vorr" to allow simple grep for opcodes
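; Plain IR 'or' should select directly to vorr.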
define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = or <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = or <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = or <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = or <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = or <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = or <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = or <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = or <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

test/CodeGen/ARM/vpadal.ll (new file)
@ -0,0 +1,107 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vpadal\\.s8} %t | count 2
; RUN: grep {vpadal\\.s16} %t | count 2
; RUN: grep {vpadal\\.s32} %t | count 2
; RUN: grep {vpadal\\.u8} %t | count 2
; RUN: grep {vpadal\\.u16} %t | count 2
; RUN: grep {vpadal\\.u32} %t | count 2
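; vpadal pairwise-adds adjacent elements of the second operand and
; accumulates the double-width sums into the first, which is why these
; intrinsics are overloaded on two different vector types.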
define <4 x i16> @vpadals8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %tmp1, <8 x i8> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpadals16(<2 x i32>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %tmp1, <4 x i16> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vpadals32(<1 x i64>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %tmp1, <2 x i32> %tmp2)
ret <1 x i64> %tmp3
}
define <4 x i16> @vpadalu8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %tmp1, <8 x i8> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpadalu16(<2 x i32>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %tmp1, <4 x i16> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vpadalu32(<1 x i64>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %tmp1, <2 x i32> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i16> @vpadalQs8(<8 x i16>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %tmp1, <16 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vpadalQs16(<4 x i32>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %tmp1, <8 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vpadalQs32(<2 x i64>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %tmp1, <4 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vpadalQu8(<8 x i16>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %tmp1, <16 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vpadalQu16(<4 x i32>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %tmp1, <8 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vpadalQu32(<2 x i64>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %tmp1, <4 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16>, <8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64>, <2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16>, <8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32>, <4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16>, <16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32>, <8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64>, <4 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16>, <16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32>, <8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vpadd.ll (new file)
@ -0,0 +1,39 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vpadd\\.i8} %t | count 1
; RUN: grep {vpadd\\.i16} %t | count 1
; RUN: grep {vpadd\\.i32} %t | count 1
; RUN: grep {vpadd\\.f32} %t | count 1
define <8 x i8> @vpaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpaddi.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpaddi.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpaddi.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vpaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vpaddf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vpaddi.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpaddi.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddi.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vpaddf.v2f32(<2 x float>, <2 x float>) nounwind readnone

test/CodeGen/ARM/vpaddl.ll (new file)
@ -0,0 +1,95 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vpaddl\\.s8} %t | count 2
; RUN: grep {vpaddl\\.s16} %t | count 2
; RUN: grep {vpaddl\\.s32} %t | count 2
; RUN: grep {vpaddl\\.u8} %t | count 2
; RUN: grep {vpaddl\\.u16} %t | count 2
; RUN: grep {vpaddl\\.u32} %t | count 2
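; vpaddl pairwise-adds adjacent elements and yields a vector with half as
; many elements of twice the width.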
define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp2
}
define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp2
}
define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1)
ret <2 x i32> %tmp2
}
define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1)
ret <1 x i64> %tmp2
}
define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp2
}
define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
ret <2 x i64> %tmp2
}
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone

test/CodeGen/ARM/vpmax.ll (new file)
@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vpmax\\.s8} %t | count 1
; RUN: grep {vpmax\\.s16} %t | count 1
; RUN: grep {vpmax\\.s32} %t | count 1
; RUN: grep {vpmax\\.u8} %t | count 1
; RUN: grep {vpmax\\.u16} %t | count 1
; RUN: grep {vpmax\\.u32} %t | count 1
; RUN: grep {vpmax\\.f32} %t | count 1
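; vpmax reduces adjacent pairs from both source vectors to their maxima;
; only the 64-bit (D register) forms exist.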
define <8 x i8> @vpmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vpmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone

test/CodeGen/ARM/vpmin.ll (new file)
@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vpmin\\.s8} %t | count 1
; RUN: grep {vpmin\\.s16} %t | count 1
; RUN: grep {vpmin\\.s32} %t | count 1
; RUN: grep {vpmin\\.u8} %t | count 1
; RUN: grep {vpmin\\.u16} %t | count 1
; RUN: grep {vpmin\\.u32} %t | count 1
; RUN: grep {vpmin\\.f32} %t | count 1
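; Pairwise minimum; like vpmax, available only on 64-bit vectors.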
define <8 x i8> @vpmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vpminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vpminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float>, <2 x float>) nounwind readnone

test/CodeGen/ARM/vqabs.ll (new file)
@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqabs\\.s8} %t | count 2
; RUN: grep {vqabs\\.s16} %t | count 2
; RUN: grep {vqabs\\.s32} %t | count 2
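; Saturating absolute value: the most negative value saturates to the most
; positive instead of wrapping.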
define <8 x i8> @vqabss8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqabss16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqabss32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vqabsQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqabsQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqabsQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) nounwind readnone

test/CodeGen/ARM/vqadd.ll (new file)
@ -0,0 +1,141 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqadd\\.s8} %t | count 2
; RUN: grep {vqadd\\.s16} %t | count 2
; RUN: grep {vqadd\\.s32} %t | count 2
; RUN: grep {vqadd\\.s64} %t | count 2
; RUN: grep {vqadd\\.u8} %t | count 2
; RUN: grep {vqadd\\.u16} %t | count 2
; RUN: grep {vqadd\\.u32} %t | count 2
; RUN: grep {vqadd\\.u64} %t | count 2
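; Saturating addition: results clamp to the signed or unsigned range of the
; element type instead of wrapping.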
define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@@ -0,0 +1,22 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqdmlal\\.s16} %t | count 1
; RUN: grep {vqdmlal\\.s32} %t | count 1
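; Signed saturating doubling multiply-accumulate long: i16 lanes widen to i32,
; i32 lanes widen to i64.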
define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@@ -0,0 +1,22 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqdmlsl\\.s16} %t | count 1
; RUN: grep {vqdmlsl\\.s32} %t | count 1
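; Signed saturating doubling multiply-subtract long.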
define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

@@ -0,0 +1,73 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqdmulh\\.s16} %t | count 2
; RUN: grep {vqdmulh\\.s32} %t | count 2
; RUN: grep {vqrdmulh\\.s16} %t | count 2
; RUN: grep {vqrdmulh\\.s32} %t | count 2
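; Signed saturating doubling multiply, returning the high half of the result;
; vqrdmulh is the rounding variant.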
define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

@@ -0,0 +1,20 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqdmull\\.s16} %t | count 1
; RUN: grep {vqdmull\\.s32} %t | count 1
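; Signed saturating doubling multiply long: result elements are twice the
; width of the operands.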
define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

@@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqmovn\\.s16} %t | count 1
; RUN: grep {vqmovn\\.s32} %t | count 1
; RUN: grep {vqmovn\\.s64} %t | count 1
; RUN: grep {vqmovn\\.u16} %t | count 1
; RUN: grep {vqmovn\\.u32} %t | count 1
; RUN: grep {vqmovn\\.u64} %t | count 1
; RUN: grep {vqmovun\\.s16} %t | count 1
; RUN: grep {vqmovun\\.s32} %t | count 1
; RUN: grep {vqmovun\\.s64} %t | count 1
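; Saturating narrowing moves: signed (vqmovns), unsigned (vqmovnu), and
; signed-to-unsigned (vqmovnsu, which selects the vqmovun instruction).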
define <8 x i8> @vqmovns16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovns32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovns64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqmovnu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovnu32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovnu64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqmovuns16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqmovuns32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqmovuns64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) nounwind readnone

test/CodeGen/ARM/vqneg.ll (new file, 48 lines)

@@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqneg\\.s8} %t | count 2
; RUN: grep {vqneg\\.s16} %t | count 2
; RUN: grep {vqneg\\.s32} %t | count 2
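; Signed saturating negate on 64-bit and 128-bit vectors.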
define <8 x i8> @vqnegs8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqnegs16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqnegs32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vqnegQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqnegQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqnegQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) nounwind readnone

test/CodeGen/ARM/vqrshl.ll (new file, 141 lines)

@@ -0,0 +1,141 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqrshl\\.s8} %t | count 2
; RUN: grep {vqrshl\\.s16} %t | count 2
; RUN: grep {vqrshl\\.s32} %t | count 2
; RUN: grep {vqrshl\\.s64} %t | count 2
; RUN: grep {vqrshl\\.u8} %t | count 2
; RUN: grep {vqrshl\\.u16} %t | count 2
; RUN: grep {vqrshl\\.u32} %t | count 2
; RUN: grep {vqrshl\\.u64} %t | count 2
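; Saturating rounding shift left, with per-element shift amounts taken from
; the second vector operand.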
define <8 x i8> @vqrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vqrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vqrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vqrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqrshrn\\.s16} %t | count 1
; RUN: grep {vqrshrn\\.s32} %t | count 1
; RUN: grep {vqrshrn\\.s64} %t | count 1
; RUN: grep {vqrshrn\\.u16} %t | count 1
; RUN: grep {vqrshrn\\.u32} %t | count 1
; RUN: grep {vqrshrn\\.u64} %t | count 1
; RUN: grep {vqrshrun\\.s16} %t | count 1
; RUN: grep {vqrshrun\\.s32} %t | count 1
; RUN: grep {vqrshrun\\.s64} %t | count 1
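; Saturating rounding shift right and narrow; the negative constant vectors
; encode the right-shift amount.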
define <8 x i8> @vqrshrns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshrns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshrns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqrshrnu8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshrnu16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshrnu32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqrshruns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqrshruns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqrshruns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vqshl.ll (new file, 307 lines)

@@ -0,0 +1,307 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqshl\\.s8} %t | count 4
; RUN: grep {vqshl\\.s16} %t | count 4
; RUN: grep {vqshl\\.s32} %t | count 4
; RUN: grep {vqshl\\.s64} %t | count 4
; RUN: grep {vqshl\\.u8} %t | count 4
; RUN: grep {vqshl\\.u16} %t | count 4
; RUN: grep {vqshl\\.u32} %t | count 4
; RUN: grep {vqshl\\.u64} %t | count 4
; RUN: grep {vqshl\\.s8.*#7} %t | count 2
; RUN: grep {vqshl\\.s16.*#15} %t | count 2
; RUN: grep {vqshl\\.s32.*#31} %t | count 2
; RUN: grep {vqshl\\.s64.*#63} %t | count 2
; RUN: grep {vqshl\\.u8.*#7} %t | count 2
; RUN: grep {vqshl\\.u16.*#15} %t | count 2
; RUN: grep {vqshl\\.u32.*#31} %t | count 2
; RUN: grep {vqshl\\.u64.*#63} %t | count 2
; RUN: grep {vqshlu\\.s8} %t | count 2
; RUN: grep {vqshlu\\.s16} %t | count 2
; RUN: grep {vqshlu\\.s32} %t | count 2
; RUN: grep {vqshlu\\.s64} %t | count 2
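; Saturating shift left, with shift amounts taken from a second vector
; operand; constant-shift and signed-to-unsigned cases are tested below
; (see the comment before the _n functions).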
define <8 x i8> @vqshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vqshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vqshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vqshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
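; The constant shift vectors below select the immediate encodings:
; vqshl #n for vqshifts/vqshiftu, and vqshlu #n for the signed-to-unsigned
; vqshiftsu intrinsics.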
define <8 x i8> @vqshls_n8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshls_n16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshls_n32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vqshls_n64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vqshlu_n8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshlu_n16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshlu_n32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vqshlu_n64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vqshlsu_n8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshlsu_n16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshlsu_n32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vqshlsu_n64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vqshlQs_n8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqshlQs_n16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqshlQs_n32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vqshlQs_n64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vqshlQu_n8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqshlQu_n16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqshlQu_n32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vqshlQu_n64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vqshlQsu_n8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqshlQsu_n16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqshlQsu_n32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vqshlQsu_n64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqshrn\\.s16} %t | count 1
; RUN: grep {vqshrn\\.s32} %t | count 1
; RUN: grep {vqshrn\\.s64} %t | count 1
; RUN: grep {vqshrn\\.u16} %t | count 1
; RUN: grep {vqshrn\\.u32} %t | count 1
; RUN: grep {vqshrn\\.u64} %t | count 1
; RUN: grep {vqshrun\\.s16} %t | count 1
; RUN: grep {vqshrun\\.s32} %t | count 1
; RUN: grep {vqshrun\\.s64} %t | count 1
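; Saturating shift right and narrow (the non-rounding counterpart of vqrshrn),
; again with negative constants encoding the right-shift amount.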
define <8 x i8> @vqshrns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshrns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshrns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqshrnu8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshrnu16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshrnu32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqshruns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshruns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshruns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vqsub.ll (new file, 141 lines)

@@ -0,0 +1,141 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vqsub\\.s8} %t | count 2
; RUN: grep {vqsub\\.s16} %t | count 2
; RUN: grep {vqsub\\.s32} %t | count 2
; RUN: grep {vqsub\\.s64} %t | count 2
; RUN: grep {vqsub\\.u8} %t | count 2
; RUN: grep {vqsub\\.u16} %t | count 2
; RUN: grep {vqsub\\.u32} %t | count 2
; RUN: grep {vqsub\\.u64} %t | count 2
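; Saturating integer subtract, signed and unsigned.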
define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

@@ -0,0 +1,29 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vraddhn\\.i16} %t | count 1
; RUN: grep {vraddhn\\.i32} %t | count 1
; RUN: grep {vraddhn\\.i64} %t | count 1
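; Rounding add and narrow, returning the high half of each sum.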
define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vraddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vraddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

@@ -0,0 +1,33 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrecpe\\.u32} %t | count 2
; RUN: grep {vrecpe\\.f32} %t | count 2
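; Reciprocal estimate, in both integer (u32) and floating-point (f32) forms.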
define <2 x i32> @vrecpei32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <4 x i32> @vrecpeQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x float> @vrecpef32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float>) nounwind readnone

@@ -0,0 +1,19 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrecps\\.f32} %t | count 2
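; Reciprocal step, used together with vrecpe for Newton-Raphson refinement.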
define <2 x float> @vrecpsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <4 x float> @vrecpsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vrhadd.ll (new file, 107 lines)

@@ -0,0 +1,107 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrhadd\\.s8} %t | count 2
; RUN: grep {vrhadd\\.s16} %t | count 2
; RUN: grep {vrhadd\\.s32} %t | count 2
; RUN: grep {vrhadd\\.u8} %t | count 2
; RUN: grep {vrhadd\\.u16} %t | count 2
; RUN: grep {vrhadd\\.u32} %t | count 2
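; Rounding halving add, signed and unsigned.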
define <8 x i8> @vrhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vrhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <16 x i8> @vrhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vrhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vrshl.ll (new file, 245 lines)

@@ -0,0 +1,245 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrshl\\.s8} %t | count 2
; RUN: grep {vrshl\\.s16} %t | count 2
; RUN: grep {vrshl\\.s32} %t | count 2
; RUN: grep {vrshl\\.s64} %t | count 2
; RUN: grep {vrshl\\.u8} %t | count 2
; RUN: grep {vrshl\\.u16} %t | count 2
; RUN: grep {vrshl\\.u32} %t | count 2
; RUN: grep {vrshl\\.u64} %t | count 2
; RUN: grep {vrshr\\.s8} %t | count 2
; RUN: grep {vrshr\\.s16} %t | count 2
; RUN: grep {vrshr\\.s32} %t | count 2
; RUN: grep {vrshr\\.s64} %t | count 2
; RUN: grep {vrshr\\.u8} %t | count 2
; RUN: grep {vrshr\\.u16} %t | count 2
; RUN: grep {vrshr\\.u32} %t | count 2
; RUN: grep {vrshr\\.u64} %t | count 2
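; Rounding shift left by per-element shift amounts, signed and unsigned;
; the constant-shift cases follow the register forms below.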
define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
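; Negative constant shift counts below select the vrshr immediate forms.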
define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone


@ -0,0 +1,26 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrshrn\\.i16} %t | count 1
; RUN: grep {vrshrn\\.i32} %t | count 1
; RUN: grep {vrshrn\\.i64} %t | count 1
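; The shift intrinsics take the shift amount as a vector constant; a right
; shift by N is written as a shift by -N, so vrshiftn with all-(-8) amounts
; is expected to select vrshrn.i16, and likewise for the wider types.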
define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vrshrns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vrshrns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone


@ -0,0 +1,33 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrsqrte\\.u32} %t | count 2
; RUN: grep {vrsqrte\\.f32} %t | count 2
define <2 x i32> @vrsqrtei32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <4 x i32> @vrsqrteQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float>) nounwind readnone


@ -0,0 +1,19 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrsqrts\\.f32} %t | count 2
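; VRSQRTS computes (3 - a*b)/2, the Newton-Raphson step used to refine a
; reciprocal square root estimate produced by VRSQRTE.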
define <2 x float> @vrsqrtsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <4 x float> @vrsqrtsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone


@ -0,0 +1,29 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vrsubhn\\.i16} %t | count 1
; RUN: grep {vrsubhn\\.i32} %t | count 1
; RUN: grep {vrsubhn\\.i64} %t | count 1
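; vrsubhn is the rounding variant of vsubhn: subtract, round, and narrow
; each result to the high half of the element.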
define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vrsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vrsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone


@ -0,0 +1,40 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmov\\.8} %t | count 2
; RUN: grep {vmov\\.16} %t | count 2
; RUN: grep {vmov\\.32} %t | count 2
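; insertelement with a constant lane index is expected to select a single
; vmov lane move of the matching element size.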
define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1
ret <8 x i8> %tmp2
}
define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1
ret <4 x i16> %tmp2
}
define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1
ret <2 x i32> %tmp2
}
define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1
ret <16 x i8> %tmp2
}
define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1
ret <8 x i16> %tmp2
}
define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
ret <4 x i32> %tmp2
}

test/CodeGen/ARM/vshift.ll

@ -0,0 +1,337 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vshl\\.s8} %t | count 2
; RUN: grep {vshl\\.s16} %t | count 2
; RUN: grep {vshl\\.s32} %t | count 2
; RUN: grep {vshl\\.s64} %t | count 2
; RUN: grep {vshl\\.u8} %t | count 4
; RUN: grep {vshl\\.u16} %t | count 4
; RUN: grep {vshl\\.u32} %t | count 4
; RUN: grep {vshl\\.u64} %t | count 4
; RUN: grep {vshl\\.i8} %t | count 2
; RUN: grep {vshl\\.i16} %t | count 2
; RUN: grep {vshl\\.i32} %t | count 2
; RUN: grep {vshl\\.i64} %t | count 2
; RUN: grep {vshr\\.u8} %t | count 2
; RUN: grep {vshr\\.u16} %t | count 2
; RUN: grep {vshr\\.u32} %t | count 2
; RUN: grep {vshr\\.u64} %t | count 2
; RUN: grep {vshr\\.s8} %t | count 2
; RUN: grep {vshr\\.s16} %t | count 2
; RUN: grep {vshr\\.s32} %t | count 2
; RUN: grep {vshr\\.s64} %t | count 2
; RUN: grep {vneg\\.s8} %t | count 4
; RUN: grep {vneg\\.s16} %t | count 4
; RUN: grep {vneg\\.s32} %t | count 4
; RUN: grep {vsub\\.i64} %t | count 4
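; NEON has no right-shift-by-register instructions, so variable lshr/ashr
; are expected to be lowered to a left shift by the negated amount; the
; vneg (and vsub.i64 for the 64-bit case) counts above check for that.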
define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = shl <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shl <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = shl <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = shl <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <8 x i8> @vshli8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = shl <8 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vshli16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = shl <4 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vshli32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = shl <2 x i32> %tmp1, < i32 31, i32 31 >
ret <2 x i32> %tmp2
}
define <1 x i64> @vshli64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = shl <1 x i64> %tmp1, < i64 63 >
ret <1 x i64> %tmp2
}
define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = shl <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shl <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = shl <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = shl <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = shl <16 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = shl <8 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = shl <4 x i32> %tmp1, < i32 31, i32 31, i32 31, i32 31 >
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = shl <2 x i64> %tmp1, < i64 63, i64 63 >
ret <2 x i64> %tmp2
}
define <8 x i8> @vlshru8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = lshr <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vlshru16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = lshr <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vlshru32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = lshr <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vlshru64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = lshr <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <8 x i8> @vlshri8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = lshr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vlshri16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = lshr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vlshri32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = lshr <2 x i32> %tmp1, < i32 32, i32 32 >
ret <2 x i32> %tmp2
}
define <1 x i64> @vlshri64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = lshr <1 x i64> %tmp1, < i64 64 >
ret <1 x i64> %tmp2
}
define <16 x i8> @vlshrQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = lshr <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vlshrQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = lshr <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vlshrQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = lshr <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vlshrQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = lshr <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <16 x i8> @vlshrQi8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = lshr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vlshrQi16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = lshr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vlshrQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = lshr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 >
ret <4 x i32> %tmp2
}
define <2 x i64> @vlshrQi64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = lshr <2 x i64> %tmp1, < i64 64, i64 64 >
ret <2 x i64> %tmp2
}
define <8 x i8> @vashrs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = ashr <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vashrs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = ashr <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vashrs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = ashr <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vashrs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = ashr <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <8 x i8> @vashri8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = ashr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vashri16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = ashr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vashri32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = ashr <2 x i32> %tmp1, < i32 32, i32 32 >
ret <2 x i32> %tmp2
}
define <1 x i64> @vashri64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = ashr <1 x i64> %tmp1, < i64 64 >
ret <1 x i64> %tmp2
}
define <16 x i8> @vashrQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = ashr <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vashrQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = ashr <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vashrQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = ashr <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vashrQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = ashr <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <16 x i8> @vashrQi8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = ashr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vashrQi16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = ashr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vashrQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = ashr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 >
ret <4 x i32> %tmp2
}
define <2 x i64> @vashrQi64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = ashr <2 x i64> %tmp1, < i64 64, i64 64 >
ret <2 x i64> %tmp2
}


@ -0,0 +1,131 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vsli\\.8} %t | count 2
; RUN: grep {vsli\\.16} %t | count 2
; RUN: grep {vsli\\.32} %t | count 2
; RUN: grep {vsli\\.64} %t | count 2
; RUN: grep {vsri\\.8} %t | count 2
; RUN: grep {vsri\\.16} %t | count 2
; RUN: grep {vsri\\.32} %t | count 2
; RUN: grep {vsri\\.64} %t | count 2
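; The vshiftins intrinsic encodes the direction in the sign of the shift
; amount: positive amounts should select vsli, negative amounts vsri.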
define <8 x i8> @vsli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp3
}
define <4 x i16> @vsli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp3
}
define <2 x i32> @vsli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp3
}
define <1 x i64> @vsli64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp3
}
define <16 x i8> @vsliQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp3
}
define <8 x i16> @vsliQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsliQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsliQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp3
}
define <8 x i8> @vsri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp3
}
define <4 x i16> @vsri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp3
}
define <2 x i32> @vsri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp3
}
define <1 x i64> @vsri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp3
}
define <16 x i8> @vsriQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp3
}
define <8 x i16> @vsriQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsriQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsriQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vshl.ll

@ -0,0 +1,302 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vshl\\.s8} %t | count 2
; RUN: grep {vshl\\.s16} %t | count 2
; RUN: grep {vshl\\.s32} %t | count 2
; RUN: grep {vshl\\.s64} %t | count 2
; RUN: grep {vshl\\.u8} %t | count 2
; RUN: grep {vshl\\.u16} %t | count 2
; RUN: grep {vshl\\.u32} %t | count 2
; RUN: grep {vshl\\.u64} %t | count 2
; RUN: grep {vshl\\.i8} %t | count 2
; RUN: grep {vshl\\.i16} %t | count 2
; RUN: grep {vshl\\.i32} %t | count 2
; RUN: grep {vshl\\.i64} %t | count 2
; RUN: grep {vshr\\.s8} %t | count 2
; RUN: grep {vshr\\.s16} %t | count 2
; RUN: grep {vshr\\.s32} %t | count 2
; RUN: grep {vshr\\.s64} %t | count 2
; RUN: grep {vshr\\.u8} %t | count 2
; RUN: grep {vshr\\.u16} %t | count 2
; RUN: grep {vshr\\.u32} %t | count 2
; RUN: grep {vshr\\.u64} %t | count 2
define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <8 x i8> @vshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <1 x i64> @vshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
ret <1 x i64> %tmp3
}
define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
define <16 x i8> @vshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i64> %tmp3
}
; For left shifts by immediates, the signedness is irrelevant.
; Test a mix of both signed and unsigned intrinsics.
define <8 x i8> @vshli8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshli16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshli32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vshli64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
}
; Right shift by immediate:
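; A right shift by N is written as a shift amount of -N, so the vshifts
; calls below should select vshr.s* and the vshiftu calls vshr.u*.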
define <8 x i8> @vshrs8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshrs16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshrs32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vshrs64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vshru8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshru16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshru32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vshru64(<1 x i64>* %A) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vshrQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vshrQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshrQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshrQs64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vshrQu8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vshrQu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshrQu32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshrQu64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vshll.ll

@ -0,0 +1,74 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vshll\\.s8} %t | count 1
; RUN: grep {vshll\\.s16} %t | count 1
; RUN: grep {vshll\\.s32} %t | count 1
; RUN: grep {vshll\\.u8} %t | count 1
; RUN: grep {vshll\\.u16} %t | count 1
; RUN: grep {vshll\\.u32} %t | count 1
; RUN: grep {vshll\\.i8} %t | count 1
; RUN: grep {vshll\\.i16} %t | count 1
; RUN: grep {vshll\\.i32} %t | count 1
define <8 x i16> @vshlls8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlls16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlls32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i64> %tmp2
}
define <8 x i16> @vshllu8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshllu16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshllu32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i64> %tmp2
}
; The following tests use the maximum shift count, so the signedness is
; irrelevant. Test both signed and unsigned versions.
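; (A shift-left-long by exactly the element width has a distinct encoding,
; so these should match the vshll.i8/.i16/.i32 patterns counted above.)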
define <8 x i16> @vshlli8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlli16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 16, i16 16, i16 16, i16 16 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlli32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 32, i32 32 >)
ret <2 x i64> %tmp2
}
declare <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vshrn.ll

@ -0,0 +1,26 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vshrn\\.i16} %t | count 1
; RUN: grep {vshrn\\.i32} %t | count 1
; RUN: grep {vshrn\\.i64} %t | count 1
define <8 x i8> @vshrns8(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshrns16(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

293
test/CodeGen/ARM/vsra.ll Normal file
View File

@ -0,0 +1,293 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vsra\\.s8} %t | count 2
; RUN: grep {vsra\\.s16} %t | count 2
; RUN: grep {vsra\\.s32} %t | count 2
; RUN: grep {vsra\\.s64} %t | count 2
; RUN: grep {vsra\\.u8} %t | count 2
; RUN: grep {vsra\\.u16} %t | count 2
; RUN: grep {vsra\\.u32} %t | count 2
; RUN: grep {vsra\\.u64} %t | count 2
; RUN: grep {vrsra\\.s8} %t | count 2
; RUN: grep {vrsra\\.s16} %t | count 2
; RUN: grep {vrsra\\.s32} %t | count 2
; RUN: grep {vrsra\\.s64} %t | count 2
; RUN: grep {vrsra\\.u8} %t | count 2
; RUN: grep {vrsra\\.u16} %t | count 2
; RUN: grep {vrsra\\.u32} %t | count 2
; RUN: grep {vrsra\\.u64} %t | count 2
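; A vector shift whose result feeds an add should be combined into a single
; shift-and-accumulate: plain IR shifts give vsra, and the rounding shift
; intrinsics give vrsra.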
define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = ashr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
%tmp4 = add <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = ashr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
%tmp4 = add <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = ashr <2 x i32> %tmp2, < i32 32, i32 32 >
%tmp4 = add <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @vsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = ashr <1 x i64> %tmp2, < i64 64 >
%tmp4 = add <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = ashr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
%tmp4 = add <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = ashr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = ashr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = ashr <2 x i64> %tmp2, < i64 64, i64 64 >
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = lshr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
%tmp4 = add <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = lshr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
%tmp4 = add <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = lshr <2 x i32> %tmp2, < i32 32, i32 32 >
%tmp4 = add <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @vsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = lshr <1 x i64> %tmp2, < i64 64 >
%tmp4 = add <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = lshr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
%tmp4 = add <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = lshr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = lshr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = lshr <2 x i64> %tmp2, < i64 64, i64 64 >
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
define <8 x i8> @vrsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
%tmp4 = add <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @vrsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
%tmp4 = add <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @vrsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
%tmp4 = add <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @vrsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
%tmp4 = add <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <8 x i8> @vrsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
%tmp4 = add <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @vrsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
%tmp4 = add <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @vrsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
%tmp4 = add <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @vrsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
%tmp4 = add <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @vrsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
%tmp4 = add <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @vrsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vrsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vrsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
define <16 x i8> @vrsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
%tmp4 = add <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @vrsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
%tmp4 = add <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @vrsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
%tmp4 = add <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @vrsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
%tmp4 = add <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}
declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

76
test/CodeGen/ARM/vsub.ll Normal file
View File

@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vsub\\.i8} %t | count 2
; RUN: grep {vsub\\.i16} %t | count 2
; RUN: grep {vsub\\.i32} %t | count 2
; RUN: grep {vsub\\.i64} %t | count 2
; RUN: grep {vsub\\.f32} %t | count 2
define <8 x i8> @vsubi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = sub <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vsubi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = sub <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vsubi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = sub <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vsubi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = sub <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <2 x float> @vsubf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = sub <2 x float> %tmp1, %tmp2
ret <2 x float> %tmp3
}
define <16 x i8> @vsubQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = sub <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vsubQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = sub <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vsubQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = sub <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vsubQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = sub <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <4 x float> @vsubQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = sub <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}


@ -0,0 +1,29 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vsubhn\\.i16} %t | count 1
; RUN: grep {vsubhn\\.i32} %t | count 1
; RUN: grep {vsubhn\\.i64} %t | count 1
define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

test/CodeGen/ARM/vsubl.ll

@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vsubl\\.s8} %t | count 1
; RUN: grep {vsubl\\.s16} %t | count 1
; RUN: grep {vsubl\\.s32} %t | count 1
; RUN: grep {vsubl\\.u8} %t | count 1
; RUN: grep {vsubl\\.u16} %t | count 1
; RUN: grep {vsubl\\.u32} %t | count 1
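; The vsubls/vsublu intrinsics take narrow operands and return double-width
; results, matching the widening vsubl.s*/vsubl.u* instructions.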
define <8 x i16> @vsubls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

Some files were not shown because too many files have changed in this diff.