[ARM][NEON] Use address space in vld([1234]|[234]lane) and vst([1234]|[234]lane) instructions

This commit changes the interface of the vld[1234], vld[234]lane, vst[1234], and
vst[234]lane ARM NEON intrinsics: the pointer that these intrinsics take is now
overloaded, so its address space becomes part of the intrinsic name. This
changes, e.g.,

<2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)

to

<2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8*, i32)

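For the store intrinsics the pointer suffix precedes the vector suffix, e.g.
@llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32).

This change ensures that address spaces are fully taken into account in the ARM
target during lowering of interleaved loads and stores.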

Differential Revision: http://reviews.llvm.org/D12985

llvm-svn: 248887
This commit is contained in:
Jeroen Ketema 2015-09-30 10:56:37 +00:00
parent 42e651fa43
commit ab99b59e8c
45 changed files with 746 additions and 530 deletions

View File

@ -405,36 +405,36 @@ def int_arm_neon_vrintp : Neon_1Arg_Intrinsic;
// De-interleaving vector loads from N-element structures. // De-interleaving vector loads from N-element structures.
// Source operands are the address and alignment. // Source operands are the address and alignment.
def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty], def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty],
[llvm_ptr_ty, llvm_i32_ty], [llvm_anyptr_ty, llvm_i32_ty],
[IntrReadArgMem]>; [IntrReadArgMem]>;
def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_ptr_ty, llvm_i32_ty], [llvm_anyptr_ty, llvm_i32_ty],
[IntrReadArgMem]>; [IntrReadArgMem]>;
def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>], LLVMMatchType<0>],
[llvm_ptr_ty, llvm_i32_ty], [llvm_anyptr_ty, llvm_i32_ty],
[IntrReadArgMem]>; [IntrReadArgMem]>;
def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>], LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_ptr_ty, llvm_i32_ty], [llvm_anyptr_ty, llvm_i32_ty],
[IntrReadArgMem]>; [IntrReadArgMem]>;
// Vector load N-element structure to one lane. // Vector load N-element structure to one lane.
// Source operands are: the address, the N input vectors (since only one // Source operands are: the address, the N input vectors (since only one
// lane is assigned), the lane number, and the alignment. // lane is assigned), the lane number, and the alignment.
def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_ptr_ty, LLVMMatchType<0>, [llvm_anyptr_ty, LLVMMatchType<0>,
LLVMMatchType<0>, llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty,
llvm_i32_ty], [IntrReadArgMem]>; llvm_i32_ty], [IntrReadArgMem]>;
def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>], LLVMMatchType<0>],
[llvm_ptr_ty, LLVMMatchType<0>, [llvm_anyptr_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
llvm_i32_ty, llvm_i32_ty], llvm_i32_ty, llvm_i32_ty],
[IntrReadArgMem]>; [IntrReadArgMem]>;
def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>], LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_ptr_ty, LLVMMatchType<0>, [llvm_anyptr_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, llvm_i32_ty, LLVMMatchType<0>, llvm_i32_ty,
llvm_i32_ty], [IntrReadArgMem]>; llvm_i32_ty], [IntrReadArgMem]>;
@ -442,38 +442,38 @@ def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
// Interleaving vector stores from N-element structures. // Interleaving vector stores from N-element structures.
// Source operands are: the address, the N vectors, and the alignment. // Source operands are: the address, the N vectors, and the alignment.
def int_arm_neon_vst1 : Intrinsic<[], def int_arm_neon_vst1 : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty, [llvm_anyptr_ty, llvm_anyvector_ty,
llvm_i32_ty], [IntrReadWriteArgMem]>; llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_arm_neon_vst2 : Intrinsic<[], def int_arm_neon_vst2 : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty, [llvm_anyptr_ty, llvm_anyvector_ty,
LLVMMatchType<0>, llvm_i32_ty], LLVMMatchType<1>, llvm_i32_ty],
[IntrReadWriteArgMem]>; [IntrReadWriteArgMem]>;
def int_arm_neon_vst3 : Intrinsic<[], def int_arm_neon_vst3 : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty, [llvm_anyptr_ty, llvm_anyvector_ty,
LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<1>, LLVMMatchType<1>,
llvm_i32_ty], [IntrReadWriteArgMem]>; llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_arm_neon_vst4 : Intrinsic<[], def int_arm_neon_vst4 : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty, [llvm_anyptr_ty, llvm_anyvector_ty,
LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<1>, LLVMMatchType<1>,
LLVMMatchType<0>, llvm_i32_ty], LLVMMatchType<1>, llvm_i32_ty],
[IntrReadWriteArgMem]>; [IntrReadWriteArgMem]>;
// Vector store N-element structure from one lane. // Vector store N-element structure from one lane.
// Source operands are: the address, the N vectors, the lane number, and // Source operands are: the address, the N vectors, the lane number, and
// the alignment. // the alignment.
def int_arm_neon_vst2lane : Intrinsic<[], def int_arm_neon_vst2lane : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty, [llvm_anyptr_ty, llvm_anyvector_ty,
LLVMMatchType<0>, llvm_i32_ty, LLVMMatchType<1>, llvm_i32_ty,
llvm_i32_ty], [IntrReadWriteArgMem]>; llvm_i32_ty], [IntrReadWriteArgMem]>;
def int_arm_neon_vst3lane : Intrinsic<[], def int_arm_neon_vst3lane : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty, [llvm_anyptr_ty, llvm_anyvector_ty,
LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<1>, LLVMMatchType<1>,
llvm_i32_ty, llvm_i32_ty], llvm_i32_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>; [IntrReadWriteArgMem]>;
def int_arm_neon_vst4lane : Intrinsic<[], def int_arm_neon_vst4lane : Intrinsic<[],
[llvm_ptr_ty, llvm_anyvector_ty, [llvm_anyptr_ty, llvm_anyvector_ty,
LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<1>, LLVMMatchType<1>,
LLVMMatchType<0>, llvm_i32_ty, LLVMMatchType<1>, llvm_i32_ty,
llvm_i32_ty], [IntrReadWriteArgMem]>; llvm_i32_ty], [IntrReadWriteArgMem]>;
// Vector bitwise select. // Vector bitwise select.

View File

@ -27,6 +27,7 @@
#include "llvm/IR/LLVMContext.h" #include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h" #include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
#include <cstring> #include <cstring>
using namespace llvm; using namespace llvm;
@ -92,8 +93,41 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
F->arg_begin()->getType()); F->arg_begin()->getType());
return true; return true;
} }
Regex vldRegex("^arm\\.neon\\.vld([1234]|[234]lane)\\.v[a-z0-9]*$");
if (vldRegex.match(Name)) {
auto fArgs = F->getFunctionType()->params();
SmallVector<Type *, 4> Tys(fArgs.begin(), fArgs.end());
// Can't use Intrinsic::getDeclaration here as the return types might
// then only be structurally equal.
FunctionType* fType = FunctionType::get(F->getReturnType(), Tys, false);
NewFn = Function::Create(fType, F->getLinkage(),
"llvm." + Name + ".p0i8", F->getParent());
return true;
}
Regex vstRegex("^arm\\.neon\\.vst([1234]|[234]lane)\\.v[a-z0-9]*$");
if (vstRegex.match(Name)) {
static Intrinsic::ID StoreInts[] = {Intrinsic::arm_neon_vst1,
Intrinsic::arm_neon_vst2,
Intrinsic::arm_neon_vst3,
Intrinsic::arm_neon_vst4};
static Intrinsic::ID StoreLaneInts[] = {Intrinsic::arm_neon_vst2lane,
Intrinsic::arm_neon_vst3lane,
Intrinsic::arm_neon_vst4lane};
auto fArgs = F->getFunctionType()->params();
Type *Tys[] = {fArgs[0], fArgs[1]};
if (Name.find("lane") == StringRef::npos)
NewFn = Intrinsic::getDeclaration(F->getParent(),
StoreInts[fArgs.size() - 3], Tys);
else
NewFn = Intrinsic::getDeclaration(F->getParent(),
StoreLaneInts[fArgs.size() - 5], Tys);
return true;
}
break; break;
} }
case 'c': { case 'c': {
if (Name.startswith("ctlz.") && F->arg_size() == 1) { if (Name.startswith("ctlz.") && F->arg_size() == 1) {
F->setName(Name + ".old"); F->setName(Name + ".old");
@ -651,6 +685,27 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
default: default:
llvm_unreachable("Unknown function for CallInst upgrade."); llvm_unreachable("Unknown function for CallInst upgrade.");
case Intrinsic::arm_neon_vld1:
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:
case Intrinsic::arm_neon_vld4:
case Intrinsic::arm_neon_vld2lane:
case Intrinsic::arm_neon_vld3lane:
case Intrinsic::arm_neon_vld4lane:
case Intrinsic::arm_neon_vst1:
case Intrinsic::arm_neon_vst2:
case Intrinsic::arm_neon_vst3:
case Intrinsic::arm_neon_vst4:
case Intrinsic::arm_neon_vst2lane:
case Intrinsic::arm_neon_vst3lane:
case Intrinsic::arm_neon_vst4lane: {
SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
CI->arg_operands().end());
CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
CI->eraseFromParent();
return;
}
case Intrinsic::ctlz: case Intrinsic::ctlz:
case Intrinsic::cttz: case Intrinsic::cttz:
assert(CI->getNumArgOperands() == 1 && assert(CI->getNumArgOperands() == 1 &&

View File

@ -11802,9 +11802,6 @@ bool ARMTargetLowering::lowerInterleavedLoad(
Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld3,
Intrinsic::arm_neon_vld4}; Intrinsic::arm_neon_vld4};
Function *VldnFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy);
IRBuilder<> Builder(LI); IRBuilder<> Builder(LI);
SmallVector<Value *, 2> Ops; SmallVector<Value *, 2> Ops;
@ -11812,6 +11809,9 @@ bool ARMTargetLowering::lowerInterleavedLoad(
Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
Ops.push_back(Builder.getInt32(LI->getAlignment())); Ops.push_back(Builder.getInt32(LI->getAlignment()));
Type *Tys[] = { VecTy, Int8Ptr };
Function *VldnFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
// Replace uses of each shufflevector with the corresponding vector loaded // Replace uses of each shufflevector with the corresponding vector loaded
@ -11903,14 +11903,15 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
Intrinsic::arm_neon_vst3, Intrinsic::arm_neon_vst3,
Intrinsic::arm_neon_vst4}; Intrinsic::arm_neon_vst4};
Function *VstNFunc = Intrinsic::getDeclaration(
SI->getModule(), StoreInts[Factor - 2], SubVecTy);
SmallVector<Value *, 6> Ops; SmallVector<Value *, 6> Ops;
Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
Type *Tys[] = { Int8Ptr, SubVecTy };
Function *VstNFunc = Intrinsic::getDeclaration(
SI->getModule(), StoreInts[Factor - 2], Tys);
// Split the shufflevector operands into sub vectors for the new vstN call. // Split the shufflevector operands into sub vectors for the new vstN call.
for (unsigned i = 0; i < Factor; i++) for (unsigned i = 0; i < Factor; i++)
Ops.push_back(Builder.CreateShuffleVector( Ops.push_back(Builder.CreateShuffleVector(

View File

@ -2,8 +2,8 @@
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
target triple = "arm-apple-ios" target triple = "arm-apple-ios"
declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
@ -13,27 +13,27 @@ declare void @a_readonly_func(i8 *) noinline nounwind readonly
define <8 x i16> @test1(i8* %p, <8 x i16> %y) { define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
entry: entry:
%q = getelementptr i8, i8* %p, i64 16 %q = getelementptr i8, i8* %p, i64 16
%a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
%b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
%c = add <8 x i16> %a, %b %c = add <8 x i16> %a, %b
ret <8 x i16> %c ret <8 x i16> %c
; CHECK-LABEL: Function: test1: ; CHECK-LABEL: Function: test1:
; CHECK: NoAlias: i8* %p, i8* %q ; CHECK: NoAlias: i8* %p, i8* %q
; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 ; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 ; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 ; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 ; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 ; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
; CHECK: NoModRef: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 ; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
; CHECK: NoModRef: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 ; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 ; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
} }
define void @test2(i8* %P, i8* %Q) nounwind ssp { define void @test2(i8* %P, i8* %Q) nounwind ssp {

View File

@ -7,14 +7,14 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
; CHECK: define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) { ; CHECK: define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) {
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[ATTR:#[0-9]+]] ; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR:#[0-9]+]]
; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
; CHECK-NEXT: %c = add <8 x i16> %a, %a ; CHECK-NEXT: %c = add <8 x i16> %a, %a
define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) { define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) {
entry: entry:
%a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
%b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
%c = add <8 x i16> %a, %b %c = add <8 x i16> %a, %b
ret <8 x i16> %c ret <8 x i16> %c
} }
@ -22,21 +22,21 @@ entry:
; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) { ; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: %q = getelementptr i8, i8* %p, i64 16 ; CHECK-NEXT: %q = getelementptr i8, i8* %p, i64 16
; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[ATTR]] ; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR]]
; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
; CHECK-NEXT: %c = add <8 x i16> %a, %a ; CHECK-NEXT: %c = add <8 x i16> %a, %a
define <8 x i16> @test1(i8* %p, <8 x i16> %y) { define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
entry: entry:
%q = getelementptr i8, i8* %p, i64 16 %q = getelementptr i8, i8* %p, i64 16
%a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
%b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
%c = add <8 x i16> %a, %b %c = add <8 x i16> %a, %b
ret <8 x i16> %c ret <8 x i16> %c
} }
declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
; CHECK: attributes #0 = { nounwind readonly argmemonly } ; CHECK: attributes #0 = { nounwind readonly argmemonly }
; CHECK: attributes #1 = { nounwind argmemonly } ; CHECK: attributes #1 = { nounwind argmemonly }

View File

@ -7,20 +7,20 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
; CHECK: define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) { ; CHECK: define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) {
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[NUW:#[0-9]+]] ; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[NUW:#[0-9]+]]
; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
; CHECK-NEXT: %c = add <8 x i16> %a, %a ; CHECK-NEXT: %c = add <8 x i16> %a, %a
define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) { define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) {
entry: entry:
%a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind, !tbaa !2 %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2
call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16), !tbaa !1 call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16), !tbaa !1
%b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind, !tbaa !2 %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2
%c = add <8 x i16> %a, %b %c = add <8 x i16> %a, %b
ret <8 x i16> %c ret <8 x i16> %c
} }
declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
; CHECK: attributes #0 = { nounwind readonly argmemonly } ; CHECK: attributes #0 = { nounwind readonly argmemonly }
; CHECK: attributes #1 = { nounwind argmemonly } ; CHECK: attributes #1 = { nounwind argmemonly }

View File

@ -1,36 +1,36 @@
; RUN: llc -mtriple=arm-eabi -mattr=+neon -O0 -optimize-regalloc -regalloc=basic %s -o /dev/null ; RUN: llc -mtriple=arm-eabi -mattr=+neon -O0 -optimize-regalloc -regalloc=basic %s -o /dev/null
; This test would crash the rewriter when trying to handle a spill after one of ; This test would crash the rewriter when trying to handle a spill after one of
; the @llvm.arm.neon.vld3.v8i8 defined three parts of a register. ; the @llvm.arm.neon.vld3.v8i8.p0i8 defined three parts of a register.
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind { define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
%tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
%tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1] %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1]
%tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1] %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1]
%tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
%tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1] %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
%tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1] %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1]
%tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
%tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1] %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1]
%tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
%tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1] %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
%tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
%tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1] %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1]
%tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1] %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1]
%tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
%tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1] %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1]
%tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1] %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1]
%tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1] %tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1]
%tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1] %tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1]
%tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1] %tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1]
%tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2] %tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2]
call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1) call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
%tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1] %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1]
%tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1] %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1]
%tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1] %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1]
@ -38,8 +38,8 @@ define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A
%tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1] %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1]
%tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1] %tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1]
%tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2] %tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2]
call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1) call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
%tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1] %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1]
tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1) tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
ret <8 x i8> %tmp4 ret <8 x i8> %tmp4
} }

View File

@ -36,8 +36,8 @@ entry:
%tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3 %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3
%19 = fmul <4 x float> %tmp5, %2 %19 = fmul <4 x float> %tmp5, %2
%20 = bitcast float* %fltp to i8* %20 = bitcast float* %fltp to i8*
tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %20, <4 x float> %19, i32 1)
ret void ret void
} }
declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind

View File

@ -12,8 +12,8 @@ entry:
%tmp9 = trunc i128 %tmp8 to i64 ; <i64> [#uses=1] %tmp9 = trunc i128 %tmp8 to i64 ; <i64> [#uses=1]
%tmp16.i = bitcast i64 %tmp6 to <8 x i8> ; <<8 x i8>> [#uses=1] %tmp16.i = bitcast i64 %tmp6 to <8 x i8> ; <<8 x i8>> [#uses=1]
%tmp20.i = bitcast i64 %tmp9 to <8 x i8> ; <<8 x i8>> [#uses=1] %tmp20.i = bitcast i64 %tmp9 to <8 x i8> ; <<8 x i8>> [#uses=1]
tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind tail call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
ret void ret void
} }
declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind

View File

@ -16,10 +16,10 @@ target triple = "thumbv7-apple-darwin10"
define i32 @test(i8* %arg) nounwind { define i32 @test(i8* %arg) nounwind {
entry: entry:
%0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg, i32 1) %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %arg, i32 1)
%1 = shufflevector <2 x i64> undef, <2 x i64> %0, <2 x i32> <i32 1, i32 2> %1 = shufflevector <2 x i64> undef, <2 x i64> %0, <2 x i32> <i32 1, i32 2>
store <2 x i64> %1, <2 x i64>* undef, align 16 store <2 x i64> %1, <2 x i64>* undef, align 16
ret i32 undef ret i32 undef
} }
declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly declare <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8*, i32) nounwind readonly

View File

@ -4,9 +4,9 @@
define void @test_vmovqqqq_pseudo() nounwind ssp { define void @test_vmovqqqq_pseudo() nounwind ssp {
entry: entry:
%vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2) %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2)
store { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, { <8 x i16>, <8 x i16>, <8 x i16> }* undef store { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, { <8 x i16>, <8 x i16>, <8 x i16> }* undef
ret void ret void
} }
declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly

View File

@ -52,8 +52,8 @@ cond.end295: ; preds = %entry
%shuffle.i35.i.i = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer %shuffle.i35.i.i = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
%shuffle.i34.i.i = shufflevector <1 x i64> %shuffle.i36.i.i, <1 x i64> %shuffle.i35.i.i, <2 x i32> <i32 0, i32 1> %shuffle.i34.i.i = shufflevector <1 x i64> %shuffle.i36.i.i, <1 x i64> %shuffle.i35.i.i, <2 x i32> <i32 0, i32 1>
%2 = bitcast <2 x i64> %shuffle.i34.i.i to <4 x float> %2 = bitcast <2 x i64> %shuffle.i34.i.i to <4 x float>
tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind
tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind
unreachable unreachable
for.end: ; preds = %entry for.end: ; preds = %entry
@ -63,10 +63,10 @@ for.end: ; preds = %entry
; Check that pseudo-expansion preserves <undef> flags. ; Check that pseudo-expansion preserves <undef> flags.
define void @foo3(i8* %p) nounwind ssp { define void @foo3(i8* %p) nounwind ssp {
entry: entry:
tail call void @llvm.arm.neon.vst2.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4) tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
ret void ret void
} }
declare arm_aapcs_vfpcc void @bar(i8*, float, float, float) declare arm_aapcs_vfpcc void @bar(i8*, float, float, float)
declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind

View File

@ -7,8 +7,8 @@ entry:
%vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0 %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0
%vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1
%0 = bitcast i32* %p to i8* %0 = bitcast i32* %p to i8*
tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4) tail call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
ret void ret void
} }
declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind

View File

@ -5,9 +5,9 @@
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
target triple = "thumbv7-apple-ios5.1.0" target triple = "thumbv7-apple-ios5.1.0"
declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind
define void @findEdges(i8*) nounwind ssp { define void @findEdges(i8*) nounwind ssp {
%2 = icmp sgt i32 undef, 0 %2 = icmp sgt i32 undef, 0
@ -19,16 +19,16 @@ define void @findEdges(i8*) nounwind ssp {
; <label>:5 ; preds = %5, %1 ; <label>:5 ; preds = %5, %1
%6 = phi i8* [ %19, %5 ], [ %0, %1 ] %6 = phi i8* [ %19, %5 ], [ %0, %1 ]
%7 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* null, i32 1) %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* null, i32 1)
%8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 0 %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 0
%9 = getelementptr inbounds i8, i8* null, i32 3 %9 = getelementptr inbounds i8, i8* null, i32 3
%10 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %9, i32 1) %10 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %9, i32 1)
%11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %10, 2 %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %10, 2
%12 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %6, i32 1) %12 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %6, i32 1)
%13 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 0 %13 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 0
%14 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 1 %14 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 1
%15 = getelementptr inbounds i8, i8* %6, i32 3 %15 = getelementptr inbounds i8, i8* %6, i32 3
%16 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %15, i32 1) %16 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %15, i32 1)
%17 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 1 %17 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 1
%18 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 2 %18 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 2
%19 = getelementptr inbounds i8, i8* %6, i32 48 %19 = getelementptr inbounds i8, i8* %6, i32 48
@ -111,7 +111,7 @@ define void @findEdges(i8*) nounwind ssp {
%96 = bitcast <8 x i8> %94 to <1 x i64> %96 = bitcast <8 x i8> %94 to <1 x i64>
%97 = shufflevector <1 x i64> %95, <1 x i64> %96, <2 x i32> <i32 0, i32 1> %97 = shufflevector <1 x i64> %95, <1 x i64> %96, <2 x i32> <i32 0, i32 1>
%98 = bitcast <2 x i64> %97 to <16 x i8> %98 = bitcast <2 x i64> %97 to <16 x i8>
tail call void @llvm.arm.neon.vst1.v16i8(i8* null, <16 x i8> %98, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* null, <16 x i8> %98, i32 1)
%99 = icmp slt i32 undef, undef %99 = icmp slt i32 undef, undef
br i1 %99, label %5, label %3 br i1 %99, label %5, label %3
} }

View File

@ -10,12 +10,12 @@
; CHECK-NOT: Number of pipeline stalls ; CHECK-NOT: Number of pipeline stalls
define <16 x i8> @multiselect(i32 %avail, i8* %foo, i8* %bar) { define <16 x i8> @multiselect(i32 %avail, i8* %foo, i8* %bar) {
entry: entry:
%vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %foo, i32 1) %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %foo, i32 1)
%vld2 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1) %vld2 = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %bar, i32 1)
%and = and i32 %avail, 3 %and = and i32 %avail, 3
%tobool = icmp eq i32 %and, 0 %tobool = icmp eq i32 %and, 0
%retv = select i1 %tobool, <16 x i8> %vld1, <16 x i8> %vld2 %retv = select i1 %tobool, <16 x i8> %vld1, <16 x i8> %vld2
ret <16 x i8> %retv ret <16 x i8> %retv
} }
declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 ) declare <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* , i32 )

View File

@ -27,7 +27,7 @@ entry:
%n0 = insertelement <2 x i64> undef, i64 %tmp0, i32 0 %n0 = insertelement <2 x i64> undef, i64 %tmp0, i32 0
%n1 = insertelement <2 x i64> %n0, i64 %tmp1, i32 1 %n1 = insertelement <2 x i64> %n0, i64 %tmp1, i32 1
call void @llvm.arm.neon.vst4.v1i64(i8* %m, <1 x i64> %s0, <1 x i64> %s1, <1 x i64> %s2, <1 x i64> %s3, i32 8) call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %m, <1 x i64> %s0, <1 x i64> %s1, <1 x i64> %s2, <1 x i64> %s3, i32 8)
call void @bar(<2 x i64> %n1) call void @bar(<2 x i64> %n1)
@ -50,7 +50,7 @@ define <8 x i8> @vtbx4(<8 x i8>* %A, %struct.__neon_int8x8x4_t* %B, <8 x i8>* %C
ret <8 x i8> %tmp8 ret <8 x i8> %tmp8
} }
declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) declare void @llvm.arm.neon.vst4.p0i8.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
declare <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone declare <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare void @bar2(%struct.__neon_int8x8x4_t, <8 x i8>) declare void @bar2(%struct.__neon_int8x8x4_t, <8 x i8>)
declare void @bar(<2 x i64> %arg) declare void @bar(<2 x i64> %arg)

View File

@ -202,3 +202,24 @@ define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <
store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4 store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
ret void ret void
} }
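; The following test cases verify that interleaved loads and stores whose
; pointer lives in a non-default address space (addrspace(1)) are still
; lowered to NEON vldN/vstN instructions.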
; De-interleaving load through an addrspace(1) pointer.
; The wide vector is loaded from %A (addrspace(1)) and the shuffle mask <0, 3>
; picks lanes 0 and 3, i.e. a stride of three starting at lane 0. The expected
; output below pins this to a vld3.32 instruction.
; CHECK-LABEL: load_address_space
; CHECK: vld3.32
define void @load_address_space(<4 x i32> addrspace(1)* %A, <2 x i32>* %B) {
; Wide load from the non-default address space.
%tmp = load <4 x i32>, <4 x i32> addrspace(1)* %A
; Stride-3 extraction: lanes 0 and 3 of the wide vector.
%interleaved = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 3>
; The result is written through a default-address-space pointer.
store <2 x i32> %interleaved, <2 x i32>* %B
ret void
}
; Interleaving store through an addrspace(1) pointer.
; Two <2 x i32> vectors are loaded from %A and %B and combined with the mask
; <0, 2, 1, 3>, which yields A[0], B[0], A[1], B[1] (a factor-2 interleave).
; The combined vector is stored to %C (addrspace(1)); the expected output below
; pins this to a vst2.32 instruction.
; CHECK-LABEL: store_address_space
; CHECK: vst2.32
define void @store_address_space(<2 x i32>* %A, <2 x i32>* %B, <4 x i32> addrspace(1)* %C) {
; Both source vectors come from default-address-space pointers.
%tmp0 = load <2 x i32>, <2 x i32>* %A
%tmp1 = load <2 x i32>, <2 x i32>* %B
; Mask indices 0-1 select from %tmp0 and 2-3 from %tmp1, alternating sources.
%interleaved = shufflevector <2 x i32> %tmp0, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; Wide store into the non-default address space.
store <4 x i32> %interleaved, <4 x i32> addrspace(1)* %C
ret void
}

View File

@ -14,11 +14,11 @@ target triple = "thumbv7-apple-ios0.0.0"
define void @f(float* %p, i32 %c) nounwind ssp { define void @f(float* %p, i32 %c) nounwind ssp {
entry: entry:
%0 = bitcast float* %p to i8* %0 = bitcast float* %p to i8*
%vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4) %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %0, i32 4)
%vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1 %vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
%add.ptr = getelementptr inbounds float, float* %p, i32 8 %add.ptr = getelementptr inbounds float, float* %p, i32 8
%1 = bitcast float* %add.ptr to i8* %1 = bitcast float* %add.ptr to i8*
tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %vld221, <4 x float> undef, i32 4) tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %1, <4 x float> %vld221, <4 x float> undef, i32 4)
ret void ret void
} }
@ -27,13 +27,13 @@ entry:
define void @f1(float* %p, i32 %c) nounwind ssp { define void @f1(float* %p, i32 %c) nounwind ssp {
entry: entry:
%0 = bitcast float* %p to i8* %0 = bitcast float* %p to i8*
%vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4) %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %0, i32 4)
%vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1 %vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
%add.ptr = getelementptr inbounds float, float* %p, i32 8 %add.ptr = getelementptr inbounds float, float* %p, i32 8
%1 = bitcast float* %add.ptr to i8* %1 = bitcast float* %add.ptr to i8*
%vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4) %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %1, i32 4)
%vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0 %vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0
tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %vld221, <4 x float> %vld2215, i32 4) tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %1, <4 x float> %vld221, <4 x float> %vld2215, i32 4)
ret void ret void
} }
@ -42,7 +42,7 @@ entry:
define void @f2(float* %p, i32 %c) nounwind ssp { define void @f2(float* %p, i32 %c) nounwind ssp {
entry: entry:
%0 = bitcast float* %p to i8* %0 = bitcast float* %p to i8*
%vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4) %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %0, i32 4)
%vld224 = extractvalue { <4 x float>, <4 x float> } %vld2, 1 %vld224 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
br label %do.body br label %do.body
@ -52,10 +52,10 @@ do.body: ; preds = %do.body, %entry
%p.addr.0 = phi float* [ %p, %entry ], [ %add.ptr, %do.body ] %p.addr.0 = phi float* [ %p, %entry ], [ %add.ptr, %do.body ]
%add.ptr = getelementptr inbounds float, float* %p.addr.0, i32 8 %add.ptr = getelementptr inbounds float, float* %p.addr.0, i32 8
%1 = bitcast float* %add.ptr to i8* %1 = bitcast float* %add.ptr to i8*
%vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4) %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* %1, i32 4)
%vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0 %vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0
%vld2216 = extractvalue { <4 x float>, <4 x float> } %vld22, 1 %vld2216 = extractvalue { <4 x float>, <4 x float> } %vld22, 1
tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %qq0.0.1.0, <4 x float> %vld2215, i32 4) tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %1, <4 x float> %qq0.0.1.0, <4 x float> %vld2215, i32 4)
%dec = add nsw i32 %c.addr.0, -1 %dec = add nsw i32 %c.addr.0, -1
%tobool = icmp eq i32 %dec, 0 %tobool = icmp eq i32 %dec, 0
br i1 %tobool, label %do.end, label %do.body br i1 %tobool, label %do.end, label %do.body
@ -64,8 +64,8 @@ do.end: ; preds = %do.body
ret void ret void
} }
declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
; CHECK: f3 ; CHECK: f3
; This function has lane insertions that span basic blocks. ; This function has lane insertions that span basic blocks.
@ -109,12 +109,12 @@ if.end: ; preds = %if.else, %if.then
%x.0 = phi <2 x float> [ %vecins3, %if.then ], [ %vecins5, %if.else ] %x.0 = phi <2 x float> [ %vecins3, %if.then ], [ %vecins5, %if.else ]
%add.ptr = getelementptr inbounds float, float* %p, i32 4 %add.ptr = getelementptr inbounds float, float* %p, i32 4
%4 = bitcast float* %add.ptr to i8* %4 = bitcast float* %add.ptr to i8*
tail call void @llvm.arm.neon.vst1.v2f32(i8* %4, <2 x float> %x.0, i32 4) tail call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %4, <2 x float> %x.0, i32 4)
ret void ret void
} }
declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v2f32(i8*, <2 x float>, i32) nounwind
declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly declare <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8*, i32) nounwind readonly
; CHECK: f4 ; CHECK: f4
; This function inserts a lane into a fully defined vector. ; This function inserts a lane into a fully defined vector.
@ -124,7 +124,7 @@ declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly
define void @f4(float* %p, float* %q) nounwind ssp { define void @f4(float* %p, float* %q) nounwind ssp {
entry: entry:
%0 = bitcast float* %p to i8* %0 = bitcast float* %p to i8*
%vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %0, i32 4) %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* %0, i32 4)
%tobool = icmp eq float* %q, null %tobool = icmp eq float* %q, null
br i1 %tobool, label %if.end, label %if.then br i1 %tobool, label %if.end, label %if.then
@ -138,7 +138,7 @@ if.then: ; preds = %entry
if.end: ; preds = %entry, %if.then if.end: ; preds = %entry, %if.then
%x.0 = phi <2 x float> [ %vecins, %if.then ], [ %vld1, %entry ] %x.0 = phi <2 x float> [ %vecins, %if.then ], [ %vld1, %entry ]
tail call void @llvm.arm.neon.vst1.v2f32(i8* %0, <2 x float> %x.0, i32 4) tail call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %0, <2 x float> %x.0, i32 4)
ret void ret void
} }
@ -154,7 +154,7 @@ if.end: ; preds = %entry, %if.then
define void @f5(float* %p, float* %q) nounwind ssp { define void @f5(float* %p, float* %q) nounwind ssp {
entry: entry:
%0 = bitcast float* %p to i8* %0 = bitcast float* %p to i8*
%vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %0, i32 4) %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %0, i32 4)
%vecext = extractelement <4 x float> %vld1, i32 0 %vecext = extractelement <4 x float> %vld1, i32 0
%vecext1 = extractelement <4 x float> %vld1, i32 1 %vecext1 = extractelement <4 x float> %vld1, i32 1
%vecext2 = extractelement <4 x float> %vld1, i32 2 %vecext2 = extractelement <4 x float> %vld1, i32 2
@ -182,13 +182,13 @@ if.end: ; preds = %entry, %if.then
%vecinit9 = insertelement <4 x float> %vecinit, float %b.0, i32 1 %vecinit9 = insertelement <4 x float> %vecinit, float %b.0, i32 1
%vecinit10 = insertelement <4 x float> %vecinit9, float %c.0, i32 2 %vecinit10 = insertelement <4 x float> %vecinit9, float %c.0, i32 2
%vecinit11 = insertelement <4 x float> %vecinit10, float %add, i32 3 %vecinit11 = insertelement <4 x float> %vecinit10, float %add, i32 3
tail call void @llvm.arm.neon.vst1.v4f32(i8* %0, <4 x float> %vecinit11, i32 4) tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %0, <4 x float> %vecinit11, i32 4)
ret void ret void
} }
declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
; CHECK: pr13999 ; CHECK: pr13999
define void @pr13999() nounwind readonly { define void @pr13999() nounwind readonly {

View File

@ -19,8 +19,8 @@ bb:
%tmp5 = bitcast i64 %tmp4 to <8 x i8> %tmp5 = bitcast i64 %tmp4 to <8 x i8>
%tmp6 = shufflevector <8 x i8> %tmp5, <8 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %tmp6 = shufflevector <8 x i8> %tmp5, <8 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%tmp7 = shufflevector <16 x i8> %tmp6, <16 x i8> %tmp3, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %tmp7 = shufflevector <16 x i8> %tmp6, <16 x i8> %tmp3, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
tail call void @llvm.arm.neon.vst1.v16i8(i8* %arg, <16 x i8> %tmp7, i32 2) tail call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %arg, <16 x i8> %tmp7, i32 2)
ret void ret void
} }
declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32)

View File

@ -22,7 +22,7 @@ declare arm_aapcs_vfpcc %2* @func3(%2*, %2*, i32)
declare arm_aapcs_vfpcc %2** @func4() declare arm_aapcs_vfpcc %2** @func4()
define arm_aapcs_vfpcc void @foo(%3* nocapture) nounwind align 2 { define arm_aapcs_vfpcc void @foo(%3* nocapture) nounwind align 2 {
call void @llvm.arm.neon.vst4.v4i32(i8* undef, <4 x i32> <i32 0, i32 1065353216, i32 1073741824, i32 1077936128>, <4 x i32> <i32 1082130432, i32 1084227584, i32 1086324736, i32 1088421888>, <4 x i32> <i32 1090519040, i32 1091567616, i32 1092616192, i32 1093664768>, <4 x i32> <i32 1094713344, i32 1095761920, i32 1096810496, i32 1097859072>, i32 16) nounwind call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* undef, <4 x i32> <i32 0, i32 1065353216, i32 1073741824, i32 1077936128>, <4 x i32> <i32 1082130432, i32 1084227584, i32 1086324736, i32 1088421888>, <4 x i32> <i32 1090519040, i32 1091567616, i32 1092616192, i32 1093664768>, <4 x i32> <i32 1094713344, i32 1095761920, i32 1096810496, i32 1097859072>, i32 16) nounwind
%2 = call arm_aapcs_vfpcc %0** @func2() nounwind %2 = call arm_aapcs_vfpcc %0** @func2() nounwind
%3 = load %0*, %0** %2, align 4 %3 = load %0*, %0** %2, align 4
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
@ -40,10 +40,10 @@ define arm_aapcs_vfpcc void @foo(%3* nocapture) nounwind align 2 {
%10 = fmul float undef, 2.000000e+05 %10 = fmul float undef, 2.000000e+05
%11 = fadd float %10, -1.000000e+05 %11 = fadd float %10, -1.000000e+05
store float %11, float* undef, align 4 store float %11, float* undef, align 4
call void @llvm.arm.neon.vst4.v4i32(i8* undef, <4 x i32> <i32 0, i32 1065353216, i32 1073741824, i32 1077936128>, <4 x i32> <i32 1082130432, i32 1084227584, i32 1086324736, i32 1088421888>, <4 x i32> <i32 1090519040, i32 1091567616, i32 1092616192, i32 1093664768>, <4 x i32> <i32 1094713344, i32 1095761920, i32 1096810496, i32 1097859072>, i32 16) nounwind call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* undef, <4 x i32> <i32 0, i32 1065353216, i32 1073741824, i32 1077936128>, <4 x i32> <i32 1082130432, i32 1084227584, i32 1086324736, i32 1088421888>, <4 x i32> <i32 1090519040, i32 1091567616, i32 1092616192, i32 1093664768>, <4 x i32> <i32 1094713344, i32 1095761920, i32 1096810496, i32 1097859072>, i32 16) nounwind
ret void ret void
} }
declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
declare arm_aapcs_vfpcc i32 @rand() declare arm_aapcs_vfpcc i32 @rand()

View File

@ -8,7 +8,7 @@ target triple = "thumbv7-none-linux-gnueabi"
define void @foo(float* nocapture %A) #0 { define void @foo(float* nocapture %A) #0 {
%1= bitcast float* %A to i8* %1= bitcast float* %A to i8*
%2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4) %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0i8(i8* %1, i32 4)
%3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0 %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
%divp_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %3 %divp_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %3
%4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1 %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
@ -17,7 +17,7 @@ define void @foo(float* nocapture %A) #0 {
%div8p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %5 %div8p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %5
%6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3 %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
%div13p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %6 %div13p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %6
tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4) tail call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4)
ret void ret void
} }
@ -27,8 +27,8 @@ declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1
; Function Attrs: nounwind readonly ; Function Attrs: nounwind readonly
; Function Attrs: nounwind ; Function Attrs: nounwind
declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1 declare void @llvm.arm.neon.vst4.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1
declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) #2 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0i8(i8*, i32) #2
; Function Attrs: nounwind ; Function Attrs: nounwind

View File

@ -24,7 +24,7 @@ entry:
%2 = getelementptr inbounds %struct.int32x4_t, %struct.int32x4_t* %vT1ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1] %2 = getelementptr inbounds %struct.int32x4_t, %struct.int32x4_t* %vT1ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1]
%3 = load <4 x i32>, <4 x i32>* %2, align 16 ; <<4 x i32>> [#uses=1] %3 = load <4 x i32>, <4 x i32>* %2, align 16 ; <<4 x i32>> [#uses=1]
%4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1] %4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1]
%5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4, i32 1) ; <<8 x i16>> [#uses=1] %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %4, i32 1) ; <<8 x i16>> [#uses=1]
%6 = bitcast <8 x i16> %5 to <2 x double> ; <<2 x double>> [#uses=2] %6 = bitcast <8 x i16> %5 to <2 x double> ; <<2 x double>> [#uses=2]
%7 = extractelement <2 x double> %6, i32 0 ; <double> [#uses=1] %7 = extractelement <2 x double> %6, i32 0 ; <double> [#uses=1]
%8 = bitcast double %7 to <4 x i16> ; <<4 x i16>> [#uses=1] %8 = bitcast double %7 to <4 x i16> ; <<4 x i16>> [#uses=1]
@ -40,7 +40,7 @@ entry:
%trunc_16 = trunc <4 x i32> %16 to <4 x i16> %trunc_16 = trunc <4 x i32> %16 to <4 x i16>
%17 = shufflevector <4 x i16> %trunc_15, <4 x i16> %trunc_16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1] %17 = shufflevector <4 x i16> %trunc_15, <4 x i16> %trunc_16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
%18 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1] %18 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1]
tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %18, <8 x i16> %17, i32 1)
ret void ret void
} }
@ -60,17 +60,17 @@ entry:
%2 = getelementptr inbounds %struct.int16x8_t, %struct.int16x8_t* %vT1ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1] %2 = getelementptr inbounds %struct.int16x8_t, %struct.int16x8_t* %vT1ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1]
%3 = load <8 x i16>, <8 x i16>* %2, align 16 ; <<8 x i16>> [#uses=1] %3 = load <8 x i16>, <8 x i16>* %2, align 16 ; <<8 x i16>> [#uses=1]
%4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1] %4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1]
%5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4, i32 1) ; <<8 x i16>> [#uses=1] %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %4, i32 1) ; <<8 x i16>> [#uses=1]
%6 = getelementptr inbounds i16, i16* %i_ptr, i32 8 ; <i16*> [#uses=1] %6 = getelementptr inbounds i16, i16* %i_ptr, i32 8 ; <i16*> [#uses=1]
%7 = bitcast i16* %6 to i8* ; <i8*> [#uses=1] %7 = bitcast i16* %6 to i8* ; <i8*> [#uses=1]
%8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %7, i32 1) ; <<8 x i16>> [#uses=1] %8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %7, i32 1) ; <<8 x i16>> [#uses=1]
%9 = mul <8 x i16> %1, %5 ; <<8 x i16>> [#uses=1] %9 = mul <8 x i16> %1, %5 ; <<8 x i16>> [#uses=1]
%10 = mul <8 x i16> %3, %8 ; <<8 x i16>> [#uses=1] %10 = mul <8 x i16> %3, %8 ; <<8 x i16>> [#uses=1]
%11 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1] %11 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1]
tail call void @llvm.arm.neon.vst1.v8i16(i8* %11, <8 x i16> %9, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %11, <8 x i16> %9, i32 1)
%12 = getelementptr inbounds i16, i16* %o_ptr, i32 8 ; <i16*> [#uses=1] %12 = getelementptr inbounds i16, i16* %o_ptr, i32 8 ; <i16*> [#uses=1]
%13 = bitcast i16* %12 to i8* ; <i8*> [#uses=1] %13 = bitcast i16* %12 to i8* ; <i8*> [#uses=1]
tail call void @llvm.arm.neon.vst1.v8i16(i8* %13, <8 x i16> %10, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %13, <8 x i16> %10, i32 1)
ret void ret void
} }
@ -81,14 +81,14 @@ define <8 x i8> @t3(i8* %A, i8* %B) nounwind {
; CHECK: vmov r ; CHECK: vmov r
; CHECK-NOT: vmov d ; CHECK-NOT: vmov d
; CHECK: vst3.8 ; CHECK: vst3.8
%tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2] %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
%tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0 ; <<8 x i8>> [#uses=1] %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0 ; <<8 x i8>> [#uses=1]
%tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2 ; <<8 x i8>> [#uses=1] %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2 ; <<8 x i8>> [#uses=1]
%tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 1 ; <<8 x i8>> [#uses=1] %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 1 ; <<8 x i8>> [#uses=1]
%tmp5 = sub <8 x i8> %tmp3, %tmp4 %tmp5 = sub <8 x i8> %tmp3, %tmp4
%tmp6 = add <8 x i8> %tmp2, %tmp3 ; <<8 x i8>> [#uses=1] %tmp6 = add <8 x i8> %tmp2, %tmp3 ; <<8 x i8>> [#uses=1]
%tmp7 = mul <8 x i8> %tmp4, %tmp2 %tmp7 = mul <8 x i8> %tmp4, %tmp2
tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7, i32 1) tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7, i32 1)
ret <8 x i8> %tmp4 ret <8 x i8> %tmp4
} }
@ -101,10 +101,10 @@ entry:
; CHECK-NOT: vmov ; CHECK-NOT: vmov
; CHECK: bne ; CHECK: bne
%tmp1 = bitcast i32* %in to i8* ; <i8*> [#uses=1] %tmp1 = bitcast i32* %in to i8* ; <i8*> [#uses=1]
%tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp1, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2] %tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8* %tmp1, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
%tmp3 = getelementptr inbounds i32, i32* %in, i32 8 ; <i32*> [#uses=1] %tmp3 = getelementptr inbounds i32, i32* %in, i32 8 ; <i32*> [#uses=1]
%tmp4 = bitcast i32* %tmp3 to i8* ; <i8*> [#uses=1] %tmp4 = bitcast i32* %tmp3 to i8* ; <i8*> [#uses=1]
%tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp4, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2] %tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8* %tmp4, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
%tmp8 = bitcast i32* %out to i8* ; <i8*> [#uses=1] %tmp8 = bitcast i32* %out to i8* ; <i8*> [#uses=1]
br i1 undef, label %return1, label %return2 br i1 undef, label %return1, label %return2
@ -120,7 +120,7 @@ return1:
%tmp39 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1] %tmp39 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
%tmp6 = add <4 x i32> %tmp52, %tmp ; <<4 x i32>> [#uses=1] %tmp6 = add <4 x i32> %tmp52, %tmp ; <<4 x i32>> [#uses=1]
%tmp7 = add <4 x i32> %tmp57, %tmp39 ; <<4 x i32>> [#uses=1] %tmp7 = add <4 x i32> %tmp57, %tmp39 ; <<4 x i32>> [#uses=1]
tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7, i32 1) tail call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7, i32 1)
ret void ret void
return2: return2:
@ -131,7 +131,7 @@ return2:
%tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1] %tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
%tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1] %tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
%tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1] %tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1]
tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101, i32 1) tail call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101, i32 1)
call void @llvm.trap() call void @llvm.trap()
unreachable unreachable
} }
@ -147,7 +147,7 @@ define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
; CHECK: vadd.i16 ; CHECK: vadd.i16
%tmp0 = bitcast i16* %A to i8* ; <i8*> [#uses=1] %tmp0 = bitcast i16* %A to i8* ; <i8*> [#uses=1]
%tmp1 = load <8 x i16>, <8 x i16>* %B ; <<8 x i16>> [#uses=2] %tmp1 = load <8 x i16>, <8 x i16>* %B ; <<8 x i16>> [#uses=2]
%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2] %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2]
%tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0 ; <<8 x i16>> [#uses=1] %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0 ; <<8 x i16>> [#uses=1]
%tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1 ; <<8 x i16>> [#uses=1] %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1 ; <<8 x i16>> [#uses=1]
%tmp5 = add <8 x i16> %tmp3, %tmp4 ; <<8 x i16>> [#uses=1] %tmp5 = add <8 x i16> %tmp3, %tmp4 ; <<8 x i16>> [#uses=1]
@ -160,7 +160,7 @@ define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
; CHECK: vorr d[[D0:[0-9]+]], d[[D1:[0-9]+]] ; CHECK: vorr d[[D0:[0-9]+]], d[[D1:[0-9]+]]
; CHECK-NEXT: vld2.8 {d[[D1]][1], d[[D0]][1]} ; CHECK-NEXT: vld2.8 {d[[D1]][1], d[[D0]][1]}
%tmp1 = load <8 x i8>, <8 x i8>* %B ; <<8 x i8>> [#uses=2] %tmp1 = load <8 x i8>, <8 x i8>* %B ; <<8 x i8>> [#uses=2]
%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2] %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1] %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
%tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1 ; <<8 x i8>> [#uses=1] %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1 ; <<8 x i8>> [#uses=1]
%tmp5 = add <8 x i8> %tmp3, %tmp4 ; <<8 x i8>> [#uses=1] %tmp5 = add <8 x i8> %tmp3, %tmp4 ; <<8 x i8>> [#uses=1]
@ -178,14 +178,14 @@ entry:
; CHECK: vuzp.32 q[[Q1]], q[[Q0]] ; CHECK: vuzp.32 q[[Q1]], q[[Q0]]
; CHECK: vst1.32 ; CHECK: vst1.32
%0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2] %0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2]
%1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2] %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8* %0, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
%tmp57 = extractvalue %struct.__neon_int32x4x2_t %1, 0 ; <<4 x i32>> [#uses=1] %tmp57 = extractvalue %struct.__neon_int32x4x2_t %1, 0 ; <<4 x i32>> [#uses=1]
%tmp60 = extractvalue %struct.__neon_int32x4x2_t %1, 1 ; <<4 x i32>> [#uses=1] %tmp60 = extractvalue %struct.__neon_int32x4x2_t %1, 1 ; <<4 x i32>> [#uses=1]
%2 = bitcast i32* %optr to i8* ; <i8*> [#uses=2] %2 = bitcast i32* %optr to i8* ; <i8*> [#uses=2]
tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60, i32 1) tail call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60, i32 1)
%3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0, i32 1) ; <<4 x i32>> [#uses=1] %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %0, i32 1) ; <<4 x i32>> [#uses=1]
%4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> ; <<4 x i32>> [#uses=1] %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> ; <<4 x i32>> [#uses=1]
tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %2, <4 x i32> %4, i32 1)
ret void ret void
} }
@ -307,43 +307,43 @@ bb14: ; preds = %bb6
; This test crashes the coalescer because live variables were not updated properly. ; This test crashes the coalescer because live variables were not updated properly.
define <8 x i8> @t11(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind { define <8 x i8> @t11(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
%tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
%tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1] %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
%tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1] %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
%tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1] %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
%tmp2bd = add <8 x i8> zeroinitializer, %tmp2d ; <<8 x i8>> [#uses=1] %tmp2bd = add <8 x i8> zeroinitializer, %tmp2d ; <<8 x i8>> [#uses=1]
%tmp2abcd = mul <8 x i8> zeroinitializer, %tmp2bd ; <<8 x i8>> [#uses=1] %tmp2abcd = mul <8 x i8> zeroinitializer, %tmp2bd ; <<8 x i8>> [#uses=1]
%tmp2ef = sub <8 x i8> zeroinitializer, %tmp2f ; <<8 x i8>> [#uses=1] %tmp2ef = sub <8 x i8> zeroinitializer, %tmp2f ; <<8 x i8>> [#uses=1]
%tmp2efgh = mul <8 x i8> %tmp2ef, undef ; <<8 x i8>> [#uses=2] %tmp2efgh = mul <8 x i8> %tmp2ef, undef ; <<8 x i8>> [#uses=2]
call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh, i32 1) call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh, i32 1)
%tmp2 = sub <8 x i8> %tmp2efgh, %tmp2abcd ; <<8 x i8>> [#uses=1] %tmp2 = sub <8 x i8> %tmp2efgh, %tmp2abcd ; <<8 x i8>> [#uses=1]
%tmp7 = mul <8 x i8> undef, %tmp2 ; <<8 x i8>> [#uses=1] %tmp7 = mul <8 x i8> undef, %tmp2 ; <<8 x i8>> [#uses=1]
tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7, i32 1) tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7, i32 1)
ret <8 x i8> undef ret <8 x i8> undef
} }
declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) nounwind readonly declare <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8*, i32) nounwind readonly
declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4i32(i8*, <4 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
nounwind nounwind
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*, i32) nounwind readonly declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone

View File

@ -7,7 +7,7 @@
%quux = type { i32 (...)**, %baz*, i32 } %quux = type { i32 (...)**, %baz*, i32 }
%quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo } %quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo }
declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
define void @aaa(%quuz* %this, i8* %block) { define void @aaa(%quuz* %this, i8* %block) {
; CHECK-LABEL: aaa: ; CHECK-LABEL: aaa:
@ -18,30 +18,30 @@ entry:
%aligned_vec = alloca <4 x float>, align 16 %aligned_vec = alloca <4 x float>, align 16
%"alloca point" = bitcast i32 0 to i32 %"alloca point" = bitcast i32 0 to i32
%vecptr = bitcast <4 x float>* %aligned_vec to i8* %vecptr = bitcast <4 x float>* %aligned_vec to i8*
%0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %vecptr, i32 1) nounwind ; <<4 x float>> [#uses=1] %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %vecptr, i32 1) nounwind ; <<4 x float>> [#uses=1]
store float 6.300000e+01, float* undef, align 4 store float 6.300000e+01, float* undef, align 4
%1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1] %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1] %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
%ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%val173 = load <4 x float>, <4 x float>* undef ; <<4 x float>> [#uses=1] %val173 = load <4 x float>, <4 x float>* undef ; <<4 x float>> [#uses=1]
br label %bb4 br label %bb4

View File

@ -196,8 +196,8 @@ entry:
%3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%4 = add <8 x i16> %3, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> %4 = add <8 x i16> %3, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%5 = trunc <8 x i16> %4 to <8 x i8> %5 = trunc <8 x i16> %4 to <8 x i8>
tail call void @llvm.arm.neon.vst1.v8i8(i8* undef, <8 x i8> %5, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* undef, <8 x i8> %5, i32 1)
unreachable unreachable
} }
declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v8i8(i8*, <8 x i8>, i32) nounwind

View File

@ -78,11 +78,11 @@ entry:
%2 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = add <8 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> %3 = add <8 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%4 = trunc <8 x i16> %3 to <8 x i8> %4 = trunc <8 x i16> %3 to <8 x i8>
tail call void @llvm.arm.neon.vst1.v8i8(i8* undef, <8 x i8> %4, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* undef, <8 x i8> %4, i32 1)
unreachable unreachable
} }
declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v8i8(i8*, <8 x i8>, i32) nounwind
; Test that loads and stores of i64 vector elements are handled as f64 values ; Test that loads and stores of i64 vector elements are handled as f64 values
; so they are not split up into i32 values. Radar 8755338. ; so they are not split up into i32 values. Radar 8755338.

View File

@ -0,0 +1,139 @@
; RUN: llc -mtriple=arm-eabi -mattr=+neon < %s | FileCheck %s
; Aggregate result types returned by the multi-vector load intrinsics below:
; two, three and four <2 x i32> vectors respectively.
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
; vld[1234] auto-upgrade tests
; Calls vld1 through the legacy intrinsic name, i.e. without the pointer-type
; suffix (.p0i8) that the current name carries. The call must still be accepted
; and lowered to a single-register vld1.32 from the address in %ptr.
; CHECK-LABEL: test_vld1_upgrade:
; CHECK: vld1.32 {d16}, [r0]
define <2 x i32> @test_vld1_upgrade(i8* %ptr) {
%tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %ptr, i32 1)
ret <2 x i32> %tmp1
}
; Legacy declaration: operands are the address and the alignment.
declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) nounwind readonly
; Calls vld2 through the legacy (unsuffixed) intrinsic name and returns the
; two-vector aggregate. The expected output is a two-register vld2.32.
; CHECK-LABEL: test_vld2_upgrade:
; CHECK: vld2.32 {d16, d17}, [r0]
define %struct.__neon_int32x2x2_t @test_vld2_upgrade(i8* %ptr) {
%tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8* %ptr, i32 1)
ret %struct.__neon_int32x2x2_t %tmp1
}
; Legacy declaration: operands are the address and the alignment.
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8*, i32) nounwind readonly
; Calls vld3 through the legacy (unsuffixed) intrinsic name and returns the
; three-vector aggregate. The expected output is a three-register vld3.32.
; NOTE(review): the address is expected in r1 rather than r0, presumably
; because r0 carries a hidden pointer for the aggregate return value - confirm.
; CHECK-LABEL: test_vld3_upgrade:
; CHECK: vld3.32 {d16, d17, d18}, [r1]
define %struct.__neon_int32x2x3_t @test_vld3_upgrade(i8* %ptr) {
%tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8* %ptr, i32 1)
ret %struct.__neon_int32x2x3_t %tmp1
}
; Legacy declaration: operands are the address and the alignment.
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8*, i32) nounwind readonly
; Calls vld4 through the legacy (unsuffixed) intrinsic name and returns the
; four-vector aggregate. The expected output is a four-register vld4.32.
; NOTE(review): the address is expected in r1 rather than r0, presumably
; because r0 carries a hidden pointer for the aggregate return value - confirm.
; CHECK-LABEL: test_vld4_upgrade:
; CHECK: vld4.32 {d16, d17, d18, d19}, [r1]
define %struct.__neon_int32x2x4_t @test_vld4_upgrade(i8* %ptr) {
%tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %ptr, i32 1)
ret %struct.__neon_int32x2x4_t %tmp1
}
; Legacy declaration: operands are the address and the alignment.
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8*, i32) nounwind readonly
; vld[234]lane auto-upgrade tests
; Calls vld2lane through the legacy (unsuffixed) intrinsic name, passing %A and
; %B as the incoming vectors. The expected output is a lane-1 vld2.32.
; NOTE(review): the two trailing i32 operands are presumably the lane index and
; the alignment, in that order; both are 1 here so the test cannot tell - confirm.
; CHECK-LABEL: test_vld2lane_upgrade:
; CHECK: vld2.32 {d16[1], d17[1]}, [r0]
define %struct.__neon_int32x2x2_t @test_vld2lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B) {
%tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, i32 1, i32 1)
ret %struct.__neon_int32x2x2_t %tmp1
}
; Legacy declaration: no pointer-type suffix on the intrinsic name.
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
; Calls vld3lane through the legacy (unsuffixed) intrinsic name, passing %A, %B
; and %C as the incoming vectors. The expected output is a lane-1 vld3.32.
; NOTE(review): the address is expected in r1 rather than r0, presumably
; because r0 carries a hidden pointer for the aggregate return value - confirm.
; CHECK-LABEL: test_vld3lane_upgrade:
; CHECK: vld3.32 {d16[1], d17[1], d18[1]}, [r1]
define %struct.__neon_int32x2x3_t @test_vld3lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
%tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32 1, i32 1)
ret %struct.__neon_int32x2x3_t %tmp1
}
; Legacy declaration: no pointer-type suffix on the intrinsic name.
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
; Calls vld4lane through the legacy (unsuffixed) intrinsic name, passing %A..%D
; as the incoming vectors. The expected output is a lane-1 vld4.32.
; NOTE(review): the address is expected in r1 rather than r0, presumably
; because r0 carries a hidden pointer for the aggregate return value - confirm.
; CHECK-LABEL: test_vld4lane_upgrade:
; CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r1]
define %struct.__neon_int32x2x4_t @test_vld4lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) {
%tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32 1, i32 1)
ret %struct.__neon_int32x2x4_t %tmp1
}
; Legacy declaration: no pointer-type suffix on the intrinsic name.
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
; vst[1234] auto-upgrade tests
; Stores %A through the legacy vst1 intrinsic name, i.e. without the
; pointer-type component (p0i8) that the current name carries. The call must
; still be accepted and lowered to a single-register vst1.32 to %ptr.
; CHECK-LABEL: test_vst1_upgrade:
; CHECK: vst1.32 {d16}, [r0]
define void @test_vst1_upgrade(i8* %ptr, <2 x i32> %A) {
call void @llvm.arm.neon.vst1.v2i32(i8* %ptr, <2 x i32> %A, i32 1)
ret void
}
; Legacy declaration: no pointer-type component in the intrinsic name.
declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
; Stores %A and %B through the legacy (unsuffixed) vst2 intrinsic name.
; The expected output is a two-register vst2.32 to the address in %ptr.
; CHECK-LABEL: test_vst2_upgrade:
; CHECK: vst2.32 {d16, d17}, [r0]
define void @test_vst2_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B) {
call void @llvm.arm.neon.vst2.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, i32 1)
ret void
}
; Legacy declaration: no pointer-type component in the intrinsic name.
declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind
; Stores %A, %B and %C through the legacy (unsuffixed) vst3 intrinsic name.
; The expected output is a three-register vst3.32 to the address in %ptr.
; CHECK-LABEL: test_vst3_upgrade:
; CHECK: vst3.32 {d16, d17, d18}, [r0]
define void @test_vst3_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
call void @llvm.arm.neon.vst3.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32 1)
ret void
}
; Legacy declaration: no pointer-type component in the intrinsic name.
declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
; Stores %A..%D through the legacy (unsuffixed) vst4 intrinsic name.
; The expected output is a four-register vst4.32 to the address in %ptr.
; CHECK-LABEL: test_vst4_upgrade:
; CHECK: vst4.32 {d16, d17, d18, d19}, [r0]
define void @test_vst4_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) {
call void @llvm.arm.neon.vst4.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32 1)
ret void
}
; Legacy declaration: no pointer-type component in the intrinsic name.
declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
; vst[234]lane auto-upgrade tests
; Stores one lane of %A and %B through the legacy (unsuffixed) vst2lane
; intrinsic name. The expected output is a lane-1 vst2.32 to %ptr.
; NOTE(review): the two trailing i32 operands are presumably the lane index and
; the alignment, in that order; both are 1 here so the test cannot tell - confirm.
; CHECK-LABEL: test_vst2lane_upgrade:
; CHECK: vst2.32 {d16[1], d17[1]}, [r0]
define void @test_vst2lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B) {
call void @llvm.arm.neon.vst2lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, i32 1, i32 1)
ret void
}
; Legacy declaration: no pointer-type component in the intrinsic name.
declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
; Stores one lane of %A, %B and %C through the legacy (unsuffixed) vst3lane
; intrinsic name. The expected output is a lane-1 vst3.32 to %ptr.
; CHECK-LABEL: test_vst3lane_upgrade:
; CHECK: vst3.32 {d16[1], d17[1], d18[1]}, [r0]
define void @test_vst3lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
call void @llvm.arm.neon.vst3lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32 1, i32 1)
ret void
}
; Legacy declaration: no pointer-type component in the intrinsic name.
declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
; Stores one lane of %A..%D through the legacy (unsuffixed) vst4lane intrinsic
; name. The expected output is a lane-1 vst4.32 to %ptr.
; CHECK-LABEL: test_vst4lane_upgrade:
; CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0]
define void @test_vst4lane_upgrade(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) {
call void @llvm.arm.neon.vst4lane.v2i32(i8* %ptr, <2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32 1, i32 1)
ret void
}
; Legacy declaration: no pointer-type component in the intrinsic name.
declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind

View File

@ -7,7 +7,7 @@ define <8 x i8> @vld1i8(i8* %A) nounwind {
;CHECK-LABEL: vld1i8: ;CHECK-LABEL: vld1i8:
;Check the alignment value. Max for this instruction is 64 bits: ;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld1.8 {d16}, [r0:64] ;CHECK: vld1.8 {d16}, [r0:64]
%tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A, i32 16) %tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %A, i32 16)
ret <8 x i8> %tmp1 ret <8 x i8> %tmp1
} }
@ -15,7 +15,7 @@ define <4 x i16> @vld1i16(i16* %A) nounwind {
;CHECK-LABEL: vld1i16: ;CHECK-LABEL: vld1i16:
;CHECK: vld1.16 ;CHECK: vld1.16
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 1) %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* %tmp0, i32 1)
ret <4 x i16> %tmp1 ret <4 x i16> %tmp1
} }
@ -25,7 +25,7 @@ define <4 x i16> @vld1i16_update(i16** %ptr) nounwind {
;CHECK: vld1.16 {d16}, [{{r[0-9]+}}]! ;CHECK: vld1.16 {d16}, [{{r[0-9]+}}]!
%A = load i16*, i16** %ptr %A = load i16*, i16** %ptr
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 1) %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* %tmp0, i32 1)
%tmp2 = getelementptr i16, i16* %A, i32 4 %tmp2 = getelementptr i16, i16* %A, i32 4
store i16* %tmp2, i16** %ptr store i16* %tmp2, i16** %ptr
ret <4 x i16> %tmp1 ret <4 x i16> %tmp1
@ -35,7 +35,7 @@ define <2 x i32> @vld1i32(i32* %A) nounwind {
;CHECK-LABEL: vld1i32: ;CHECK-LABEL: vld1i32:
;CHECK: vld1.32 ;CHECK: vld1.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 1) %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* %tmp0, i32 1)
ret <2 x i32> %tmp1 ret <2 x i32> %tmp1
} }
@ -45,7 +45,7 @@ define <2 x i32> @vld1i32_update(i32** %ptr, i32 %inc) nounwind {
;CHECK: vld1.32 {d16}, [{{r[0-9]+}}], {{r[0-9]+}} ;CHECK: vld1.32 {d16}, [{{r[0-9]+}}], {{r[0-9]+}}
%A = load i32*, i32** %ptr %A = load i32*, i32** %ptr
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 1) %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* %tmp0, i32 1)
%tmp2 = getelementptr i32, i32* %A, i32 %inc %tmp2 = getelementptr i32, i32* %A, i32 %inc
store i32* %tmp2, i32** %ptr store i32* %tmp2, i32** %ptr
ret <2 x i32> %tmp1 ret <2 x i32> %tmp1
@ -55,7 +55,7 @@ define <2 x float> @vld1f(float* %A) nounwind {
;CHECK-LABEL: vld1f: ;CHECK-LABEL: vld1f:
;CHECK: vld1.32 ;CHECK: vld1.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %tmp0, i32 1) %tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* %tmp0, i32 1)
ret <2 x float> %tmp1 ret <2 x float> %tmp1
} }
@ -63,7 +63,7 @@ define <1 x i64> @vld1i64(i64* %A) nounwind {
;CHECK-LABEL: vld1i64: ;CHECK-LABEL: vld1i64:
;CHECK: vld1.64 ;CHECK: vld1.64
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %tmp0, i32 1) %tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %tmp0, i32 1)
ret <1 x i64> %tmp1 ret <1 x i64> %tmp1
} }
@ -71,7 +71,7 @@ define <16 x i8> @vld1Qi8(i8* %A) nounwind {
;CHECK-LABEL: vld1Qi8: ;CHECK-LABEL: vld1Qi8:
;Check the alignment value. Max for this instruction is 128 bits: ;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vld1.8 {d16, d17}, [r0:64] ;CHECK: vld1.8 {d16, d17}, [r0:64]
%tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8) %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %A, i32 8)
ret <16 x i8> %tmp1 ret <16 x i8> %tmp1
} }
@ -80,7 +80,7 @@ define <16 x i8> @vld1Qi8_update(i8** %ptr) nounwind {
;CHECK-LABEL: vld1Qi8_update: ;CHECK-LABEL: vld1Qi8_update:
;CHECK: vld1.8 {d16, d17}, [{{r[0-9]+}}:64]! ;CHECK: vld1.8 {d16, d17}, [{{r[0-9]+}}:64]!
%A = load i8*, i8** %ptr %A = load i8*, i8** %ptr
%tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8) %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %A, i32 8)
%tmp2 = getelementptr i8, i8* %A, i32 16 %tmp2 = getelementptr i8, i8* %A, i32 16
store i8* %tmp2, i8** %ptr store i8* %tmp2, i8** %ptr
ret <16 x i8> %tmp1 ret <16 x i8> %tmp1
@ -91,7 +91,7 @@ define <8 x i16> @vld1Qi16(i16* %A) nounwind {
;Check the alignment value. Max for this instruction is 128 bits: ;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vld1.16 {d16, d17}, [r0:128] ;CHECK: vld1.16 {d16, d17}, [r0:128]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0, i32 32) %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %tmp0, i32 32)
ret <8 x i16> %tmp1 ret <8 x i16> %tmp1
} }
@ -99,7 +99,7 @@ define <4 x i32> @vld1Qi32(i32* %A) nounwind {
;CHECK-LABEL: vld1Qi32: ;CHECK-LABEL: vld1Qi32:
;CHECK: vld1.32 ;CHECK: vld1.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %tmp0, i32 1) %tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %tmp0, i32 1)
ret <4 x i32> %tmp1 ret <4 x i32> %tmp1
} }
@ -107,7 +107,7 @@ define <4 x float> @vld1Qf(float* %A) nounwind {
;CHECK-LABEL: vld1Qf: ;CHECK-LABEL: vld1Qf:
;CHECK: vld1.32 ;CHECK: vld1.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %tmp0, i32 1) %tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %tmp0, i32 1)
ret <4 x float> %tmp1 ret <4 x float> %tmp1
} }
@ -115,7 +115,7 @@ define <2 x i64> @vld1Qi64(i64* %A) nounwind {
;CHECK-LABEL: vld1Qi64: ;CHECK-LABEL: vld1Qi64:
;CHECK: vld1.64 ;CHECK: vld1.64
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %tmp0, i32 1) %tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %tmp0, i32 1)
ret <2 x i64> %tmp1 ret <2 x i64> %tmp1
} }
@ -123,28 +123,28 @@ define <2 x double> @vld1Qf64(double* %A) nounwind {
;CHECK-LABEL: vld1Qf64: ;CHECK-LABEL: vld1Qf64:
;CHECK: vld1.64 ;CHECK: vld1.64
%tmp0 = bitcast double* %A to i8* %tmp0 = bitcast double* %A to i8*
%tmp1 = call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %tmp0, i32 1) %tmp1 = call <2 x double> @llvm.arm.neon.vld1.v2f64.p0i8(i8* %tmp0, i32 1)
ret <2 x double> %tmp1 ret <2 x double> %tmp1
} }
declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32) nounwind readonly declare <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8*, i32) nounwind readonly
declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32) nounwind readonly declare <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8*, i32) nounwind readonly
declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32) nounwind readonly declare <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8*, i32) nounwind readonly
declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly declare <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8*, i32) nounwind readonly
declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32) nounwind readonly declare <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8*, i32) nounwind readonly
declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly declare <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8*, i32) nounwind readonly
declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32) nounwind readonly declare <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8*, i32) nounwind readonly
declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly declare <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8*, i32) nounwind readonly
declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32) nounwind readonly declare <2 x double> @llvm.arm.neon.vld1.v2f64.p0i8(i8*, i32) nounwind readonly
; Radar 8355607 ; Radar 8355607
; Do not crash if the vld1 result is not used. ; Do not crash if the vld1 result is not used.
define void @unused_vld1_result() { define void @unused_vld1_result() {
entry: entry:
%0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1)
call void @llvm.trap() call void @llvm.trap()
unreachable unreachable
} }

View File

@ -15,7 +15,7 @@ define <8 x i8> @vld2i8(i8* %A) nounwind {
;CHECK-LABEL: vld2i8: ;CHECK-LABEL: vld2i8:
;Check the alignment value. Max for this instruction is 128 bits: ;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vld2.8 {d16, d17}, [r0:64] ;CHECK: vld2.8 {d16, d17}, [r0:64]
%tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8* %A, i32 8) %tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8.p0i8(i8* %A, i32 8)
%tmp2 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 1
%tmp4 = add <8 x i8> %tmp2, %tmp3 %tmp4 = add <8 x i8> %tmp2, %tmp3
@ -27,7 +27,7 @@ define <4 x i16> @vld2i16(i16* %A) nounwind {
;Check the alignment value. Max for this instruction is 128 bits: ;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vld2.16 {d16, d17}, [r0:128] ;CHECK: vld2.16 {d16, d17}, [r0:128]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8* %tmp0, i32 32) %tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16.p0i8(i8* %tmp0, i32 32)
%tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 1
%tmp4 = add <4 x i16> %tmp2, %tmp3 %tmp4 = add <4 x i16> %tmp2, %tmp3
@ -38,7 +38,7 @@ define <2 x i32> @vld2i32(i32* %A) nounwind {
;CHECK-LABEL: vld2i32: ;CHECK-LABEL: vld2i32:
;CHECK: vld2.32 ;CHECK: vld2.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 1
%tmp4 = add <2 x i32> %tmp2, %tmp3 %tmp4 = add <2 x i32> %tmp2, %tmp3
@ -49,7 +49,7 @@ define <2 x float> @vld2f(float* %A) nounwind {
;CHECK-LABEL: vld2f: ;CHECK-LABEL: vld2f:
;CHECK: vld2.32 ;CHECK: vld2.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1
%tmp4 = fadd <2 x float> %tmp2, %tmp3 %tmp4 = fadd <2 x float> %tmp2, %tmp3
@ -62,7 +62,7 @@ define <2 x float> @vld2f_update(float** %ptr) nounwind {
;CHECK: vld2.32 {d16, d17}, [r1]! ;CHECK: vld2.32 {d16, d17}, [r1]!
%A = load float*, float** %ptr %A = load float*, float** %ptr
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1
%tmp4 = fadd <2 x float> %tmp2, %tmp3 %tmp4 = fadd <2 x float> %tmp2, %tmp3
@ -76,7 +76,7 @@ define <1 x i64> @vld2i64(i64* %A) nounwind {
;Check the alignment value. Max for this instruction is 128 bits: ;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vld1.64 {d16, d17}, [r0:128] ;CHECK: vld1.64 {d16, d17}, [r0:128]
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8* %tmp0, i32 32) %tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64.p0i8(i8* %tmp0, i32 32)
%tmp2 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 1
%tmp4 = add <1 x i64> %tmp2, %tmp3 %tmp4 = add <1 x i64> %tmp2, %tmp3
@ -87,7 +87,7 @@ define <16 x i8> @vld2Qi8(i8* %A) nounwind {
;CHECK-LABEL: vld2Qi8: ;CHECK-LABEL: vld2Qi8:
;Check the alignment value. Max for this instruction is 256 bits: ;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld2.8 {d16, d17, d18, d19}, [r0:64] ;CHECK: vld2.8 {d16, d17, d18, d19}, [r0:64]
%tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 8) %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8.p0i8(i8* %A, i32 8)
%tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
%tmp4 = add <16 x i8> %tmp2, %tmp3 %tmp4 = add <16 x i8> %tmp2, %tmp3
@ -99,7 +99,7 @@ define <16 x i8> @vld2Qi8_update(i8** %ptr, i32 %inc) nounwind {
;CHECK-LABEL: vld2Qi8_update: ;CHECK-LABEL: vld2Qi8_update:
;CHECK: vld2.8 {d16, d17, d18, d19}, [r2:128], r1 ;CHECK: vld2.8 {d16, d17, d18, d19}, [r2:128], r1
%A = load i8*, i8** %ptr %A = load i8*, i8** %ptr
%tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 16) %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8.p0i8(i8* %A, i32 16)
%tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
%tmp4 = add <16 x i8> %tmp2, %tmp3 %tmp4 = add <16 x i8> %tmp2, %tmp3
@ -113,7 +113,7 @@ define <8 x i16> @vld2Qi16(i16* %A) nounwind {
;Check the alignment value. Max for this instruction is 256 bits: ;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld2.16 {d16, d17, d18, d19}, [r0:128] ;CHECK: vld2.16 {d16, d17, d18, d19}, [r0:128]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8* %tmp0, i32 16) %tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16.p0i8(i8* %tmp0, i32 16)
%tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 1
%tmp4 = add <8 x i16> %tmp2, %tmp3 %tmp4 = add <8 x i16> %tmp2, %tmp3
@ -125,7 +125,7 @@ define <4 x i32> @vld2Qi32(i32* %A) nounwind {
;Check the alignment value. Max for this instruction is 256 bits: ;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld2.32 {d16, d17, d18, d19}, [r0:256] ;CHECK: vld2.32 {d16, d17, d18, d19}, [r0:256]
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp0, i32 64) %tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8* %tmp0, i32 64)
%tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 1
%tmp4 = add <4 x i32> %tmp2, %tmp3 %tmp4 = add <4 x i32> %tmp2, %tmp3
@ -136,20 +136,20 @@ define <4 x float> @vld2Qf(float* %A) nounwind {
;CHECK-LABEL: vld2Qf: ;CHECK-LABEL: vld2Qf:
;CHECK: vld2.32 ;CHECK: vld2.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 1
%tmp4 = fadd <4 x float> %tmp2, %tmp3 %tmp4 = fadd <4 x float> %tmp2, %tmp3
ret <4 x float> %tmp4 ret <4 x float> %tmp4
} }
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8*, i32) nounwind readonly declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8*, i32) nounwind readonly declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8*, i32) nounwind readonly declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8*, i32) nounwind readonly declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8*, i32) nounwind readonly declare %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8*, i32) nounwind readonly declare %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8*, i32) nounwind readonly declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*, i32) nounwind readonly declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32.p0i8(i8*, i32) nounwind readonly

View File

@ -16,7 +16,7 @@ define <8 x i8> @vld3i8(i8* %A) nounwind {
;CHECK-LABEL: vld3i8: ;CHECK-LABEL: vld3i8:
;Check the alignment value. Max for this instruction is 64 bits: ;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld3.8 {d16, d17, d18}, [r0:64] ;CHECK: vld3.8 {d16, d17, d18}, [r0:64]
%tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 32) %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A, i32 32)
%tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2
%tmp4 = add <8 x i8> %tmp2, %tmp3 %tmp4 = add <8 x i8> %tmp2, %tmp3
@ -27,7 +27,7 @@ define <4 x i16> @vld3i16(i16* %A) nounwind {
;CHECK-LABEL: vld3i16: ;CHECK-LABEL: vld3i16:
;CHECK: vld3.16 ;CHECK: vld3.16
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2
%tmp4 = add <4 x i16> %tmp2, %tmp3 %tmp4 = add <4 x i16> %tmp2, %tmp3
@ -40,7 +40,7 @@ define <4 x i16> @vld3i16_update(i16** %ptr, i32 %inc) nounwind {
;CHECK: vld3.16 {d16, d17, d18}, [{{r[0-9]+}}], {{r[0-9]+}} ;CHECK: vld3.16 {d16, d17, d18}, [{{r[0-9]+}}], {{r[0-9]+}}
%A = load i16*, i16** %ptr %A = load i16*, i16** %ptr
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2
%tmp4 = add <4 x i16> %tmp2, %tmp3 %tmp4 = add <4 x i16> %tmp2, %tmp3
@ -53,7 +53,7 @@ define <2 x i32> @vld3i32(i32* %A) nounwind {
;CHECK-LABEL: vld3i32: ;CHECK-LABEL: vld3i32:
;CHECK: vld3.32 ;CHECK: vld3.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 2
%tmp4 = add <2 x i32> %tmp2, %tmp3 %tmp4 = add <2 x i32> %tmp2, %tmp3
@ -64,7 +64,7 @@ define <2 x float> @vld3f(float* %A) nounwind {
;CHECK-LABEL: vld3f: ;CHECK-LABEL: vld3f:
;CHECK: vld3.32 ;CHECK: vld3.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 2
%tmp4 = fadd <2 x float> %tmp2, %tmp3 %tmp4 = fadd <2 x float> %tmp2, %tmp3
@ -76,7 +76,7 @@ define <1 x i64> @vld3i64(i64* %A) nounwind {
;Check the alignment value. Max for this instruction is 64 bits: ;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld1.64 {d16, d17, d18}, [r0:64] ;CHECK: vld1.64 {d16, d17, d18}, [r0:64]
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16) %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64.p0i8(i8* %tmp0, i32 16)
%tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2
%tmp4 = add <1 x i64> %tmp2, %tmp3 %tmp4 = add <1 x i64> %tmp2, %tmp3
@ -87,7 +87,7 @@ define <1 x i64> @vld3i64_update(i64** %ptr, i64* %A) nounwind {
;CHECK-LABEL: vld3i64_update: ;CHECK-LABEL: vld3i64_update:
;CHECK: vld1.64 {d16, d17, d18}, [r1:64]! ;CHECK: vld1.64 {d16, d17, d18}, [r1:64]!
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16) %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64.p0i8(i8* %tmp0, i32 16)
%tmp5 = getelementptr i64, i64* %A, i32 3 %tmp5 = getelementptr i64, i64* %A, i32 3
store i64* %tmp5, i64** %ptr store i64* %tmp5, i64** %ptr
%tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
@ -101,7 +101,7 @@ define <16 x i8> @vld3Qi8(i8* %A) nounwind {
;Check the alignment value. Max for this instruction is 64 bits: ;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld3.8 {d16, d18, d20}, [r0:64]! ;CHECK: vld3.8 {d16, d18, d20}, [r0:64]!
;CHECK: vld3.8 {d17, d19, d21}, [r0:64] ;CHECK: vld3.8 {d17, d19, d21}, [r0:64]
%tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A, i32 32) %tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8.p0i8(i8* %A, i32 32)
%tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2
%tmp4 = add <16 x i8> %tmp2, %tmp3 %tmp4 = add <16 x i8> %tmp2, %tmp3
@ -113,7 +113,7 @@ define <8 x i16> @vld3Qi16(i16* %A) nounwind {
;CHECK: vld3.16 ;CHECK: vld3.16
;CHECK: vld3.16 ;CHECK: vld3.16
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 2
%tmp4 = add <8 x i16> %tmp2, %tmp3 %tmp4 = add <8 x i16> %tmp2, %tmp3
@ -125,7 +125,7 @@ define <4 x i32> @vld3Qi32(i32* %A) nounwind {
;CHECK: vld3.32 ;CHECK: vld3.32
;CHECK: vld3.32 ;CHECK: vld3.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2
%tmp4 = add <4 x i32> %tmp2, %tmp3 %tmp4 = add <4 x i32> %tmp2, %tmp3
@ -139,7 +139,7 @@ define <4 x i32> @vld3Qi32_update(i32** %ptr) nounwind {
;CHECK: vld3.32 {d17, d19, d21}, [r[[R]]]! ;CHECK: vld3.32 {d17, d19, d21}, [r[[R]]]!
%A = load i32*, i32** %ptr %A = load i32*, i32** %ptr
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2
%tmp4 = add <4 x i32> %tmp2, %tmp3 %tmp4 = add <4 x i32> %tmp2, %tmp3
@ -153,20 +153,20 @@ define <4 x float> @vld3Qf(float* %A) nounwind {
;CHECK: vld3.32 ;CHECK: vld3.32
;CHECK: vld3.32 ;CHECK: vld3.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 2
%tmp4 = fadd <4 x float> %tmp2, %tmp3 %tmp4 = fadd <4 x float> %tmp2, %tmp3
ret <4 x float> %tmp4 ret <4 x float> %tmp4
} }
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8*, i32) nounwind readonly declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8*, i32) nounwind readonly declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8*, i32) nounwind readonly declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8*, i32) nounwind readonly declare %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly declare %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8*, i32) nounwind readonly declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8*, i32) nounwind readonly declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8*, i32) nounwind readonly declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32.p0i8(i8*, i32) nounwind readonly

View File

@ -15,7 +15,7 @@ define <8 x i8> @vld4i8(i8* %A) nounwind {
;CHECK-LABEL: vld4i8: ;CHECK-LABEL: vld4i8:
;Check the alignment value. Max for this instruction is 256 bits: ;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld4.8 {d16, d17, d18, d19}, [r0:64] ;CHECK: vld4.8 {d16, d17, d18, d19}, [r0:64]
%tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 8) %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8* %A, i32 8)
%tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
%tmp4 = add <8 x i8> %tmp2, %tmp3 %tmp4 = add <8 x i8> %tmp2, %tmp3
@ -27,7 +27,7 @@ define <8 x i8> @vld4i8_update(i8** %ptr, i32 %inc) nounwind {
;CHECK-LABEL: vld4i8_update: ;CHECK-LABEL: vld4i8_update:
;CHECK: vld4.8 {d16, d17, d18, d19}, [r2:128], r1 ;CHECK: vld4.8 {d16, d17, d18, d19}, [r2:128], r1
%A = load i8*, i8** %ptr %A = load i8*, i8** %ptr
%tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 16) %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8* %A, i32 16)
%tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
%tmp4 = add <8 x i8> %tmp2, %tmp3 %tmp4 = add <8 x i8> %tmp2, %tmp3
@ -41,7 +41,7 @@ define <4 x i16> @vld4i16(i16* %A) nounwind {
;Check the alignment value. Max for this instruction is 256 bits: ;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld4.16 {d16, d17, d18, d19}, [r0:128] ;CHECK: vld4.16 {d16, d17, d18, d19}, [r0:128]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8* %tmp0, i32 16) %tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16.p0i8(i8* %tmp0, i32 16)
%tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 2
%tmp4 = add <4 x i16> %tmp2, %tmp3 %tmp4 = add <4 x i16> %tmp2, %tmp3
@ -53,7 +53,7 @@ define <2 x i32> @vld4i32(i32* %A) nounwind {
;Check the alignment value. Max for this instruction is 256 bits: ;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld4.32 {d16, d17, d18, d19}, [r0:256] ;CHECK: vld4.32 {d16, d17, d18, d19}, [r0:256]
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %tmp0, i32 32) %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8* %tmp0, i32 32)
%tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2
%tmp4 = add <2 x i32> %tmp2, %tmp3 %tmp4 = add <2 x i32> %tmp2, %tmp3
@ -64,7 +64,7 @@ define <2 x float> @vld4f(float* %A) nounwind {
;CHECK-LABEL: vld4f: ;CHECK-LABEL: vld4f:
;CHECK: vld4.32 ;CHECK: vld4.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 2
%tmp4 = fadd <2 x float> %tmp2, %tmp3 %tmp4 = fadd <2 x float> %tmp2, %tmp3
@ -76,7 +76,7 @@ define <1 x i64> @vld4i64(i64* %A) nounwind {
;Check the alignment value. Max for this instruction is 256 bits: ;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld1.64 {d16, d17, d18, d19}, [r0:256] ;CHECK: vld1.64 {d16, d17, d18, d19}, [r0:256]
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64) %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64.p0i8(i8* %tmp0, i32 64)
%tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2
%tmp4 = add <1 x i64> %tmp2, %tmp3 %tmp4 = add <1 x i64> %tmp2, %tmp3
@ -87,7 +87,7 @@ define <1 x i64> @vld4i64_update(i64** %ptr, i64* %A) nounwind {
;CHECK-LABEL: vld4i64_update: ;CHECK-LABEL: vld4i64_update:
;CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]! ;CHECK: vld1.64 {d16, d17, d18, d19}, [r1:256]!
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64) %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64.p0i8(i8* %tmp0, i32 64)
%tmp5 = getelementptr i64, i64* %A, i32 4 %tmp5 = getelementptr i64, i64* %A, i32 4
store i64* %tmp5, i64** %ptr store i64* %tmp5, i64** %ptr
%tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
@ -101,7 +101,7 @@ define <16 x i8> @vld4Qi8(i8* %A) nounwind {
;Check the alignment value. Max for this instruction is 256 bits: ;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vld4.8 {d16, d18, d20, d22}, [r0:256]! ;CHECK: vld4.8 {d16, d18, d20, d22}, [r0:256]!
;CHECK: vld4.8 {d17, d19, d21, d23}, [r0:256] ;CHECK: vld4.8 {d17, d19, d21, d23}, [r0:256]
%tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8* %A, i32 64) %tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8.p0i8(i8* %A, i32 64)
%tmp2 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 2
%tmp4 = add <16 x i8> %tmp2, %tmp3 %tmp4 = add <16 x i8> %tmp2, %tmp3
@ -114,7 +114,7 @@ define <8 x i16> @vld4Qi16(i16* %A) nounwind {
;CHECK: vld4.16 {d16, d18, d20, d22}, [r0]! ;CHECK: vld4.16 {d16, d18, d20, d22}, [r0]!
;CHECK: vld4.16 {d17, d19, d21, d23}, [r0] ;CHECK: vld4.16 {d17, d19, d21, d23}, [r0]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2
%tmp4 = add <8 x i16> %tmp2, %tmp3 %tmp4 = add <8 x i16> %tmp2, %tmp3
@ -128,7 +128,7 @@ define <8 x i16> @vld4Qi16_update(i16** %ptr) nounwind {
;CHECK: vld4.16 {d17, d19, d21, d23}, [r1:64]! ;CHECK: vld4.16 {d17, d19, d21, d23}, [r1:64]!
%A = load i16*, i16** %ptr %A = load i16*, i16** %ptr
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8) %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16.p0i8(i8* %tmp0, i32 8)
%tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2
%tmp4 = add <8 x i16> %tmp2, %tmp3 %tmp4 = add <8 x i16> %tmp2, %tmp3
@ -142,7 +142,7 @@ define <4 x i32> @vld4Qi32(i32* %A) nounwind {
;CHECK: vld4.32 ;CHECK: vld4.32
;CHECK: vld4.32 ;CHECK: vld4.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 2
%tmp4 = add <4 x i32> %tmp2, %tmp3 %tmp4 = add <4 x i32> %tmp2, %tmp3
@ -154,20 +154,20 @@ define <4 x float> @vld4Qf(float* %A) nounwind {
;CHECK: vld4.32 ;CHECK: vld4.32
;CHECK: vld4.32 ;CHECK: vld4.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8* %tmp0, i32 1) %tmp1 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32.p0i8(i8* %tmp0, i32 1)
%tmp2 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 2 %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 2
%tmp4 = fadd <4 x float> %tmp2, %tmp3 %tmp4 = fadd <4 x float> %tmp2, %tmp3
ret <4 x float> %tmp4 ret <4 x float> %tmp4
} }
declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8*, i32) nounwind readonly declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8*, i32) nounwind readonly declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8*, i32) nounwind readonly declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8*, i32) nounwind readonly declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8*, i32) nounwind readonly declare %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8*, i32) nounwind readonly declare %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8*, i32) nounwind readonly declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8*, i32) nounwind readonly declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32.p0i8(i8*, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8*, i32) nounwind readonly declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32.p0i8(i8*, i32) nounwind readonly

View File

@ -66,7 +66,7 @@ define <8 x i8> @vld2dupi8(i8* %A) nounwind {
;CHECK-LABEL: vld2dupi8: ;CHECK-LABEL: vld2dupi8:
;Check the (default) alignment value. ;Check the (default) alignment value.
;CHECK: vld2.8 {d16[], d17[]}, [r0] ;CHECK: vld2.8 {d16[], d17[]}, [r0]
%tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) %tmp0 = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
%tmp1 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 0 %tmp1 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 1 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp0, 1
@ -80,7 +80,7 @@ define <4 x i16> @vld2dupi16(i8* %A) nounwind {
;Check that a power-of-two alignment smaller than the total size of the memory ;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored. ;being loaded is ignored.
;CHECK: vld2.16 {d16[], d17[]}, [r0] ;CHECK: vld2.16 {d16[], d17[]}, [r0]
%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
%tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0 %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1 %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
@ -95,7 +95,7 @@ define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind {
;CHECK: vld2.16 {d16[], d17[]}, [r1]! ;CHECK: vld2.16 {d16[], d17[]}, [r1]!
%A = load i16*, i16** %ptr %A = load i16*, i16** %ptr
%A2 = bitcast i16* %A to i8* %A2 = bitcast i16* %A to i8*
%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %A2, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
%tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0 %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1 %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
@ -110,7 +110,7 @@ define <2 x i32> @vld2dupi32(i8* %A) nounwind {
;CHECK-LABEL: vld2dupi32: ;CHECK-LABEL: vld2dupi32:
;Check the alignment value. Max for this instruction is 64 bits: ;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vld2.32 {d16[], d17[]}, [r0:64] ;CHECK: vld2.32 {d16[], d17[]}, [r0:64]
%tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16) %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
%tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0 %tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1 %tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
@ -119,9 +119,9 @@ define <2 x i32> @vld2dupi32(i8* %A) nounwind {
ret <2 x i32> %tmp5 ret <2 x i32> %tmp5
} }
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> } %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
@ -131,7 +131,7 @@ define <8 x i8> @vld3dupi8_update(i8** %ptr, i32 %inc) nounwind {
;CHECK-LABEL: vld3dupi8_update: ;CHECK-LABEL: vld3dupi8_update:
;CHECK: vld3.8 {d16[], d17[], d18[]}, [r2], r1 ;CHECK: vld3.8 {d16[], d17[], d18[]}, [r2], r1
%A = load i8*, i8** %ptr %A = load i8*, i8** %ptr
%tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8) %tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8)
%tmp1 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 0 %tmp1 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 1 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 1
@ -149,7 +149,7 @@ define <4 x i16> @vld3dupi16(i8* %A) nounwind {
;CHECK-LABEL: vld3dupi16: ;CHECK-LABEL: vld3dupi16:
;Check the (default) alignment value. VLD3 does not support alignment. ;Check the (default) alignment value. VLD3 does not support alignment.
;CHECK: vld3.16 {d16[], d17[], d18[]}, [r0] ;CHECK: vld3.16 {d16[], d17[], d18[]}, [r0]
%tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8) %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
%tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0 %tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1
@ -161,8 +161,8 @@ define <4 x i16> @vld3dupi16(i8* %A) nounwind {
ret <4 x i16> %tmp8 ret <4 x i16> %tmp8
} }
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
@ -173,7 +173,7 @@ define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind {
;CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]! ;CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]!
%A = load i16*, i16** %ptr %A = load i16*, i16** %ptr
%A2 = bitcast i16* %A to i8* %A2 = bitcast i16* %A to i8*
%tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1) %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %A2, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
%tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0 %tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1
@ -195,7 +195,7 @@ define <2 x i32> @vld4dupi32(i8* %A) nounwind {
;Check the alignment value. An 8-byte alignment is allowed here even though ;Check the alignment value. An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded. ;it is smaller than the total size of the memory being loaded.
;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r0:64] ;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r0:64]
%tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8) %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
%tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0 %tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1
@ -210,5 +210,5 @@ define <2 x i32> @vld4dupi32(i8* %A) nounwind {
ret <2 x i32> %tmp11 ret <2 x i32> %tmp11
} }
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly

View File

@ -102,7 +102,7 @@ define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;Check the alignment value. Max for this instruction is 16 bits: ;Check the alignment value. Max for this instruction is 16 bits:
;CHECK: vld2.8 {d16[1], d17[1]}, [r0:16] ;CHECK: vld2.8 {d16[1], d17[1]}, [r0:16]
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4) %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
%tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
%tmp5 = add <8 x i8> %tmp3, %tmp4 %tmp5 = add <8 x i8> %tmp3, %tmp4
@ -115,7 +115,7 @@ define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld2.16 {d16[1], d17[1]}, [r0:32] ;CHECK: vld2.16 {d16[1], d17[1]}, [r0:32]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
%tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8) %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
%tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
%tmp5 = add <4 x i16> %tmp3, %tmp4 %tmp5 = add <4 x i16> %tmp3, %tmp4
@ -127,7 +127,7 @@ define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld2.32 ;CHECK: vld2.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1) %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
%tmp5 = add <2 x i32> %tmp3, %tmp4 %tmp5 = add <2 x i32> %tmp3, %tmp4
@ -141,7 +141,7 @@ define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
%A = load i32*, i32** %ptr %A = load i32*, i32** %ptr
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1) %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
%tmp5 = add <2 x i32> %tmp3, %tmp4 %tmp5 = add <2 x i32> %tmp3, %tmp4
@ -155,7 +155,7 @@ define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld2.32 ;CHECK: vld2.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
%tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1) %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
%tmp5 = fadd <2 x float> %tmp3, %tmp4 %tmp5 = fadd <2 x float> %tmp3, %tmp4
@ -168,7 +168,7 @@ define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}] ;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1) %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
%tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
%tmp5 = add <8 x i16> %tmp3, %tmp4 %tmp5 = add <8 x i16> %tmp3, %tmp4
@ -181,7 +181,7 @@ define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}:64] ;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}:64]
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16) %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
%tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
%tmp5 = add <4 x i32> %tmp3, %tmp4 %tmp5 = add <4 x i32> %tmp3, %tmp4
@ -193,21 +193,21 @@ define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld2.32 ;CHECK: vld2.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
%tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1) %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
%tmp5 = fadd <4 x float> %tmp3, %tmp4 %tmp5 = fadd <4 x float> %tmp3, %tmp4
ret <4 x float> %tmp5 ret <4 x float> %tmp5
} }
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly
declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> } %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
@ -222,7 +222,7 @@ define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vld3lanei8: ;CHECK-LABEL: vld3lanei8:
;CHECK: vld3.8 ;CHECK: vld3.8
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
%tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
@ -237,7 +237,7 @@ define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}] ;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8) %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
%tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
@ -251,7 +251,7 @@ define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld3.32 ;CHECK: vld3.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
%tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1) %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
@ -265,7 +265,7 @@ define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld3.32 ;CHECK: vld3.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
%tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1) %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
@ -280,7 +280,7 @@ define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}] ;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8) %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
%tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
@ -296,7 +296,7 @@ define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounw
%A = load i16*, i16** %ptr %A = load i16*, i16** %ptr
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8) %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
%tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
@ -312,7 +312,7 @@ define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld3.32 ;CHECK: vld3.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1) %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
%tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
@ -326,7 +326,7 @@ define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld3.32 ;CHECK: vld3.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
%tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1) %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
@ -335,14 +335,14 @@ define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
ret <4 x float> %tmp7 ret <4 x float> %tmp7
} }
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
@ -358,7 +358,7 @@ define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;Check the alignment value. Max for this instruction is 32 bits: ;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}:32] ;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}:32]
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8) %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
@ -375,7 +375,7 @@ define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:32]! ;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:32]!
%A = load i8*, i8** %ptr %A = load i8*, i8** %ptr
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8) %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
@ -395,7 +395,7 @@ define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}] ;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4) %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
%tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
@ -413,7 +413,7 @@ define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:64] ;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:64]
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8) %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
%tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
@ -429,7 +429,7 @@ define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld4.32 ;CHECK: vld4.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
%tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1) %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
@ -446,7 +446,7 @@ define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}:64] ;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}:64]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16) %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
%tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
@ -463,7 +463,7 @@ define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}] ;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1) %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
%tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
@ -479,7 +479,7 @@ define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld4.32 ;CHECK: vld4.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
%tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1) %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
%tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0 %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
%tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1 %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
%tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2 %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
@ -490,14 +490,14 @@ define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
ret <4 x float> %tmp9 ret <4 x float> %tmp9
} }
declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register ; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because ; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
@ -511,7 +511,7 @@ define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
%tmp65 = shl i128 %tmp64, 64 %tmp65 = shl i128 %tmp64, 64
%ins67 = or i128 %tmp65, 0 %ins67 = or i128 %tmp65, 0
%tmp78 = bitcast i128 %ins67 to <8 x i16> %tmp78 = bitcast i128 %ins67 to <8 x i16>
%vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2) %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
%tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
%tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
%tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2

View File

@ -393,8 +393,8 @@ entry:
%sub.i = sub <4 x i32> %add.i185, zeroinitializer %sub.i = sub <4 x i32> %add.i185, zeroinitializer
%add.i = add <4 x i32> %sub.i, zeroinitializer %add.i = add <4 x i32> %sub.i, zeroinitializer
%vmovn.i = trunc <4 x i32> %add.i to <4 x i16> %vmovn.i = trunc <4 x i32> %add.i to <4 x i16>
tail call void @llvm.arm.neon.vst1.v4i16(i8* undef, <4 x i16> %vmovn.i, i32 2) tail call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* undef, <4 x i16> %vmovn.i, i32 2)
unreachable unreachable
} }
declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4i16(i8*, <4 x i16>, i32) nounwind

View File

@ -447,7 +447,7 @@ entry:
%0 = trunc i32 %mul to i8 %0 = trunc i32 %mul to i8
%1 = insertelement <8 x i8> undef, i8 %0, i32 0 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
%2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
%3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1) %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %src, i32 1)
%4 = bitcast <16 x i8> %3 to <2 x double> %4 = bitcast <16 x i8> %3 to <2 x double>
%5 = extractelement <2 x double> %4, i32 1 %5 = extractelement <2 x double> %4, i32 1
%6 = bitcast double %5 to <8 x i8> %6 = bitcast double %5 to <8 x i8>
@ -459,13 +459,13 @@ entry:
%12 = add <8 x i16> %7, %11 %12 = add <8 x i16> %7, %11
%13 = mul <8 x i16> %12, %8 %13 = mul <8 x i16> %12, %8
%14 = bitcast i16* %dst to i8* %14 = bitcast i16* %dst to i8*
tail call void @llvm.arm.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2) tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %14, <8 x i16> %13, i32 2)
ret void ret void
} }
declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly declare <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
; Take advantage of the Cortex-A8 multiplier accumulator forward. ; Take advantage of the Cortex-A8 multiplier accumulator forward.
@ -480,7 +480,7 @@ entry:
%0 = trunc i32 %mul to i8 %0 = trunc i32 %mul to i8
%1 = insertelement <8 x i8> undef, i8 %0, i32 0 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
%2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
%3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1) %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %src, i32 1)
%4 = bitcast <16 x i8> %3 to <2 x double> %4 = bitcast <16 x i8> %3 to <2 x double>
%5 = extractelement <2 x double> %4, i32 1 %5 = extractelement <2 x double> %4, i32 1
%6 = bitcast double %5 to <8 x i8> %6 = bitcast double %5 to <8 x i8>
@ -502,7 +502,7 @@ entry:
%0 = trunc i32 %mul to i8 %0 = trunc i32 %mul to i8
%1 = insertelement <8 x i8> undef, i8 %0, i32 0 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
%2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
%3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1) %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %src, i32 1)
%4 = bitcast <16 x i8> %3 to <2 x double> %4 = bitcast <16 x i8> %3 to <2 x double>
%5 = extractelement <2 x double> %4, i32 1 %5 = extractelement <2 x double> %4, i32 1
%6 = bitcast double %5 to <8 x i8> %6 = bitcast double %5 to <8 x i8>
@ -559,7 +559,7 @@ for.body33.lr.ph: ; preds = %for.body
for.body33: ; preds = %for.body33, %for.body33.lr.ph for.body33: ; preds = %for.body33, %for.body33.lr.ph
%add45 = add i32 undef, undef %add45 = add i32 undef, undef
%vld155 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* undef, i32 1) %vld155 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* undef, i32 1)
%0 = load i32*, i32** undef, align 4 %0 = load i32*, i32** undef, align 4
%shuffle.i250 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer %shuffle.i250 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
%1 = bitcast <1 x i64> %shuffle.i250 to <8 x i8> %1 = bitcast <1 x i64> %shuffle.i250 to <8 x i8>

View File

@ -5,7 +5,7 @@ define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind {
;Check the alignment value. Max for this instruction is 64 bits: ;Check the alignment value. Max for this instruction is 64 bits:
;CHECK: vst1.8 {d16}, [r0:64] ;CHECK: vst1.8 {d16}, [r0:64]
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1, i32 16) call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, i32 16)
ret void ret void
} }
@ -14,7 +14,7 @@ define void @vst1i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst1.16 ;CHECK: vst1.16
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
call void @llvm.arm.neon.vst1.v4i16(i8* %tmp0, <4 x i16> %tmp1, i32 1) call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, i32 1)
ret void ret void
} }
@ -23,7 +23,7 @@ define void @vst1i32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst1.32 ;CHECK: vst1.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
call void @llvm.arm.neon.vst1.v2i32(i8* %tmp0, <2 x i32> %tmp1, i32 1) call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, i32 1)
ret void ret void
} }
@ -32,7 +32,7 @@ define void @vst1f(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst1.32 ;CHECK: vst1.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1) call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1)
ret void ret void
} }
@ -43,7 +43,7 @@ define void @vst1f_update(float** %ptr, <2 x float>* %B) nounwind {
%A = load float*, float** %ptr %A = load float*, float** %ptr
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1) call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1)
%tmp2 = getelementptr float, float* %A, i32 2 %tmp2 = getelementptr float, float* %A, i32 2
store float* %tmp2, float** %ptr store float* %tmp2, float** %ptr
ret void ret void
@ -54,7 +54,7 @@ define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind {
;CHECK: vst1.64 ;CHECK: vst1.64
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>, <1 x i64>* %B %tmp1 = load <1 x i64>, <1 x i64>* %B
call void @llvm.arm.neon.vst1.v1i64(i8* %tmp0, <1 x i64> %tmp1, i32 1) call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, i32 1)
ret void ret void
} }
@ -63,7 +63,7 @@ define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind {
;Check the alignment value. Max for this instruction is 128 bits: ;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vst1.8 {d16, d17}, [r0:64] ;CHECK: vst1.8 {d16, d17}, [r0:64]
%tmp1 = load <16 x i8>, <16 x i8>* %B %tmp1 = load <16 x i8>, <16 x i8>* %B
call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1, i32 8) call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %A, <16 x i8> %tmp1, i32 8)
ret void ret void
} }
@ -73,7 +73,7 @@ define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst1.16 {d16, d17}, [r0:128] ;CHECK: vst1.16 {d16, d17}, [r0:128]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 32) call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 32)
ret void ret void
} }
@ -84,7 +84,7 @@ define void @vst1Qi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
%A = load i16*, i16** %ptr %A = load i16*, i16** %ptr
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 8) call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 8)
%tmp2 = getelementptr i16, i16* %A, i32 %inc %tmp2 = getelementptr i16, i16* %A, i32 %inc
store i16* %tmp2, i16** %ptr store i16* %tmp2, i16** %ptr
ret void ret void
@ -95,7 +95,7 @@ define void @vst1Qi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst1.32 ;CHECK: vst1.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
call void @llvm.arm.neon.vst1.v4i32(i8* %tmp0, <4 x i32> %tmp1, i32 1) call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, i32 1)
ret void ret void
} }
@ -104,7 +104,7 @@ define void @vst1Qf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst1.32 ;CHECK: vst1.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
call void @llvm.arm.neon.vst1.v4f32(i8* %tmp0, <4 x float> %tmp1, i32 1) call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, i32 1)
ret void ret void
} }
@ -113,7 +113,7 @@ define void @vst1Qi64(i64* %A, <2 x i64>* %B) nounwind {
;CHECK: vst1.64 ;CHECK: vst1.64
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = load <2 x i64>, <2 x i64>* %B %tmp1 = load <2 x i64>, <2 x i64>* %B
call void @llvm.arm.neon.vst1.v2i64(i8* %tmp0, <2 x i64> %tmp1, i32 1) call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %tmp0, <2 x i64> %tmp1, i32 1)
ret void ret void
} }
@ -122,19 +122,19 @@ define void @vst1Qf64(double* %A, <2 x double>* %B) nounwind {
;CHECK: vst1.64 ;CHECK: vst1.64
%tmp0 = bitcast double* %A to i8* %tmp0 = bitcast double* %A to i8*
%tmp1 = load <2 x double>, <2 x double>* %B %tmp1 = load <2 x double>, <2 x double>* %B
call void @llvm.arm.neon.vst1.v2f64(i8* %tmp0, <2 x double> %tmp1, i32 1) call void @llvm.arm.neon.vst1.p0i8.v2f64(i8* %tmp0, <2 x double> %tmp1, i32 1)
ret void ret void
} }
declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v8i8(i8*, <8 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4i16(i8*, <4 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v2f32(i8*, <2 x float>, i32) nounwind
declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v1i64(i8*, <1 x i64>, i32) nounwind
declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4i32(i8*, <4 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v2i64(i8*, <2 x i64>, i32) nounwind
declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v2f64(i8*, <2 x double>, i32) nounwind

View File

@ -5,7 +5,7 @@ define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
;Check the alignment value. Max for this instruction is 128 bits: ;Check the alignment value. Max for this instruction is 128 bits:
;CHECK: vst2.8 {d16, d17}, [r0:64] ;CHECK: vst2.8 {d16, d17}, [r0:64]
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8) call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
ret void ret void
} }
@ -15,7 +15,7 @@ define void @vst2i8_update(i8** %ptr, <8 x i8>* %B, i32 %inc) nounwind {
;CHECK: vst2.8 {d16, d17}, [r1], r2 ;CHECK: vst2.8 {d16, d17}, [r1], r2
%A = load i8*, i8** %ptr %A = load i8*, i8** %ptr
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 4) call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 4)
%tmp2 = getelementptr i8, i8* %A, i32 %inc %tmp2 = getelementptr i8, i8* %A, i32 %inc
store i8* %tmp2, i8** %ptr store i8* %tmp2, i8** %ptr
ret void ret void
@ -27,7 +27,7 @@ define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst2.16 {d16, d17}, [r0:128] ;CHECK: vst2.16 {d16, d17}, [r0:128]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
call void @llvm.arm.neon.vst2.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 32) call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 32)
ret void ret void
} }
@ -36,7 +36,7 @@ define void @vst2i32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst2.32 ;CHECK: vst2.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
call void @llvm.arm.neon.vst2.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
ret void ret void
} }
@ -45,7 +45,7 @@ define void @vst2f(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst2.32 ;CHECK: vst2.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
call void @llvm.arm.neon.vst2.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) call void @llvm.arm.neon.vst2.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
ret void ret void
} }
@ -55,7 +55,7 @@ define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind {
;CHECK: vst1.64 {d16, d17}, [r0:128] ;CHECK: vst1.64 {d16, d17}, [r0:128]
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>, <1 x i64>* %B %tmp1 = load <1 x i64>, <1 x i64>* %B
call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 32) call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 32)
ret void ret void
} }
@ -66,7 +66,7 @@ define void @vst2i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
%A = load i64*, i64** %ptr %A = load i64*, i64** %ptr
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>, <1 x i64>* %B %tmp1 = load <1 x i64>, <1 x i64>* %B
call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 8) call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 8)
%tmp2 = getelementptr i64, i64* %A, i32 2 %tmp2 = getelementptr i64, i64* %A, i32 2
store i64* %tmp2, i64** %ptr store i64* %tmp2, i64** %ptr
ret void ret void
@ -77,7 +77,7 @@ define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
;Check the alignment value. Max for this instruction is 256 bits: ;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vst2.8 {d16, d17, d18, d19}, [r0:64] ;CHECK: vst2.8 {d16, d17, d18, d19}, [r0:64]
%tmp1 = load <16 x i8>, <16 x i8>* %B %tmp1 = load <16 x i8>, <16 x i8>* %B
call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 8) call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 8)
ret void ret void
} }
@ -87,7 +87,7 @@ define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst2.16 {d16, d17, d18, d19}, [r0:128] ;CHECK: vst2.16 {d16, d17, d18, d19}, [r0:128]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 16) call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 16)
ret void ret void
} }
@ -97,7 +97,7 @@ define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst2.32 {d16, d17, d18, d19}, [r0:256] ;CHECK: vst2.32 {d16, d17, d18, d19}, [r0:256]
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 64) call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 64)
ret void ret void
} }
@ -106,7 +106,7 @@ define void @vst2Qf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst2.32 ;CHECK: vst2.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
call void @llvm.arm.neon.vst2.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
ret void ret void
} }
@ -114,7 +114,7 @@ define i8* @vst2update(i8* %out, <4 x i16>* %B) nounwind {
;CHECK-LABEL: vst2update: ;CHECK-LABEL: vst2update:
;CHECK: vst2.16 {d16, d17}, [r0]! ;CHECK: vst2.16 {d16, d17}, [r0]!
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
tail call void @llvm.arm.neon.vst2.v4i16(i8* %out, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 2) tail call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* %out, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 2)
%t5 = getelementptr inbounds i8, i8* %out, i32 16 %t5 = getelementptr inbounds i8, i8* %out, i32 16
ret i8* %t5 ret i8* %t5
} }
@ -123,18 +123,18 @@ define i8* @vst2update2(i8 * %out, <4 x float> * %this) nounwind optsize ssp ali
;CHECK-LABEL: vst2update2: ;CHECK-LABEL: vst2update2:
;CHECK: vst2.32 {d16, d17, d18, d19}, [r0]! ;CHECK: vst2.32 {d16, d17, d18, d19}, [r0]!
%tmp1 = load <4 x float>, <4 x float>* %this %tmp1 = load <4 x float>, <4 x float>* %this
call void @llvm.arm.neon.vst2.v4f32(i8* %out, <4 x float> %tmp1, <4 x float> %tmp1, i32 4) nounwind call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %out, <4 x float> %tmp1, <4 x float> %tmp1, i32 4) nounwind
%tmp2 = getelementptr inbounds i8, i8* %out, i32 32 %tmp2 = getelementptr inbounds i8, i8* %out, i32 32
ret i8* %tmp2 ret i8* %tmp2
} }
declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind
declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v1i64(i8*, <1 x i64>, <1 x i64>, i32) nounwind
declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v16i8(i8*, <16 x i8>, <16 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind

View File

@ -6,7 +6,7 @@ define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind {
;This test runs at -O0 so do not check for specific register numbers. ;This test runs at -O0 so do not check for specific register numbers.
;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64] ;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst3.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 32) call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 32)
ret void ret void
} }
@ -15,7 +15,7 @@ define void @vst3i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst3.16 ;CHECK: vst3.16
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
call void @llvm.arm.neon.vst3.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
ret void ret void
} }
@ -24,7 +24,7 @@ define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst3.32 ;CHECK: vst3.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
ret void ret void
} }
@ -35,7 +35,7 @@ define void @vst3i32_update(i32** %ptr, <2 x i32>* %B) nounwind {
%A = load i32*, i32** %ptr %A = load i32*, i32** %ptr
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
%tmp2 = getelementptr i32, i32* %A, i32 6 %tmp2 = getelementptr i32, i32* %A, i32 6
store i32* %tmp2, i32** %ptr store i32* %tmp2, i32** %ptr
ret void ret void
@ -46,7 +46,7 @@ define void @vst3f(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst3.32 ;CHECK: vst3.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
call void @llvm.arm.neon.vst3.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) call void @llvm.arm.neon.vst3.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
ret void ret void
} }
@ -57,7 +57,7 @@ define void @vst3i64(i64* %A, <1 x i64>* %B) nounwind {
;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64] ;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>, <1 x i64>* %B %tmp1 = load <1 x i64>, <1 x i64>* %B
call void @llvm.arm.neon.vst3.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 16) call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 16)
ret void ret void
} }
@ -67,7 +67,7 @@ define void @vst3i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
%A = load i64*, i64** %ptr %A = load i64*, i64** %ptr
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>, <1 x i64>* %B %tmp1 = load <1 x i64>, <1 x i64>* %B
call void @llvm.arm.neon.vst3.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1) call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
%tmp2 = getelementptr i64, i64* %A, i32 3 %tmp2 = getelementptr i64, i64* %A, i32 3
store i64* %tmp2, i64** %ptr store i64* %tmp2, i64** %ptr
ret void ret void
@ -80,7 +80,7 @@ define void @vst3Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]! ;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]!
;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64] ;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]
%tmp1 = load <16 x i8>, <16 x i8>* %B %tmp1 = load <16 x i8>, <16 x i8>* %B
call void @llvm.arm.neon.vst3.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 32) call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 32)
ret void ret void
} }
@ -90,7 +90,7 @@ define void @vst3Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst3.16 ;CHECK: vst3.16
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
ret void ret void
} }
@ -102,7 +102,7 @@ define void @vst3Qi16_update(i16** %ptr, <8 x i16>* %B) nounwind {
%A = load i16*, i16** %ptr %A = load i16*, i16** %ptr
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
%tmp2 = getelementptr i16, i16* %A, i32 24 %tmp2 = getelementptr i16, i16* %A, i32 24
store i16* %tmp2, i16** %ptr store i16* %tmp2, i16** %ptr
ret void ret void
@ -114,7 +114,7 @@ define void @vst3Qi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst3.32 ;CHECK: vst3.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
call void @llvm.arm.neon.vst3.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1) call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
ret void ret void
} }
@ -124,17 +124,17 @@ define void @vst3Qf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst3.32 ;CHECK: vst3.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
call void @llvm.arm.neon.vst3.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) call void @llvm.arm.neon.vst3.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
ret void ret void
} }
declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind declare void @llvm.arm.neon.vst3.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind declare void @llvm.arm.neon.vst3.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind declare void @llvm.arm.neon.vst3.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind declare void @llvm.arm.neon.vst3.p0i8.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind
declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind declare void @llvm.arm.neon.vst3.p0i8.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind declare void @llvm.arm.neon.vst3.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind declare void @llvm.arm.neon.vst3.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst3.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind

View File

@ -5,7 +5,7 @@ define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
;Check the alignment value. Max for this instruction is 256 bits: ;Check the alignment value. Max for this instruction is 256 bits:
;CHECK: vst4.8 {d16, d17, d18, d19}, [r0:64] ;CHECK: vst4.8 {d16, d17, d18, d19}, [r0:64]
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8) call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
ret void ret void
} }
@ -15,7 +15,7 @@ define void @vst4i8_update(i8** %ptr, <8 x i8>* %B, i32 %inc) nounwind {
;CHECK: vst4.8 {d16, d17, d18, d19}, [r1:128], r2 ;CHECK: vst4.8 {d16, d17, d18, d19}, [r1:128], r2
%A = load i8*, i8** %ptr %A = load i8*, i8** %ptr
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 16) call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 16)
%tmp2 = getelementptr i8, i8* %A, i32 %inc %tmp2 = getelementptr i8, i8* %A, i32 %inc
store i8* %tmp2, i8** %ptr store i8* %tmp2, i8** %ptr
ret void ret void
@ -27,7 +27,7 @@ define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst4.16 {d16, d17, d18, d19}, [r0:128] ;CHECK: vst4.16 {d16, d17, d18, d19}, [r0:128]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
call void @llvm.arm.neon.vst4.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 16) call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 16)
ret void ret void
} }
@ -37,7 +37,7 @@ define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst4.32 {d16, d17, d18, d19}, [r0:256] ;CHECK: vst4.32 {d16, d17, d18, d19}, [r0:256]
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
call void @llvm.arm.neon.vst4.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 32) call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 32)
ret void ret void
} }
@ -46,7 +46,7 @@ define void @vst4f(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst4.32 ;CHECK: vst4.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
call void @llvm.arm.neon.vst4.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) call void @llvm.arm.neon.vst4.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
ret void ret void
} }
@ -56,7 +56,7 @@ define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind {
;CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256] ;CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256]
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>, <1 x i64>* %B %tmp1 = load <1 x i64>, <1 x i64>* %B
call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 64) call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 64)
ret void ret void
} }
@ -66,7 +66,7 @@ define void @vst4i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
%A = load i64*, i64** %ptr %A = load i64*, i64** %ptr
%tmp0 = bitcast i64* %A to i8* %tmp0 = bitcast i64* %A to i8*
%tmp1 = load <1 x i64>, <1 x i64>* %B %tmp1 = load <1 x i64>, <1 x i64>* %B
call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1) call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
%tmp2 = getelementptr i64, i64* %A, i32 4 %tmp2 = getelementptr i64, i64* %A, i32 4
store i64* %tmp2, i64** %ptr store i64* %tmp2, i64** %ptr
ret void ret void
@ -78,7 +78,7 @@ define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst4.8 {d16, d18, d20, d22}, [r0:256]! ;CHECK: vst4.8 {d16, d18, d20, d22}, [r0:256]!
;CHECK: vst4.8 {d17, d19, d21, d23}, [r0:256] ;CHECK: vst4.8 {d17, d19, d21, d23}, [r0:256]
%tmp1 = load <16 x i8>, <16 x i8>* %B %tmp1 = load <16 x i8>, <16 x i8>* %B
call void @llvm.arm.neon.vst4.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 64) call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 64)
ret void ret void
} }
@ -89,7 +89,7 @@ define void @vst4Qi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst4.16 {d17, d19, d21, d23}, [r0] ;CHECK: vst4.16 {d17, d19, d21, d23}, [r0]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst4.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
ret void ret void
} }
@ -99,7 +99,7 @@ define void @vst4Qi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst4.32 ;CHECK: vst4.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
call void @llvm.arm.neon.vst4.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1) call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
ret void ret void
} }
@ -109,7 +109,7 @@ define void @vst4Qf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst4.32 ;CHECK: vst4.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
ret void ret void
} }
@ -121,19 +121,19 @@ define void @vst4Qf_update(float** %ptr, <4 x float>* %B) nounwind {
%A = load float*, float** %ptr %A = load float*, float** %ptr
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
%tmp2 = getelementptr float, float* %A, i32 16 %tmp2 = getelementptr float, float* %A, i32 16
store float* %tmp2, float** %ptr store float* %tmp2, float** %ptr
ret void ret void
} }
declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind
declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) nounwind

View File

@ -110,7 +110,7 @@ define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;Check the alignment value. Max for this instruction is 16 bits: ;Check the alignment value. Max for this instruction is 16 bits:
;CHECK: vst2.8 {d16[1], d17[1]}, [r0:16] ;CHECK: vst2.8 {d16[1], d17[1]}, [r0:16]
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4) call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
ret void ret void
} }
@ -120,7 +120,7 @@ define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst2.16 {d16[1], d17[1]}, [r0:32] ;CHECK: vst2.16 {d16[1], d17[1]}, [r0:32]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8) call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
ret void ret void
} }
@ -131,7 +131,7 @@ define void @vst2lanei16_update(i16** %ptr, <4 x i16>* %B, i32 %inc) nounwind {
%A = load i16*, i16** %ptr %A = load i16*, i16** %ptr
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2) call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
%tmp2 = getelementptr i16, i16* %A, i32 %inc %tmp2 = getelementptr i16, i16* %A, i32 %inc
store i16* %tmp2, i16** %ptr store i16* %tmp2, i16** %ptr
ret void ret void
@ -142,7 +142,7 @@ define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst2.32 ;CHECK: vst2.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
call void @llvm.arm.neon.vst2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1) call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
ret void ret void
} }
@ -151,7 +151,7 @@ define void @vst2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst2.32 ;CHECK: vst2.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
call void @llvm.arm.neon.vst2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1) call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
ret void ret void
} }
@ -161,7 +161,7 @@ define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst2.16 {d17[1], d19[1]}, [r0] ;CHECK: vst2.16 {d17[1], d19[1]}, [r0]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1) call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
ret void ret void
} }
@ -171,7 +171,7 @@ define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst2.32 {d17[0], d19[0]}, [r0:64] ;CHECK: vst2.32 {d17[0], d19[0]}, [r0:64]
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16) call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
ret void ret void
} }
@ -180,24 +180,24 @@ define void @vst2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst2.32 ;CHECK: vst2.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
call void @llvm.arm.neon.vst2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1) call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1)
ret void ret void
} }
declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind declare void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind declare void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind declare void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind declare void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind declare void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind declare void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind declare void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind
define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind { define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vst3lanei8: ;CHECK-LABEL: vst3lanei8:
;CHECK: vst3.8 ;CHECK: vst3.8
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
ret void ret void
} }
@ -207,7 +207,7 @@ define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst3.16 {d16[1], d17[1], d18[1]}, [r0] ;CHECK: vst3.16 {d16[1], d17[1], d18[1]}, [r0]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
call void @llvm.arm.neon.vst3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8) call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
ret void ret void
} }
@ -216,7 +216,7 @@ define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst3.32 ;CHECK: vst3.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
call void @llvm.arm.neon.vst3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1) call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
ret void ret void
} }
@ -225,7 +225,7 @@ define void @vst3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst3.32 ;CHECK: vst3.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
call void @llvm.arm.neon.vst3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1) call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
ret void ret void
} }
@ -235,7 +235,7 @@ define void @vst3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst3.16 {d17[2], d19[2], d21[2]}, [r0] ;CHECK: vst3.16 {d17[2], d19[2], d21[2]}, [r0]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 8) call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 8)
ret void ret void
} }
@ -244,7 +244,7 @@ define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst3.32 ;CHECK: vst3.32
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1) call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
ret void ret void
} }
@ -255,7 +255,7 @@ define void @vst3laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
%A = load i32*, i32** %ptr %A = load i32*, i32** %ptr
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1) call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
%tmp2 = getelementptr i32, i32* %A, i32 3 %tmp2 = getelementptr i32, i32* %A, i32 3
store i32* %tmp2, i32** %ptr store i32* %tmp2, i32** %ptr
ret void ret void
@ -266,18 +266,18 @@ define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst3.32 ;CHECK: vst3.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
call void @llvm.arm.neon.vst3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1) call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
ret void ret void
} }
declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind declare void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind declare void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind declare void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind declare void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind declare void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind declare void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind declare void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind { define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
@ -285,7 +285,7 @@ define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;Check the alignment value. Max for this instruction is 32 bits: ;Check the alignment value. Max for this instruction is 32 bits:
;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32] ;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8) call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
ret void ret void
} }
@ -295,7 +295,7 @@ define void @vst4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32]! ;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32]!
%A = load i8*, i8** %ptr %A = load i8*, i8** %ptr
%tmp1 = load <8 x i8>, <8 x i8>* %B %tmp1 = load <8 x i8>, <8 x i8>* %B
call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8) call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
%tmp2 = getelementptr i8, i8* %A, i32 4 %tmp2 = getelementptr i8, i8* %A, i32 4
store i8* %tmp2, i8** %ptr store i8* %tmp2, i8** %ptr
ret void ret void
@ -306,7 +306,7 @@ define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst4.16 ;CHECK: vst4.16
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <4 x i16>, <4 x i16>* %B %tmp1 = load <4 x i16>, <4 x i16>* %B
call void @llvm.arm.neon.vst4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1) call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
ret void ret void
} }
@ -316,7 +316,7 @@ define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128] ;CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128]
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <2 x i32>, <2 x i32>* %B %tmp1 = load <2 x i32>, <2 x i32>* %B
call void @llvm.arm.neon.vst4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16) call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16)
ret void ret void
} }
@ -325,7 +325,7 @@ define void @vst4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst4.32 ;CHECK: vst4.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <2 x float>, <2 x float>* %B %tmp1 = load <2 x float>, <2 x float>* %B
call void @llvm.arm.neon.vst4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1) call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
ret void ret void
} }
@ -335,7 +335,7 @@ define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64] ;CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64]
%tmp0 = bitcast i16* %A to i8* %tmp0 = bitcast i16* %A to i8*
%tmp1 = load <8 x i16>, <8 x i16>* %B %tmp1 = load <8 x i16>, <8 x i16>* %B
call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16) call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16)
ret void ret void
} }
@ -345,7 +345,7 @@ define void @vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0] ;CHECK: vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
%tmp0 = bitcast i32* %A to i8* %tmp0 = bitcast i32* %A to i8*
%tmp1 = load <4 x i32>, <4 x i32>* %B %tmp1 = load <4 x i32>, <4 x i32>* %B
call void @llvm.arm.neon.vst4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1) call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
ret void ret void
} }
@ -354,7 +354,7 @@ define void @vst4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst4.32 ;CHECK: vst4.32
%tmp0 = bitcast float* %A to i8* %tmp0 = bitcast float* %A to i8*
%tmp1 = load <4 x float>, <4 x float>* %B %tmp1 = load <4 x float>, <4 x float>* %B
call void @llvm.arm.neon.vst4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1) call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
ret void ret void
} }
@ -365,11 +365,11 @@ define <8 x i16> @variable_insertelement(<8 x i16> %a, i16 %b, i32 %c) nounwind
ret <8 x i16> %r ret <8 x i16> %r
} }
declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind declare void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind declare void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind declare void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind declare void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind declare void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind declare void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind declare void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind

View File

@ -15,11 +15,11 @@ entry:
%6 = bitcast i32* %sp3 to <4 x i32>* ; <<4 x i32>*> [#uses=1] %6 = bitcast i32* %sp3 to <4 x i32>* ; <<4 x i32>*> [#uses=1]
%7 = load <4 x i32>, <4 x i32>* %6, align 16 ; <<4 x i32>> [#uses=1] %7 = load <4 x i32>, <4 x i32>* %6, align 16 ; <<4 x i32>> [#uses=1]
%8 = bitcast i32* %dp to i8* ; <i8*> [#uses=1] %8 = bitcast i32* %dp to i8* ; <i8*> [#uses=1]
tail call void @llvm.arm.neon.vst4.v4i32(i8* %8, <4 x i32> %1, <4 x i32> %3, <4 x i32> %5, <4 x i32> %7, i32 1) tail call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* %8, <4 x i32> %1, <4 x i32> %3, <4 x i32> %5, <4 x i32> %7, i32 1)
ret void ret void
} }
declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
@sbuf = common global [16 x i32] zeroinitializer, align 16 ; <[16 x i32]*> [#uses=5] @sbuf = common global [16 x i32] zeroinitializer, align 16 ; <[16 x i32]*> [#uses=5]
@dbuf = common global [16 x i32] zeroinitializer ; <[16 x i32]*> [#uses=2] @dbuf = common global [16 x i32] zeroinitializer ; <[16 x i32]*> [#uses=2]
@ -45,7 +45,7 @@ bb2: ; preds = %bb
%3 = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @sbuf, i32 0, i32 4) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1] %3 = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @sbuf, i32 0, i32 4) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1]
%4 = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @sbuf, i32 0, i32 8) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1] %4 = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @sbuf, i32 0, i32 8) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1]
%5 = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @sbuf, i32 0, i32 12) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1] %5 = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @sbuf, i32 0, i32 12) to <4 x i32>*), align 16 ; <<4 x i32>> [#uses=1]
tail call void @llvm.arm.neon.vst4.v4i32(i8* bitcast ([16 x i32]* @dbuf to i8*), <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, i32 1) nounwind tail call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* bitcast ([16 x i32]* @dbuf to i8*), <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, i32 1) nounwind
ret i32 0 ret i32 0
} }
@ -53,15 +53,15 @@ bb2: ; preds = %bb
; Make sure the DPair register class can spill. ; Make sure the DPair register class can spill.
define void @pr12389(i8* %p) nounwind ssp { define void @pr12389(i8* %p) nounwind ssp {
entry: entry:
%vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %p, i32 1) %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %p, i32 1)
tail call void asm sideeffect "", "~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15}"() nounwind tail call void asm sideeffect "", "~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15}"() nounwind
tail call void @llvm.arm.neon.vst1.v4f32(i8* %p, <4 x float> %vld1, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %vld1, i32 1)
ret void ret void
} }
declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
; <rdar://problem/11101911> ; <rdar://problem/11101911>
; When an strd is expanded into two str instructions, make sure the first str ; When an strd is expanded into two str instructions, make sure the first str

View File

@ -59,10 +59,10 @@ bb1:
%indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ] %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ]
%tmp1 = shl i32 %indvar, 2 %tmp1 = shl i32 %indvar, 2
%gep1 = getelementptr i8, i8* %ptr1, i32 %tmp1 %gep1 = getelementptr i8, i8* %ptr1, i32 %tmp1
%tmp2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %gep1, i32 1) %tmp2 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %gep1, i32 1)
%tmp3 = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> %tmp2) %tmp3 = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> %tmp2)
%gep2 = getelementptr i8, i8* %ptr2, i32 %tmp1 %gep2 = getelementptr i8, i8* %ptr2, i32 %tmp1
call void @llvm.arm.neon.vst1.v4f32(i8* %gep2, <4 x float> %tmp3, i32 1) call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %gep2, <4 x float> %tmp3, i32 1)
%indvar.next = add i32 %indvar, 1 %indvar.next = add i32 %indvar, 1
%cond = icmp eq i32 %indvar.next, 10 %cond = icmp eq i32 %indvar.next, 10
br i1 %cond, label %bb2, label %bb1 br i1 %cond, label %bb2, label %bb1
@ -73,9 +73,9 @@ bb2:
; CHECK-NOT: LCPI1_0: ; CHECK-NOT: LCPI1_0:
declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone

View File

@ -7,7 +7,7 @@
%quux = type { i32 (...)**, %baz*, i32 } %quux = type { i32 (...)**, %baz*, i32 }
%quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo } %quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo }
declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
define void @aaa(%quuz* %this, i8* %block) { define void @aaa(%quuz* %this, i8* %block) {
; CHECK-LABEL: aaa: ; CHECK-LABEL: aaa:
@ -18,30 +18,30 @@ entry:
%aligned_vec = alloca <4 x float>, align 16 %aligned_vec = alloca <4 x float>, align 16
%"alloca point" = bitcast i32 0 to i32 %"alloca point" = bitcast i32 0 to i32
%vecptr = bitcast <4 x float>* %aligned_vec to i8* %vecptr = bitcast <4 x float>* %aligned_vec to i8*
%0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %vecptr, i32 1) nounwind %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %vecptr, i32 1) nounwind
store float 6.300000e+01, float* undef, align 4 store float 6.300000e+01, float* undef, align 4
%1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1] %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1] %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
%ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind %ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* undef, i32 1) nounwind
store float 0.000000e+00, float* undef, align 4 store float 0.000000e+00, float* undef, align 4
%val173 = load <4 x float>, <4 x float>* undef ; <<4 x float>> [#uses=1] %val173 = load <4 x float>, <4 x float>* undef ; <<4 x float>> [#uses=1]
br label %bb4 br label %bb4

View File

@ -6,12 +6,12 @@
;CHECK: bx ;CHECK: bx
define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) { define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) {
entry: entry:
%vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1) %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %bar, i32 1)
%and = and i32 %avail, 1 %and = and i32 %avail, 1
%tobool = icmp eq i32 %and, 0 %tobool = icmp eq i32 %and, 0
%vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer %vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer
ret <16 x i8> %vld1. ret <16 x i8> %vld1.
} }
declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 ) declare <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* , i32 )

View File

@ -3,8 +3,8 @@
; The alignment arguments for NEON load/store intrinsics can be increased ; The alignment arguments for NEON load/store intrinsics can be increased
; by instcombine. Check for this. ; by instcombine. Check for this.
; CHECK: vld4.v2i32({{.*}}, i32 32) ; CHECK: vld4.v2i32.p0i8({{.*}}, i32 32)
; CHECK: vst4.v2i32({{.*}}, i32 16) ; CHECK: vst4.p0i8.v2i32({{.*}}, i32 16)
@x = common global [8 x i32] zeroinitializer, align 32 @x = common global [8 x i32] zeroinitializer, align 32
@y = common global [8 x i32] zeroinitializer, align 16 @y = common global [8 x i32] zeroinitializer, align 16
@ -12,14 +12,14 @@
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
define void @test() nounwind ssp { define void @test() nounwind ssp {
%tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* bitcast ([8 x i32]* @x to i8*), i32 1) %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8* bitcast ([8 x i32]* @x to i8*), i32 1)
%tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0 %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
%tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 1 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 1
%tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2 %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2
%tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 3 %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 3
call void @llvm.arm.neon.vst4.v2i32(i8* bitcast ([8 x i32]* @y to i8*), <2 x i32> %tmp2, <2 x i32> %tmp3, <2 x i32> %tmp4, <2 x i32> %tmp5, i32 1) call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* bitcast ([8 x i32]* @y to i8*), <2 x i32> %tmp2, <2 x i32> %tmp3, <2 x i32> %tmp4, <2 x i32> %tmp5, i32 1)
ret void ret void
} }
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8*, i32) nounwind readonly declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind declare void @llvm.arm.neon.vst4.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind

View File

@ -239,33 +239,33 @@ define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i
%counter.04 = phi i32 [ 0, %.lr.ph ], [ %44, %11 ] %counter.04 = phi i32 [ 0, %.lr.ph ], [ %44, %11 ]
%result.03 = phi <16 x i8> [ zeroinitializer, %.lr.ph ], [ %41, %11 ] %result.03 = phi <16 x i8> [ zeroinitializer, %.lr.ph ], [ %41, %11 ]
%.012 = phi <16 x i8>* [ %data, %.lr.ph ], [ %43, %11 ] %.012 = phi <16 x i8>* [ %data, %.lr.ph ], [ %43, %11 ]
%12 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %.05, i32 1) nounwind %12 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %.05, i32 1) nounwind
%13 = getelementptr inbounds i8, i8* %.05, i32 %ref_stride %13 = getelementptr inbounds i8, i8* %.05, i32 %ref_stride
%14 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %13, i32 1) nounwind %14 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %13, i32 1) nounwind
%15 = shufflevector <1 x i64> %12, <1 x i64> %14, <2 x i32> <i32 0, i32 1> %15 = shufflevector <1 x i64> %12, <1 x i64> %14, <2 x i32> <i32 0, i32 1>
%16 = bitcast <2 x i64> %15 to <16 x i8> %16 = bitcast <2 x i64> %15 to <16 x i8>
%17 = getelementptr inbounds <16 x i8>, <16 x i8>* %.012, i32 1 %17 = getelementptr inbounds <16 x i8>, <16 x i8>* %.012, i32 1
store <16 x i8> %16, <16 x i8>* %.012, align 4 store <16 x i8> %16, <16 x i8>* %.012, align 4
%18 = getelementptr inbounds i8, i8* %.05, i32 %2 %18 = getelementptr inbounds i8, i8* %.05, i32 %2
%19 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %18, i32 1) nounwind %19 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %18, i32 1) nounwind
%20 = getelementptr inbounds i8, i8* %.05, i32 %3 %20 = getelementptr inbounds i8, i8* %.05, i32 %3
%21 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %20, i32 1) nounwind %21 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %20, i32 1) nounwind
%22 = shufflevector <1 x i64> %19, <1 x i64> %21, <2 x i32> <i32 0, i32 1> %22 = shufflevector <1 x i64> %19, <1 x i64> %21, <2 x i32> <i32 0, i32 1>
%23 = bitcast <2 x i64> %22 to <16 x i8> %23 = bitcast <2 x i64> %22 to <16 x i8>
%24 = getelementptr inbounds <16 x i8>, <16 x i8>* %.012, i32 2 %24 = getelementptr inbounds <16 x i8>, <16 x i8>* %.012, i32 2
store <16 x i8> %23, <16 x i8>* %17, align 4 store <16 x i8> %23, <16 x i8>* %17, align 4
%25 = getelementptr inbounds i8, i8* %.05, i32 %4 %25 = getelementptr inbounds i8, i8* %.05, i32 %4
%26 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %25, i32 1) nounwind %26 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %25, i32 1) nounwind
%27 = getelementptr inbounds i8, i8* %.05, i32 %5 %27 = getelementptr inbounds i8, i8* %.05, i32 %5
%28 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %27, i32 1) nounwind %28 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %27, i32 1) nounwind
%29 = shufflevector <1 x i64> %26, <1 x i64> %28, <2 x i32> <i32 0, i32 1> %29 = shufflevector <1 x i64> %26, <1 x i64> %28, <2 x i32> <i32 0, i32 1>
%30 = bitcast <2 x i64> %29 to <16 x i8> %30 = bitcast <2 x i64> %29 to <16 x i8>
%31 = getelementptr inbounds <16 x i8>, <16 x i8>* %.012, i32 3 %31 = getelementptr inbounds <16 x i8>, <16 x i8>* %.012, i32 3
store <16 x i8> %30, <16 x i8>* %24, align 4 store <16 x i8> %30, <16 x i8>* %24, align 4
%32 = getelementptr inbounds i8, i8* %.05, i32 %6 %32 = getelementptr inbounds i8, i8* %.05, i32 %6
%33 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %32, i32 1) nounwind %33 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %32, i32 1) nounwind
%34 = getelementptr inbounds i8, i8* %.05, i32 %7 %34 = getelementptr inbounds i8, i8* %.05, i32 %7
%35 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %34, i32 1) nounwind %35 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %34, i32 1) nounwind
%36 = shufflevector <1 x i64> %33, <1 x i64> %35, <2 x i32> <i32 0, i32 1> %36 = shufflevector <1 x i64> %33, <1 x i64> %35, <2 x i32> <i32 0, i32 1>
%37 = bitcast <2 x i64> %36 to <16 x i8> %37 = bitcast <2 x i64> %36 to <16 x i8>
store <16 x i8> %37, <16 x i8>* %31, align 4 store <16 x i8> %37, <16 x i8>* %31, align 4
@ -290,7 +290,7 @@ define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i
ret void ret void
} }
declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32) nounwind readonly declare <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8*, i32) nounwind readonly
; Handle chains in which the same offset is used for both loads and ; Handle chains in which the same offset is used for both loads and
; stores to the same array. ; stores to the same array.
@ -328,32 +328,32 @@ for.body: ; preds = %for.body, %entry
%i.0110 = phi i32 [ 0, %entry ], [ %inc, %for.body ] %i.0110 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%src.addr = phi i8* [ %src, %entry ], [ %add.ptr45, %for.body ] %src.addr = phi i8* [ %src, %entry ], [ %add.ptr45, %for.body ]
%add.ptr = getelementptr inbounds i8, i8* %src.addr, i32 %idx.neg %add.ptr = getelementptr inbounds i8, i8* %src.addr, i32 %idx.neg
%vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr, i32 1) %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %add.ptr, i32 1)
%add.ptr3 = getelementptr inbounds i8, i8* %src.addr, i32 %idx.neg2 %add.ptr3 = getelementptr inbounds i8, i8* %src.addr, i32 %idx.neg2
%vld2 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr3, i32 1) %vld2 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %add.ptr3, i32 1)
%add.ptr7 = getelementptr inbounds i8, i8* %src.addr, i32 %idx.neg6 %add.ptr7 = getelementptr inbounds i8, i8* %src.addr, i32 %idx.neg6
%vld3 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr7, i32 1) %vld3 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %add.ptr7, i32 1)
%add.ptr11 = getelementptr inbounds i8, i8* %src.addr, i32 %idx.neg10 %add.ptr11 = getelementptr inbounds i8, i8* %src.addr, i32 %idx.neg10
%vld4 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr11, i32 1) %vld4 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %add.ptr11, i32 1)
%vld5 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %src.addr, i32 1) %vld5 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %src.addr, i32 1)
%add.ptr17 = getelementptr inbounds i8, i8* %src.addr, i32 %stride %add.ptr17 = getelementptr inbounds i8, i8* %src.addr, i32 %stride
%vld6 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr17, i32 1) %vld6 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %add.ptr17, i32 1)
%add.ptr20 = getelementptr inbounds i8, i8* %src.addr, i32 %mul5 %add.ptr20 = getelementptr inbounds i8, i8* %src.addr, i32 %mul5
%vld7 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr20, i32 1) %vld7 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %add.ptr20, i32 1)
%add.ptr23 = getelementptr inbounds i8, i8* %src.addr, i32 %mul1 %add.ptr23 = getelementptr inbounds i8, i8* %src.addr, i32 %mul1
%vld8 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr23, i32 1) %vld8 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %add.ptr23, i32 1)
%vadd1 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld1, <8 x i8> %vld2) nounwind %vadd1 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld1, <8 x i8> %vld2) nounwind
%vadd2 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld2, <8 x i8> %vld3) nounwind %vadd2 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld2, <8 x i8> %vld3) nounwind
%vadd3 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld3, <8 x i8> %vld4) nounwind %vadd3 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld3, <8 x i8> %vld4) nounwind
%vadd4 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld4, <8 x i8> %vld5) nounwind %vadd4 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld4, <8 x i8> %vld5) nounwind
%vadd5 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld5, <8 x i8> %vld6) nounwind %vadd5 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld5, <8 x i8> %vld6) nounwind
%vadd6 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld6, <8 x i8> %vld7) nounwind %vadd6 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld6, <8 x i8> %vld7) nounwind
tail call void @llvm.arm.neon.vst1.v8i8(i8* %add.ptr3, <8 x i8> %vadd1, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %add.ptr3, <8 x i8> %vadd1, i32 1)
tail call void @llvm.arm.neon.vst1.v8i8(i8* %add.ptr7, <8 x i8> %vadd2, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %add.ptr7, <8 x i8> %vadd2, i32 1)
tail call void @llvm.arm.neon.vst1.v8i8(i8* %add.ptr11, <8 x i8> %vadd3, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %add.ptr11, <8 x i8> %vadd3, i32 1)
tail call void @llvm.arm.neon.vst1.v8i8(i8* %src.addr, <8 x i8> %vadd4, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %src.addr, <8 x i8> %vadd4, i32 1)
tail call void @llvm.arm.neon.vst1.v8i8(i8* %add.ptr17, <8 x i8> %vadd5, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %add.ptr17, <8 x i8> %vadd5, i32 1)
tail call void @llvm.arm.neon.vst1.v8i8(i8* %add.ptr20, <8 x i8> %vadd6, i32 1) tail call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %add.ptr20, <8 x i8> %vadd6, i32 1)
%inc = add nsw i32 %i.0110, 1 %inc = add nsw i32 %i.0110, 1
%add.ptr45 = getelementptr inbounds i8, i8* %src.addr, i32 8 %add.ptr45 = getelementptr inbounds i8, i8* %src.addr, i32 8
%exitcond = icmp eq i32 %inc, 4 %exitcond = icmp eq i32 %inc, 4
@ -363,8 +363,8 @@ for.end: ; preds = %for.body
ret void ret void
} }
declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32) nounwind readonly declare <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v8i8(i8*, <8 x i8>, i32) nounwind
declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone