llvm/test/CodeGen/AArch64/fp16-vector-load-store.ll
Oliver Stannard 5e487f8dc7 Teach the AArch64 backend about v4f16 and v8f16
This teaches the AArch64 backend to deal with the operations required
to deal with the operations on v4f16 and v8f16 which are exposed by
NEON intrinsics, plus the add, sub, mul and div operations.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@216555 91177308-0d34-0410-b5e6-96231b3b80d8
2014-08-27 16:16:04 +00:00

529 lines
22 KiB
LLVM

; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
; Simple load of v4i16
define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_64:
; CHECK: ldr d0, [x0]
entry:
%0 = load <4 x half>* %a, align 8
ret <4 x half> %0
}
; Simple load of v8i16
define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_128:
; CHECK: ldr q0, [x0]
entry:
%0 = load <8 x half>* %a, align 16
ret <8 x half> %0
}
; Duplicating load to v4i16
define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_64:
; CHECK: ld1r { v0.4h }, [x0]
entry:
%0 = load half* %a, align 2
%1 = insertelement <4 x half> undef, half %0, i32 0
%2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
ret <4 x half> %2
}
; Duplicating load to v8i16
define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_128:
; CHECK: ld1r { v0.8h }, [x0]
entry:
%0 = load half* %a, align 2
%1 = insertelement <8 x half> undef, half %0, i32 0
%2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
ret <8 x half> %2
}
; Load to one lane of v4f16
define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
; CHECK-LABEL: load_lane_64:
; CHECK: ld1 { v0.h }[2], [x0]
entry:
%0 = load half* %a, align 2
%1 = insertelement <4 x half> %b, half %0, i32 2
ret <4 x half> %1
}
; Load to one lane of v8f16
define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
; CHECK-LABEL: load_lane_128:
; CHECK: ld1 { v0.h }[5], [x0]
entry:
%0 = load half* %a, align 2
%1 = insertelement <8 x half> %b, half %0, i32 5
ret <8 x half> %1
}
; Simple store of v4f16
define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_64:
; CHECK: str d0, [x0]
entry:
store <4 x half> %b, <4 x half>* %a, align 8
ret void
}
; Simple store of v8f16
define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_128:
; CHECK: str q0, [x0]
entry:
store <8 x half> %b, <8 x half>* %a, align 16
ret void
}
; Store from one lane of v4f16
define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_lane_64:
; CHECK: st1 { v0.h }[2], [x0]
entry:
%0 = extractelement <4 x half> %b, i32 2
store half %0, half* %a, align 2
ret void
}
; Store from one lane of v8f16
define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_lane_128:
; CHECK: st1 { v0.h }[5], [x0]
entry:
%0 = extractelement <8 x half> %b, i32 5
store half %0, half* %a, align 2
ret void
}
; NEON intrinsics - (de-)interleaving loads and stores
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
; Load 2 x v4f16 with de-interleaving
define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_2:
; CHECK: ld2 { v0.4h, v1.4h }, [x0]
entry:
%0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
ret { <4 x half>, <4 x half> } %0
}
; Load 3 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_3:
; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
%0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
ret { <4 x half>, <4 x half>, <4 x half> } %0
}
; Load 4 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_4:
; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
%0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}
; Store 2 x v4f16 with interleaving
define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_interleave_64_2:
; CHECK: st2 { v0.4h, v1.4h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
ret void
}
; Store 3 x v4f16 with interleaving
define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_interleave_64_3:
; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
ret void
}
; Store 4 x v4f16 with interleaving
define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_interleave_64_4:
; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
ret void
}
; Load 2 x v8f16 with de-interleaving
define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_2:
; CHECK: ld2 { v0.8h, v1.8h }, [x0]
entry:
%0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
ret { <8 x half>, <8 x half> } %0
}
; Load 3 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_3:
; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
%0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
ret { <8 x half>, <8 x half>, <8 x half> } %0
}
; Load 8 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_4:
; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
%0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}
; Store 2 x v8f16 with interleaving
define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_interleave_128_2:
; CHECK: st2 { v0.8h, v1.8h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
ret void
}
; Store 3 x v8f16 with interleaving
define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_interleave_128_3:
; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
ret void
}
; Store 8 x v8f16 with interleaving
define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_interleave_128_4:
; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
ret void
}
; NEON intrinsics - duplicating loads
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)
; Load 2 x v4f16 with duplication
define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
; CHECK-LABEL: load_dup_64_2:
; CHECK: ld2r { v0.4h, v1.4h }, [x0]
entry:
%0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
ret { <4 x half>, <4 x half> } %0
}
; Load 3 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
; CHECK-LABEL: load_dup_64_3:
; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
entry:
%0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
ret { <4 x half>, <4 x half>, <4 x half> } %0
}
; Load 4 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
; CHECK-LABEL: load_dup_64_4:
; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
%0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}
; Load 2 x v8f16 with duplication
define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
; CHECK-LABEL: load_dup_128_2:
; CHECK: ld2r { v0.8h, v1.8h }, [x0]
entry:
%0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
ret { <8 x half>, <8 x half> } %0
}
; Load 3 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
; CHECK-LABEL: load_dup_128_3:
; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
entry:
%0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
ret { <8 x half>, <8 x half>, <8 x half> } %0
}
; Load 8 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
; CHECK-LABEL: load_dup_128_4:
; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
%0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}
; NEON intrinsics - loads and stores to/from one lane
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
; Load one lane of 2 x v4f16
define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: load_lane_64_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
%0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
ret { <4 x half>, <4 x half> } %0
}
; Load one lane of 3 x v4f16
define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: load_lane_64_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
%0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
ret { <4 x half>, <4 x half>, <4 x half> } %0
}
; Load one lane of 4 x v4f16
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: load_lane_64_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
%0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}
; Store one lane of 2 x v4f16
define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_lane_64_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
ret void
}
; Store one lane of 3 x v4f16
define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_lane_64_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
ret void
}
; Store one lane of 4 x v4f16
define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_lane_64_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
ret void
}
; Load one lane of 2 x v8f16
define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: load_lane_128_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
%0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
ret { <8 x half>, <8 x half> } %0
}
; Load one lane of 3 x v8f16
define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: load_lane_128_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
%0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
ret { <8 x half>, <8 x half>, <8 x half> } %0
}
; Load one lane of 8 x v8f16
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: load_lane_128_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
%0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}
; Store one lane of 2 x v8f16
define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_lane_128_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
ret void
}
; Store one lane of 3 x v8f16
define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_lane_128_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
ret void
}
; Store one lane of 8 x v8f16
define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_lane_128_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
ret void
}
; NEON intrinsics - load/store without interleaving
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
; Load 2 x v4f16 without de-interleaving
define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_2:
; CHECK: ld1 { v0.4h, v1.4h }, [x0]
entry:
%0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
ret { <4 x half>, <4 x half> } %0
}
; Load 3 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_3:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
%0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
ret { <4 x half>, <4 x half>, <4 x half> } %0
}
; Load 4 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_4:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
%0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}
; Store 2 x v4f16 without interleaving
define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_64_2:
; CHECK: st1 { v0.4h, v1.4h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
ret void
}
; Store 3 x v4f16 without interleaving
define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_64_3:
; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
ret void
}
; Store 4 x v4f16 without interleaving
define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_64_4:
; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
ret void
}
; Load 2 x v8f16 without de-interleaving
define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_2:
; CHECK: ld1 { v0.8h, v1.8h }, [x0]
entry:
%0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
ret { <8 x half>, <8 x half> } %0
}
; Load 3 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_3:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
%0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
ret { <8 x half>, <8 x half>, <8 x half> } %0
}
; Load 8 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_4:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
%0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}
; Store 2 x v8f16 without interleaving
define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_128_2:
; CHECK: st1 { v0.8h, v1.8h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
ret void
}
; Store 3 x v8f16 without interleaving
define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_128_3:
; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
ret void
}
; Store 8 x v8f16 without interleaving
define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_128_4:
; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
ret void
}