mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-12-02 16:36:40 +00:00
32ec7fd07b
When storing the 0th lane of a vector, use a simpler and usually more efficient scalar store instead. In this case, also using the unscaled offset. Differential revision: https://reviews.llvm.org/D46762 llvm-svn: 332394
607 lines
24 KiB
LLVM
607 lines
24 KiB
LLVM
; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
|
|
|
|
; Simple load of v4i16
|
|
define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
|
|
; CHECK-LABEL: load_64:
|
|
; CHECK: ldr d0, [x0]
|
|
entry:
|
|
%0 = load <4 x half>, <4 x half>* %a, align 8
|
|
ret <4 x half> %0
|
|
}
|
|
|
|
; Simple load of v8i16
|
|
define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
|
|
; CHECK-LABEL: load_128:
|
|
; CHECK: ldr q0, [x0]
|
|
entry:
|
|
%0 = load <8 x half>, <8 x half>* %a, align 16
|
|
ret <8 x half> %0
|
|
}
|
|
|
|
; Duplicating load to v4i16
|
|
define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
|
|
; CHECK-LABEL: load_dup_64:
|
|
; CHECK: ld1r { v0.4h }, [x0]
|
|
entry:
|
|
%0 = load half, half* %a, align 2
|
|
%1 = insertelement <4 x half> undef, half %0, i32 0
|
|
%2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
|
|
ret <4 x half> %2
|
|
}
|
|
|
|
; Duplicating load to v8i16
|
|
define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
|
|
; CHECK-LABEL: load_dup_128:
|
|
; CHECK: ld1r { v0.8h }, [x0]
|
|
entry:
|
|
%0 = load half, half* %a, align 2
|
|
%1 = insertelement <8 x half> undef, half %0, i32 0
|
|
%2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
|
|
ret <8 x half> %2
|
|
}
|
|
|
|
; Load to one lane of v4f16
|
|
define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
|
|
; CHECK-LABEL: load_lane_64:
|
|
; CHECK: ld1 { v0.h }[2], [x0]
|
|
entry:
|
|
%0 = load half, half* %a, align 2
|
|
%1 = insertelement <4 x half> %b, half %0, i32 2
|
|
ret <4 x half> %1
|
|
}
|
|
|
|
; Load to one lane of v8f16
|
|
define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
|
|
; CHECK-LABEL: load_lane_128:
|
|
; CHECK: ld1 { v0.h }[5], [x0]
|
|
entry:
|
|
%0 = load half, half* %a, align 2
|
|
%1 = insertelement <8 x half> %b, half %0, i32 5
|
|
ret <8 x half> %1
|
|
}
|
|
|
|
; Simple store of v4f16
|
|
define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
|
|
; CHECK-LABEL: store_64:
|
|
; CHECK: str d0, [x0]
|
|
entry:
|
|
store <4 x half> %b, <4 x half>* %a, align 8
|
|
ret void
|
|
}
|
|
|
|
; Simple store of v8f16
|
|
define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
|
|
; CHECK-LABEL: store_128:
|
|
; CHECK: str q0, [x0]
|
|
entry:
|
|
store <8 x half> %b, <8 x half>* %a, align 16
|
|
ret void
|
|
}
|
|
|
|
; Store from one lane of v4f16
|
|
define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
|
|
; CHECK-LABEL: store_lane_64:
|
|
; CHECK: st1 { v0.h }[2], [x0]
|
|
entry:
|
|
%0 = extractelement <4 x half> %b, i32 2
|
|
store half %0, half* %a, align 2
|
|
ret void
|
|
}
|
|
|
|
define void @store_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
|
|
; CHECK-LABEL: store_lane0_64:
|
|
; CHECK: str h0, [x0]
|
|
entry:
|
|
%0 = extractelement <4 x half> %b, i32 0
|
|
store half %0, half* %a, align 2
|
|
ret void
|
|
}
|
|
|
|
define void @storeu_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
|
|
; CHECK-LABEL: storeu_lane0_64:
|
|
; CHECK: stur h0, [x{{[0-9]+}}, #-2]
|
|
entry:
|
|
%0 = getelementptr half, half* %a, i64 -1
|
|
%1 = extractelement <4 x half> %b, i32 0
|
|
store half %1, half* %0, align 2
|
|
ret void
|
|
}
|
|
|
|
define void @storero_lane_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
|
|
; CHECK-LABEL: storero_lane_64:
|
|
; CHECK: st1 { v0.h }[2], [x{{[0-9]+}}]
|
|
entry:
|
|
%0 = getelementptr half, half* %a, i64 %c
|
|
%1 = extractelement <4 x half> %b, i32 2
|
|
store half %1, half* %0, align 2
|
|
ret void
|
|
}
|
|
|
|
define void @storero_lane0_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
|
|
; CHECK-LABEL: storero_lane0_64:
|
|
; CHECK: str h0, [x0, x1, lsl #1]
|
|
entry:
|
|
%0 = getelementptr half, half* %a, i64 %c
|
|
%1 = extractelement <4 x half> %b, i32 0
|
|
store half %1, half* %0, align 2
|
|
ret void
|
|
}
|
|
|
|
; Store from one lane of v8f16
|
|
define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
|
|
; CHECK-LABEL: store_lane_128:
|
|
; CHECK: st1 { v0.h }[5], [x0]
|
|
entry:
|
|
%0 = extractelement <8 x half> %b, i32 5
|
|
store half %0, half* %a, align 2
|
|
ret void
|
|
}
|
|
|
|
define void @store_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
|
|
; CHECK-LABEL: store_lane0_128:
|
|
; CHECK: str h0, [x0]
|
|
entry:
|
|
%0 = extractelement <8 x half> %b, i32 0
|
|
store half %0, half* %a, align 2
|
|
ret void
|
|
}
|
|
|
|
define void @storeu_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
|
|
; CHECK-LABEL: storeu_lane0_128:
|
|
; CHECK: stur h0, [x{{[0-9]+}}, #-2]
|
|
entry:
|
|
%0 = getelementptr half, half* %a, i64 -1
|
|
%1 = extractelement <8 x half> %b, i32 0
|
|
store half %1, half* %0, align 2
|
|
ret void
|
|
}
|
|
|
|
define void @storero_lane_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
|
|
; CHECK-LABEL: storero_lane_128:
|
|
; CHECK: st1 { v0.h }[4], [x{{[0-9]+}}]
|
|
entry:
|
|
%0 = getelementptr half, half* %a, i64 %c
|
|
%1 = extractelement <8 x half> %b, i32 4
|
|
store half %1, half* %0, align 2
|
|
ret void
|
|
}
|
|
|
|
define void @storero_lane0_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
|
|
; CHECK-LABEL: storero_lane0_128:
|
|
; CHECK: str h0, [x0, x1, lsl #1]
|
|
entry:
|
|
%0 = getelementptr half, half* %a, i64 %c
|
|
%1 = extractelement <8 x half> %b, i32 0
|
|
store half %1, half* %0, align 2
|
|
ret void
|
|
}
|
|
|
|
; NEON intrinsics - (de-)interleaving loads and stores
|
|
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
|
|
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
|
|
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
|
|
declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
|
|
declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
|
|
declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
|
|
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
|
|
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
|
|
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
|
|
declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
|
|
declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
|
|
declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
|
|
|
|
; Load 2 x v4f16 with de-interleaving
|
|
define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_interleave_64_2:
|
|
; CHECK: ld2 { v0.4h, v1.4h }, [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
|
|
ret { <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Load 3 x v4f16 with de-interleaving
|
|
define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_interleave_64_3:
|
|
; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
|
|
ret { <4 x half>, <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Load 4 x v4f16 with de-interleaving
|
|
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_interleave_64_4:
|
|
; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
|
|
ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Store 2 x v4f16 with interleaving
|
|
define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
|
|
; CHECK-LABEL: store_interleave_64_2:
|
|
; CHECK: st2 { v0.4h, v1.4h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store 3 x v4f16 with interleaving
|
|
define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
|
|
; CHECK-LABEL: store_interleave_64_3:
|
|
; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store 4 x v4f16 with interleaving
|
|
define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
|
|
; CHECK-LABEL: store_interleave_64_4:
|
|
; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; Load 2 x v8f16 with de-interleaving
|
|
define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_interleave_128_2:
|
|
; CHECK: ld2 { v0.8h, v1.8h }, [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
|
|
ret { <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Load 3 x v8f16 with de-interleaving
|
|
define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_interleave_128_3:
|
|
; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
|
|
ret { <8 x half>, <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Load 8 x v8f16 with de-interleaving
|
|
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_interleave_128_4:
|
|
; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
|
|
ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Store 2 x v8f16 with interleaving
|
|
define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
|
|
; CHECK-LABEL: store_interleave_128_2:
|
|
; CHECK: st2 { v0.8h, v1.8h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store 3 x v8f16 with interleaving
|
|
define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
|
|
; CHECK-LABEL: store_interleave_128_3:
|
|
; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store 8 x v8f16 with interleaving
|
|
define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
|
|
; CHECK-LABEL: store_interleave_128_4:
|
|
; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; NEON intrinsics - duplicating loads
|
|
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
|
|
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
|
|
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
|
|
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
|
|
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
|
|
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)
|
|
|
|
; Load 2 x v4f16 with duplication
|
|
define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
|
|
; CHECK-LABEL: load_dup_64_2:
|
|
; CHECK: ld2r { v0.4h, v1.4h }, [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
|
|
ret { <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Load 3 x v4f16 with duplication
|
|
define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
|
|
; CHECK-LABEL: load_dup_64_3:
|
|
; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
|
|
ret { <4 x half>, <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Load 4 x v4f16 with duplication
|
|
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
|
|
; CHECK-LABEL: load_dup_64_4:
|
|
; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
|
|
ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Load 2 x v8f16 with duplication
|
|
define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
|
|
; CHECK-LABEL: load_dup_128_2:
|
|
; CHECK: ld2r { v0.8h, v1.8h }, [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
|
|
ret { <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Load 3 x v8f16 with duplication
|
|
define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
|
|
; CHECK-LABEL: load_dup_128_3:
|
|
; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
|
|
ret { <8 x half>, <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Load 8 x v8f16 with duplication
|
|
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
|
|
; CHECK-LABEL: load_dup_128_4:
|
|
; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
|
|
ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
|
|
; NEON intrinsics - loads and stores to/from one lane
|
|
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
|
|
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
|
|
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
|
|
declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
|
|
declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
|
|
declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
|
|
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
|
|
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
|
|
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
|
|
declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
|
|
declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
|
|
declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
|
|
|
|
; Load one lane of 2 x v4f16
|
|
define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
|
|
; CHECK-LABEL: load_lane_64_2:
|
|
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
|
|
ret { <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Load one lane of 3 x v4f16
|
|
define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
|
|
; CHECK-LABEL: load_lane_64_3:
|
|
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
|
|
ret { <4 x half>, <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Load one lane of 4 x v4f16
|
|
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
|
|
; CHECK-LABEL: load_lane_64_4:
|
|
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
|
|
ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Store one lane of 2 x v4f16
|
|
define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
|
|
; CHECK-LABEL: store_lane_64_2:
|
|
; CHECK: st2 { v0.h, v1.h }[2], [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store one lane of 3 x v4f16
|
|
define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
|
|
; CHECK-LABEL: store_lane_64_3:
|
|
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store one lane of 4 x v4f16
|
|
define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
|
|
; CHECK-LABEL: store_lane_64_4:
|
|
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
|
|
ret void
|
|
}
|
|
|
|
; Load one lane of 2 x v8f16
|
|
define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
|
|
; CHECK-LABEL: load_lane_128_2:
|
|
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
|
|
ret { <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Load one lane of 3 x v8f16
|
|
define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
|
|
; CHECK-LABEL: load_lane_128_3:
|
|
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
|
|
ret { <8 x half>, <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Load one lane of 8 x v8f16
|
|
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
|
|
; CHECK-LABEL: load_lane_128_4:
|
|
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
|
|
ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Store one lane of 2 x v8f16
|
|
define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
|
|
; CHECK-LABEL: store_lane_128_2:
|
|
; CHECK: st2 { v0.h, v1.h }[2], [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store one lane of 3 x v8f16
|
|
define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
|
|
; CHECK-LABEL: store_lane_128_3:
|
|
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store one lane of 8 x v8f16
|
|
define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
|
|
; CHECK-LABEL: store_lane_128_4:
|
|
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
|
|
ret void
|
|
}
|
|
|
|
; NEON intrinsics - load/store without interleaving
|
|
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
|
|
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
|
|
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
|
|
declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
|
|
declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
|
|
declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
|
|
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
|
|
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
|
|
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
|
|
declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
|
|
declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
|
|
declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
|
|
|
|
; Load 2 x v4f16 without de-interleaving
|
|
define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_64_2:
|
|
; CHECK: ld1 { v0.4h, v1.4h }, [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
|
|
ret { <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Load 3 x v4f16 without de-interleaving
|
|
define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_64_3:
|
|
; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
|
|
ret { <4 x half>, <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Load 4 x v4f16 without de-interleaving
|
|
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_64_4:
|
|
; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
|
|
entry:
|
|
%0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
|
|
ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
|
|
}
|
|
|
|
; Store 2 x v4f16 without interleaving
|
|
define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
|
|
; CHECK-LABEL: store_64_2:
|
|
; CHECK: st1 { v0.4h, v1.4h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store 3 x v4f16 without interleaving
|
|
define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
|
|
; CHECK-LABEL: store_64_3:
|
|
; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store 4 x v4f16 without interleaving
|
|
define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
|
|
; CHECK-LABEL: store_64_4:
|
|
; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; Load 2 x v8f16 without de-interleaving
|
|
define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_128_2:
|
|
; CHECK: ld1 { v0.8h, v1.8h }, [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
|
|
ret { <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Load 3 x v8f16 without de-interleaving
|
|
define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_128_3:
|
|
; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
|
|
ret { <8 x half>, <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Load 8 x v8f16 without de-interleaving
|
|
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
|
|
; CHECK-LABEL: load_128_4:
|
|
; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
|
|
entry:
|
|
%0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
|
|
ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
|
|
}
|
|
|
|
; Store 2 x v8f16 without interleaving
|
|
define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
|
|
; CHECK-LABEL: store_128_2:
|
|
; CHECK: st1 { v0.8h, v1.8h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store 3 x v8f16 without interleaving
|
|
define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
|
|
; CHECK-LABEL: store_128_3:
|
|
; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
|
|
ret void
|
|
}
|
|
|
|
; Store 8 x v8f16 without interleaving
|
|
define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
|
|
; CHECK-LABEL: store_128_4:
|
|
; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
|
|
entry:
|
|
tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
|
|
ret void
|
|
}
|