mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-05 10:39:21 +00:00
8689a52c10
callee-saved registers at the end of the lists. Also prefer to avoid using the low registers that are in register subclasses required by certain instructions, so that those registers will more likely be available when needed. This change makes a huge improvement in spilling in some cases. Thanks to Jakob for helping me realize the problem. Most of this patch is fixing the testsuite. There are quite a few places where we're checking for specific registers. I changed those to wildcards in places where that doesn't weaken the tests. The spill-q.ll and thumb2-spill-q.ll tests stopped spilling with this change, so I added a bunch of live values to force spills on those tests. llvm-svn: 116055
213 lines
6.3 KiB
LLVM
213 lines
6.3 KiB
LLVM
; Code-generation test for NEON lane moves: each function below extracts or
; inserts lane 1 of a vector, and the FileCheck directives inside it name the
; vmov form expected in the llc output.
;
; The lit run line below feeds this file to llc with the NEON feature enabled
; and pipes the generated assembly into FileCheck, which reads its directives
; from this same file.
; RUN: llc < %s -mattr=+neon | FileCheck %s
|
|
; Data layout string: 'e' selects little-endian and 'p:32:32:32' gives 32-bit
; pointers; the remaining fields set per-type alignments.
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
|
|
; The triple is what selects the target for llc; the run line passes no
; explicit -march or -mtriple option.
target triple = "thumbv7-elf"
|
|
|
|
; Loads an <8 x i8> through %A, takes lane 1 and sign-extends it to i32.
; The directives expect the signed 8-bit lane move after the function label.
define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
;CHECK: vget_lanes8:
;CHECK: vmov.s8
  %vec = load <8 x i8>* %A
  %lane1 = extractelement <8 x i8> %vec, i32 1
  %widened = sext i8 %lane1 to i32
  ret i32 %widened
}
|
|
|
|
; Loads a <4 x i16> through %A, takes lane 1 and sign-extends it to i32.
; The directives expect the signed 16-bit lane move after the function label.
define i32 @vget_lanes16(<4 x i16>* %A) nounwind {
;CHECK: vget_lanes16:
;CHECK: vmov.s16
  %vec = load <4 x i16>* %A
  %lane1 = extractelement <4 x i16> %vec, i32 1
  %widened = sext i16 %lane1 to i32
  ret i32 %widened
}
|
|
|
|
; Loads an <8 x i8> through %A, takes lane 1 and zero-extends it to i32.
; The directives expect the unsigned 8-bit lane move after the function label.
define i32 @vget_laneu8(<8 x i8>* %A) nounwind {
;CHECK: vget_laneu8:
;CHECK: vmov.u8
  %vec = load <8 x i8>* %A
  %lane1 = extractelement <8 x i8> %vec, i32 1
  %widened = zext i8 %lane1 to i32
  ret i32 %widened
}
|
|
|
|
; Loads a <4 x i16> through %A, takes lane 1 and zero-extends it to i32.
; The directives expect the unsigned 16-bit lane move after the function label.
define i32 @vget_laneu16(<4 x i16>* %A) nounwind {
;CHECK: vget_laneu16:
;CHECK: vmov.u16
  %vec = load <4 x i16>* %A
  %lane1 = extractelement <4 x i16> %vec, i32 1
  %widened = zext i16 %lane1 to i32
  ret i32 %widened
}
|
|
|
|
; Returns lane 1 of a <2 x i32> with no extension. The loaded vector is first
; added to itself so that the lane is taken from a computed value instead of
; the extraction being done directly from memory.
define i32 @vget_lanei32(<2 x i32>* %A) nounwind {
;CHECK: vget_lanei32:
;CHECK: vmov.32
  %vec = load <2 x i32>* %A
  %doubled = add <2 x i32> %vec, %vec
  %lane1 = extractelement <2 x i32> %doubled, i32 1
  ret i32 %lane1
}
|
|
|
|
; 128-bit variant: loads a <16 x i8> through %A, takes lane 1 and
; sign-extends it to i32. The directives expect the signed 8-bit lane move.
define i32 @vgetQ_lanes8(<16 x i8>* %A) nounwind {
;CHECK: vgetQ_lanes8:
;CHECK: vmov.s8
  %vec = load <16 x i8>* %A
  %lane1 = extractelement <16 x i8> %vec, i32 1
  %widened = sext i8 %lane1 to i32
  ret i32 %widened
}
|
|
|
|
; 128-bit variant: loads an <8 x i16> through %A, takes lane 1 and
; sign-extends it to i32. The directives expect the signed 16-bit lane move.
define i32 @vgetQ_lanes16(<8 x i16>* %A) nounwind {
;CHECK: vgetQ_lanes16:
;CHECK: vmov.s16
  %vec = load <8 x i16>* %A
  %lane1 = extractelement <8 x i16> %vec, i32 1
  %widened = sext i16 %lane1 to i32
  ret i32 %widened
}
|
|
|
|
; 128-bit variant: loads a <16 x i8> through %A, takes lane 1 and
; zero-extends it to i32. The directives expect the unsigned 8-bit lane move.
define i32 @vgetQ_laneu8(<16 x i8>* %A) nounwind {
;CHECK: vgetQ_laneu8:
;CHECK: vmov.u8
  %vec = load <16 x i8>* %A
  %lane1 = extractelement <16 x i8> %vec, i32 1
  %widened = zext i8 %lane1 to i32
  ret i32 %widened
}
|
|
|
|
; 128-bit variant: loads an <8 x i16> through %A, takes lane 1 and
; zero-extends it to i32. The directives expect the unsigned 16-bit lane move.
define i32 @vgetQ_laneu16(<8 x i16>* %A) nounwind {
;CHECK: vgetQ_laneu16:
;CHECK: vmov.u16
  %vec = load <8 x i16>* %A
  %lane1 = extractelement <8 x i16> %vec, i32 1
  %widened = zext i16 %lane1 to i32
  ret i32 %widened
}
|
|
|
|
; Returns lane 1 of a <4 x i32> with no extension. The loaded vector is first
; added to itself so that the lane is taken from a computed value instead of
; the extraction being done directly from memory.
define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
;CHECK: vgetQ_lanei32:
;CHECK: vmov.32
  %vec = load <4 x i32>* %A
  %doubled = add <4 x i32> %vec, %vec
  %lane1 = extractelement <4 x i32> %doubled, i32 1
  ret i32 %lane1
}
|
|
|
|
; Extracts lane 1 of a <4 x i16> held in a local stack slot and stores the
; i16 to a second local slot, using the arm_aapcs_vfpcc calling convention.
; The vector slot is loaded without a prior store in this function, and no
; extension follows the extract.
;
; The first directive pins the match to this function's label, as the other
; functions in this file do, so the instruction pattern cannot be satisfied
; by a later function's output.
define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind {
; CHECK: test_vget_laneu16:
; CHECK: vmov.u16 r0, d{{.*}}[1]
entry:
  %arg0_uint16x4_t = alloca <4 x i16>                   ; <<4 x i16>*> [#uses=1]
  %out_uint16_t = alloca i16                            ; <i16*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32                ; <i32> [#uses=0]
  %0 = load <4 x i16>* %arg0_uint16x4_t, align 8        ; <<4 x i16>> [#uses=1]
  %1 = extractelement <4 x i16> %0, i32 1               ; <i16> [#uses=1]
  store i16 %1, i16* %out_uint16_t, align 2
  br label %return

return:                                                 ; preds = %entry
  ret void
}
|
|
|
|
; Extracts lane 1 of an <8 x i8> held in a local stack slot and stores the
; i8 to a second local slot, using the arm_aapcs_vfpcc calling convention.
; The vector slot is loaded without a prior store in this function, and no
; extension follows the extract.
;
; The first directive pins the match to this function's label, as the other
; functions in this file do, so the instruction pattern cannot be satisfied
; by a later function's output.
define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind {
; CHECK: test_vget_laneu8:
; CHECK: vmov.u8 r0, d{{.*}}[1]
entry:
  %arg0_uint8x8_t = alloca <8 x i8>                     ; <<8 x i8>*> [#uses=1]
  %out_uint8_t = alloca i8                              ; <i8*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32                ; <i32> [#uses=0]
  %0 = load <8 x i8>* %arg0_uint8x8_t, align 8          ; <<8 x i8>> [#uses=1]
  %1 = extractelement <8 x i8> %0, i32 1                ; <i8> [#uses=1]
  store i8 %1, i8* %out_uint8_t, align 1
  br label %return

return:                                                 ; preds = %entry
  ret void
}
|
|
|
|
; Extracts lane 1 of an <8 x i16> held in a local stack slot and stores the
; i16 to a second local slot, using the arm_aapcs_vfpcc calling convention.
; The vector slot is loaded without a prior store in this function, and no
; extension follows the extract.
;
; The first directive pins the match to this function's label, as the other
; functions in this file do, so the instruction pattern cannot be satisfied
; by a later function's output.
define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind {
; CHECK: test_vgetQ_laneu16:
; CHECK: vmov.u16 r0, d{{.*}}[1]
entry:
  %arg0_uint16x8_t = alloca <8 x i16>                   ; <<8 x i16>*> [#uses=1]
  %out_uint16_t = alloca i16                            ; <i16*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32                ; <i32> [#uses=0]
  %0 = load <8 x i16>* %arg0_uint16x8_t, align 16       ; <<8 x i16>> [#uses=1]
  %1 = extractelement <8 x i16> %0, i32 1               ; <i16> [#uses=1]
  store i16 %1, i16* %out_uint16_t, align 2
  br label %return

return:                                                 ; preds = %entry
  ret void
}
|
|
|
|
; Extracts lane 1 of a <16 x i8> held in a local stack slot and stores the
; i8 to a second local slot, using the arm_aapcs_vfpcc calling convention.
; The vector slot is loaded without a prior store in this function, and no
; extension follows the extract.
;
; The first directive pins the match to this function's label, as the other
; functions in this file do, so the instruction pattern cannot be satisfied
; by output belonging to a different function.
define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind {
; CHECK: test_vgetQ_laneu8:
; CHECK: vmov.u8 r0, d{{.*}}[1]
entry:
  %arg0_uint8x16_t = alloca <16 x i8>                   ; <<16 x i8>*> [#uses=1]
  %out_uint8_t = alloca i8                              ; <i8*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32                ; <i32> [#uses=0]
  %0 = load <16 x i8>* %arg0_uint8x16_t, align 16       ; <<16 x i8>> [#uses=1]
  %1 = extractelement <16 x i8> %0, i32 1               ; <i8> [#uses=1]
  store i8 %1, i8* %out_uint8_t, align 1
  br label %return

return:                                                 ; preds = %entry
  ret void
}
|
|
|
|
; Loads an <8 x i8> through %A and returns it with lane 1 replaced by %B.
; The directives expect the 8-bit lane move after the function label.
define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind {
;CHECK: vset_lane8:
;CHECK: vmov.8
  %vec = load <8 x i8>* %A
  %updated = insertelement <8 x i8> %vec, i8 %B, i32 1
  ret <8 x i8> %updated
}
|
|
|
|
; Loads a <4 x i16> through %A and returns it with lane 1 replaced by %B.
; The directives expect the 16-bit lane move after the function label.
define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind {
;CHECK: vset_lane16:
;CHECK: vmov.16
  %vec = load <4 x i16>* %A
  %updated = insertelement <4 x i16> %vec, i16 %B, i32 1
  ret <4 x i16> %updated
}
|
|
|
|
; Loads a <2 x i32> through %A and returns it with lane 1 replaced by %B.
; The directives expect the 32-bit lane move after the function label.
define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind {
;CHECK: vset_lane32:
;CHECK: vmov.32
  %vec = load <2 x i32>* %A
  %updated = insertelement <2 x i32> %vec, i32 %B, i32 1
  ret <2 x i32> %updated
}
|
|
|
|
; 128-bit variant: loads a <16 x i8> through %A and returns it with lane 1
; replaced by %B. The directives expect the 8-bit lane move.
define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind {
;CHECK: vsetQ_lane8:
;CHECK: vmov.8
  %vec = load <16 x i8>* %A
  %updated = insertelement <16 x i8> %vec, i8 %B, i32 1
  ret <16 x i8> %updated
}
|
|
|
|
; 128-bit variant: loads an <8 x i16> through %A and returns it with lane 1
; replaced by %B. The directives expect the 16-bit lane move.
define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
;CHECK: vsetQ_lane16:
;CHECK: vmov.16
  %vec = load <8 x i16>* %A
  %updated = insertelement <8 x i16> %vec, i16 %B, i32 1
  ret <8 x i16> %updated
}
|
|
|
|
; 128-bit variant: loads a <4 x i32> through %A and returns it with lane 1
; replaced by %B. The directives expect the 32-bit lane move.
define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
;CHECK: vsetQ_lane32:
;CHECK: vmov.32
  %vec = load <4 x i32>* %A
  %updated = insertelement <4 x i32> %vec, i32 %B, i32 1
  ret <4 x i32> %updated
}
|
|
|
|
; Writes the scalar float argument into lane 1 of the <2 x float> argument
; and returns the result, using the arm_aapcs_vfpcc calling convention.
; Unlike the integer cases, the directives here name exact registers: a
; single-precision move s0 -> s3, then a double-precision move d1 -> d0.
define arm_aapcs_vfpcc <2 x float> @test_vset_lanef32(float %arg0_float32_t, <2 x float> %arg1_float32x2_t) nounwind {
;CHECK: test_vset_lanef32:
;CHECK: vmov.f32 s3, s0
;CHECK: vmov.f64 d0, d1
entry:
  %updated = insertelement <2 x float> %arg1_float32x2_t, float %arg0_float32_t, i32 1
  ret <2 x float> %updated
}
|