[AArch64] Improve single vector lane unscaled stores

When storing the 0th lane of a vector, use a simpler and usually more
efficient scalar store instead.  This patch does so for stores that use an
unscaled offset as well.

Differential revision: https://reviews.llvm.org/D46762

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@332394 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Evandro Menezes 2018-05-15 20:41:12 +00:00
parent 63f181a914
commit 8d19d135af
3 changed files with 92 additions and 55 deletions

View File

@ -2477,6 +2477,22 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
// Truncating i8 store of a GPR64 value addressed through am_unscaled8 with a
// simm9 offset: select STURBBi on the sub_32 subregister of $Rt.
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
// Match stores from lane 0 to the appropriate subreg's store.
// Unscaled-offset wrapper around VecStoreLane0Pat: forwards the store operator
// (StoreOp), the vector type (VTy), the scalar type (STy), the subregister
// index (SubRegIdx) and the store instruction (STR), while fixing the
// addressing pattern to am_unscaled128 and the offset operand to simm9.
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
ValueType VTy, ValueType STy,
SubRegIndex SubRegIdx, Instruction STR> {
defm : VecStoreLane0Pat<am_unscaled128, StoreOp, VTy, STy, SubRegIdx, simm9, STR>;
}
// Instantiate the unscaled lane-0 store patterns for the 128-bit vector types:
// 16-bit elements (hsub) use STURHi, 32-bit elements (ssub) use STURSi and
// 64-bit elements (dsub) use STURDi.  v8i16 is matched through truncstorei16
// with an i32 scalar type; every other type uses a plain store.
// NOTE(review): AddedComplexity = 19 presumably lets these patterns win over
// the competing vector lane-store patterns -- confirm before changing it.
let AddedComplexity = 19 in {
defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, hsub, STURHi>;
defm : VecStoreULane0Pat<store, v8f16, f16, hsub, STURHi>;
defm : VecStoreULane0Pat<store, v4i32, i32, ssub, STURSi>;
defm : VecStoreULane0Pat<store, v4f32, f32, ssub, STURSi>;
defm : VecStoreULane0Pat<store, v2i64, i64, dsub, STURDi>;
defm : VecStoreULane0Pat<store, v2f64, f64, dsub, STURDi>;
}
//---
// STR mnemonics fall back to STUR for negative or unaligned offsets.
def : InstAlias<"str $Rt, [$Rn, $offset]",

View File

@ -4,23 +4,25 @@
; Stores lane 1 of a <16 x i8> vector; FileCheck expects an st1.b lane store.
define void @st1lane_16b(<16 x i8> %A, i8* %D) {
; CHECK-LABEL: st1lane_16b
; CHECK: st1.b
; CHECK: st1.b { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr i8, i8* %D, i64 1
%tmp = extractelement <16 x i8> %A, i32 1
store i8 %tmp, i8* %D
store i8 %tmp, i8* %ptr
ret void
}
; Stores lane 0 of a <16 x i8> vector; FileCheck expects an st1.b lane store.
define void @st1lane0_16b(<16 x i8> %A, i8* %D) {
; CHECK-LABEL: st1lane0_16b
; CHECK: st1.b
; CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
%ptr = getelementptr i8, i8* %D, i64 1
%tmp = extractelement <16 x i8> %A, i32 0
store i8 %tmp, i8* %D
store i8 %tmp, i8* %ptr
ret void
}
define void @st1lane0u_16b(<16 x i8> %A, i8* %D) {
; CHECK-LABEL: st1lane0u_16b
; CHECK: st1.b
; CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
%ptr = getelementptr i8, i8* %D, i64 -1
%tmp = extractelement <16 x i8> %A, i32 0
store i8 %tmp, i8* %ptr
@ -49,23 +51,25 @@ define void @st1lane0_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) {
; Stores lane 1 of a <8 x i16> vector; FileCheck expects an st1.h lane store.
define void @st1lane_8h(<8 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane_8h
; CHECK: st1.h
; CHECK: st1.h { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr i16, i16* %D, i64 1
%tmp = extractelement <8 x i16> %A, i32 1
store i16 %tmp, i16* %D
store i16 %tmp, i16* %ptr
ret void
}
; Stores lane 0 of a <8 x i16> vector; FileCheck expects a scalar store
; (str h0, [x0, #2]).
define void @st1lane0_8h(<8 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane0_8h
; CHECK: str
; CHECK: str h0, [x0, #2]
%ptr = getelementptr i16, i16* %D, i64 1
%tmp = extractelement <8 x i16> %A, i32 0
store i16 %tmp, i16* %D
store i16 %tmp, i16* %ptr
ret void
}
define void @st1lane0u_8h(<8 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane0u_8h
; CHECK: st1.h
; CHECK: stur h0, [x0, #-2]
%ptr = getelementptr i16, i16* %D, i64 -1
%tmp = extractelement <8 x i16> %A, i32 0
store i16 %tmp, i16* %ptr
@ -93,23 +97,25 @@ define void @st1lane0_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) {
; Stores lane 1 of a <4 x i32> vector; FileCheck expects an st1.s lane store.
define void @st1lane_4s(<4 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane_4s
; CHECK: st1.s
; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr i32, i32* %D, i64 1
%tmp = extractelement <4 x i32> %A, i32 1
store i32 %tmp, i32* %D
store i32 %tmp, i32* %ptr
ret void
}
; Stores lane 0 of a <4 x i32> vector; FileCheck expects a scalar store
; (str s0, [x0, #4]).
define void @st1lane0_4s(<4 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane0_4s
; CHECK: str
; CHECK: str s0, [x0, #4]
%ptr = getelementptr i32, i32* %D, i64 1
%tmp = extractelement <4 x i32> %A, i32 0
store i32 %tmp, i32* %D
store i32 %tmp, i32* %ptr
ret void
}
define void @st1lane0u_4s(<4 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane0u_4s
; CHECK: st1.s
; CHECK: stur s0, [x0, #-4]
%ptr = getelementptr i32, i32* %D, i64 -1
%tmp = extractelement <4 x i32> %A, i32 0
store i32 %tmp, i32* %ptr
@ -137,23 +143,25 @@ define void @st1lane0_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) {
; Stores lane 1 of a <4 x float> vector; FileCheck expects an st1.s lane store.
define void @st1lane_4s_float(<4 x float> %A, float* %D) {
; CHECK-LABEL: st1lane_4s_float
; CHECK: st1.s
; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr float, float* %D, i64 1
%tmp = extractelement <4 x float> %A, i32 1
store float %tmp, float* %D
store float %tmp, float* %ptr
ret void
}
; Stores lane 0 of a <4 x float> vector; FileCheck expects a scalar store
; (str s0, [x0, #4]).
define void @st1lane0_4s_float(<4 x float> %A, float* %D) {
; CHECK-LABEL: st1lane0_4s_float
; CHECK: str
; CHECK: str s0, [x0, #4]
%ptr = getelementptr float, float* %D, i64 1
%tmp = extractelement <4 x float> %A, i32 0
store float %tmp, float* %D
store float %tmp, float* %ptr
ret void
}
define void @st1lane0u_4s_float(<4 x float> %A, float* %D) {
; CHECK-LABEL: st1lane0u_4s_float
; CHECK: st1.s
; CHECK: stur s0, [x0, #-4]
%ptr = getelementptr float, float* %D, i64 -1
%tmp = extractelement <4 x float> %A, i32 0
store float %tmp, float* %ptr
@ -181,23 +189,25 @@ define void @st1lane0_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) {
; Stores lane 1 of a <2 x i64> vector; FileCheck expects an st1.d lane store.
define void @st1lane_2d(<2 x i64> %A, i64* %D) {
; CHECK-LABEL: st1lane_2d
; CHECK: st1.d
; CHECK: st1.d { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr i64, i64* %D, i64 1
%tmp = extractelement <2 x i64> %A, i32 1
store i64 %tmp, i64* %D
store i64 %tmp, i64* %ptr
ret void
}
; Stores lane 0 of a <2 x i64> vector; FileCheck expects a scalar store
; (str d0, [x0, #8]).
define void @st1lane0_2d(<2 x i64> %A, i64* %D) {
; CHECK-LABEL: st1lane0_2d
; CHECK: str
; CHECK: str d0, [x0, #8]
%ptr = getelementptr i64, i64* %D, i64 1
%tmp = extractelement <2 x i64> %A, i32 0
store i64 %tmp, i64* %D
store i64 %tmp, i64* %ptr
ret void
}
define void @st1lane0u_2d(<2 x i64> %A, i64* %D) {
; CHECK-LABEL: st1lane0u_2d
; CHECK: st1.d
; CHECK: stur d0, [x0, #-8]
%ptr = getelementptr i64, i64* %D, i64 -1
%tmp = extractelement <2 x i64> %A, i32 0
store i64 %tmp, i64* %ptr
@ -225,23 +235,25 @@ define void @st1lane0_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) {
; Stores lane 1 of a <2 x double> vector; FileCheck expects an st1.d lane store.
define void @st1lane_2d_double(<2 x double> %A, double* %D) {
; CHECK-LABEL: st1lane_2d_double
; CHECK: st1.d
; CHECK: st1.d { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr double, double* %D, i64 1
%tmp = extractelement <2 x double> %A, i32 1
store double %tmp, double* %D
store double %tmp, double* %ptr
ret void
}
; Stores lane 0 of a <2 x double> vector; FileCheck expects a scalar store
; (str d0, [x0, #8]).
define void @st1lane0_2d_double(<2 x double> %A, double* %D) {
; CHECK-LABEL: st1lane0_2d_double
; CHECK: str
; CHECK: str d0, [x0, #8]
%ptr = getelementptr double, double* %D, i64 1
%tmp = extractelement <2 x double> %A, i32 0
store double %tmp, double* %D
store double %tmp, double* %ptr
ret void
}
define void @st1lane0u_2d_double(<2 x double> %A, double* %D) {
; CHECK-LABEL: st1lane0u_2d_double
; CHECK: st1.d
; CHECK: stur d0, [x0, #-8]
%ptr = getelementptr double, double* %D, i64 -1
%tmp = extractelement <2 x double> %A, i32 0
store double %tmp, double* %ptr
@ -269,9 +281,10 @@ define void @st1lane0_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) {
; Stores lane 1 of a <8 x i8> vector; FileCheck expects an st1.b lane store.
define void @st1lane_8b(<8 x i8> %A, i8* %D) {
; CHECK-LABEL: st1lane_8b
; CHECK: st1.b
; CHECK: st1.b { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr i8, i8* %D, i64 1
%tmp = extractelement <8 x i8> %A, i32 1
store i8 %tmp, i8* %D
store i8 %tmp, i8* %ptr
ret void
}
@ -297,23 +310,25 @@ define void @st1lane0_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) {
; Stores lane 1 of a <4 x i16> vector; FileCheck expects an st1.h lane store.
define void @st1lane_4h(<4 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane_4h
; CHECK: st1.h
; CHECK: st1.h { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr i16, i16* %D, i64 1
%tmp = extractelement <4 x i16> %A, i32 1
store i16 %tmp, i16* %D
store i16 %tmp, i16* %ptr
ret void
}
; Stores lane 0 of a <4 x i16> vector; FileCheck expects a scalar store
; (str h0, [x0, #2]).
define void @st1lane0_4h(<4 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane0_4h
; CHECK: str
; CHECK: str h0, [x0, #2]
%ptr = getelementptr i16, i16* %D, i64 1
%tmp = extractelement <4 x i16> %A, i32 0
store i16 %tmp, i16* %D
store i16 %tmp, i16* %ptr
ret void
}
define void @st1lane0u_4h(<4 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane0u_4h
; CHECK: st1.h
; CHECK: stur h0, [x0, #-2]
%ptr = getelementptr i16, i16* %D, i64 -1
%tmp = extractelement <4 x i16> %A, i32 0
store i16 %tmp, i16* %ptr
@ -341,23 +356,25 @@ define void @st1lane0_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) {
; Stores lane 1 of a <2 x i32> vector; FileCheck expects an st1.s lane store.
define void @st1lane_2s(<2 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane_2s
; CHECK: st1.s
; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr i32, i32* %D, i64 1
%tmp = extractelement <2 x i32> %A, i32 1
store i32 %tmp, i32* %D
store i32 %tmp, i32* %ptr
ret void
}
; Stores lane 0 of a <2 x i32> vector; FileCheck expects a scalar store
; (str s0, [x0, #4]).
define void @st1lane0_2s(<2 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane0_2s
; CHECK: str
; CHECK: str s0, [x0, #4]
%ptr = getelementptr i32, i32* %D, i64 1
%tmp = extractelement <2 x i32> %A, i32 0
store i32 %tmp, i32* %D
store i32 %tmp, i32* %ptr
ret void
}
define void @st1lane0u_2s(<2 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane0u_2s
; CHECK: st1.s
; CHECK: stur s0, [x0, #-4]
%ptr = getelementptr i32, i32* %D, i64 -1
%tmp = extractelement <2 x i32> %A, i32 0
store i32 %tmp, i32* %ptr
@ -385,23 +402,25 @@ define void @st1lane0_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) {
; Stores lane 1 of a <2 x float> vector; FileCheck expects an st1.s lane store.
define void @st1lane_2s_float(<2 x float> %A, float* %D) {
; CHECK-LABEL: st1lane_2s_float
; CHECK: st1.s
; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr float, float* %D, i64 1
%tmp = extractelement <2 x float> %A, i32 1
store float %tmp, float* %D
store float %tmp, float* %ptr
ret void
}
; Stores lane 0 of a <2 x float> vector; FileCheck expects a scalar store
; (str s0, [x0, #4]).
define void @st1lane0_2s_float(<2 x float> %A, float* %D) {
; CHECK-LABEL: st1lane0_2s_float
; CHECK: str
; CHECK: str s0, [x0, #4]
%ptr = getelementptr float, float* %D, i64 1
%tmp = extractelement <2 x float> %A, i32 0
store float %tmp, float* %D
store float %tmp, float* %ptr
ret void
}
define void @st1lane0u_2s_float(<2 x float> %A, float* %D) {
; CHECK-LABEL: st1lane0u_2s_float
; CHECK: st1.s
; CHECK: stur s0, [x0, #-4]
%ptr = getelementptr float, float* %D, i64 -1
%tmp = extractelement <2 x float> %A, i32 0
store float %tmp, float* %ptr
@ -429,15 +448,16 @@ define void @st1lane0_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) {
; Stores the single lane of a <1 x i64> vector; FileCheck expects a scalar
; store (str d0, [x0, #8]).
define void @st1lane0_1d(<1 x i64> %A, i64* %D) {
; CHECK-LABEL: st1lane0_1d
; CHECK: str
; CHECK: str d0, [x0, #8]
%ptr = getelementptr i64, i64* %D, i64 1
%tmp = extractelement <1 x i64> %A, i32 0
store i64 %tmp, i64* %D
store i64 %tmp, i64* %ptr
ret void
}
define void @st1lane0u_1d(<1 x i64> %A, i64* %D) {
; CHECK-LABEL: st1lane0u_1d
; CHECK: st1.d
; CHECK: stur d0, [x0, #-8]
%ptr = getelementptr i64, i64* %D, i64 -1
%tmp = extractelement <1 x i64> %A, i32 0
store i64 %tmp, i64* %ptr
@ -455,15 +475,16 @@ define void @st1lane0_ro_1d(<1 x i64> %A, i64* %D, i64 %offset) {
; Stores the single lane of a <1 x double> vector; FileCheck expects a scalar
; store (str d0, [x0, #8]).
define void @st1lane0_1d_double(<1 x double> %A, double* %D) {
; CHECK-LABEL: st1lane0_1d_double
; CHECK: str
; CHECK: str d0, [x0, #8]
%ptr = getelementptr double, double* %D, i64 1
%tmp = extractelement <1 x double> %A, i32 0
store double %tmp, double* %D
store double %tmp, double* %ptr
ret void
}
define void @st1lane0u_1d_double(<1 x double> %A, double* %D) {
; CHECK-LABEL: st1lane0u_1d_double
; CHECK: stur
; CHECK: stur d0, [x0, #-8]
%ptr = getelementptr double, double* %D, i64 -1
%tmp = extractelement <1 x double> %A, i32 0
store double %tmp, double* %ptr

View File

@ -99,7 +99,7 @@ entry:
define void @storeu_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: storeu_lane0_64:
; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}]
; CHECK: stur h0, [x{{[0-9]+}}, #-2]
entry:
%0 = getelementptr half, half* %a, i64 -1
%1 = extractelement <4 x half> %b, i32 0
@ -148,7 +148,7 @@ entry:
define void @storeu_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: storeu_lane0_128:
; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}]
; CHECK: stur h0, [x{{[0-9]+}}, #-2]
entry:
%0 = getelementptr half, half* %a, i64 -1
%1 = extractelement <8 x half> %b, i32 0