[AArch64] Handle vector types in replaceZeroVectorStore.

Summary:
Extend replaceZeroVectorStore to handle stores of more vector types and
of floating-point zero vectors, and to set alignment more accurately on
split stores.

This is a follow-up change to r286875.

This change fixes PR31038.

Reviewers: MatzeB

Subscribers: mcrosier, aemerson, llvm-commits, rengolin

Differential Revision: https://reviews.llvm.org/D26682

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287142 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Geoff Berry 2016-11-16 19:35:19 +00:00
parent 23e86b5ddf
commit 42fcf0e528
3 changed files with 115 additions and 23 deletions

View File

@ -8844,13 +8844,10 @@ static SDValue performExtendCombine(SDNode *N,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
} }
static SDValue split16BStoreSplat(SelectionDAG &DAG, StoreSDNode &St, static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
SDValue SplatVal, unsigned NumVecElts) { SDValue SplatVal, unsigned NumVecElts) {
assert((NumVecElts == 4 || NumVecElts == 2) && "Unexpected NumVecElts");
unsigned OrigAlignment = St.getAlignment(); unsigned OrigAlignment = St.getAlignment();
unsigned EltOffset = NumVecElts == 4 ? 4 : 8; unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
unsigned Alignment = std::min(OrigAlignment, EltOffset);
// Create scalar stores. This is at least as good as the code sequence for a // Create scalar stores. This is at least as good as the code sequence for a
// split unaligned store which is a dup.s, ext.b, and two stores. // split unaligned store which is a dup.s, ext.b, and two stores.
@ -8860,10 +8857,11 @@ static SDValue split16BStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
SDValue BasePtr = St.getBasePtr(); SDValue BasePtr = St.getBasePtr();
SDValue NewST1 = SDValue NewST1 =
DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, St.getPointerInfo(), DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, St.getPointerInfo(),
St.getAlignment(), St.getMemOperand()->getFlags()); OrigAlignment, St.getMemOperand()->getFlags());
unsigned Offset = EltOffset; unsigned Offset = EltOffset;
while (--NumVecElts) { while (--NumVecElts) {
unsigned Alignment = MinAlign(OrigAlignment, Offset);
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(Offset, DL, MVT::i64)); DAG.getConstant(Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
@ -8893,9 +8891,13 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
SDValue StVal = St.getValue(); SDValue StVal = St.getValue();
EVT VT = StVal.getValueType(); EVT VT = StVal.getValueType();
// We can express a splat as store pair(s) for 2 or 4 elements. // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
// 2, 3 or 4 i32 elements.
int NumVecElts = VT.getVectorNumElements(); int NumVecElts = VT.getVectorNumElements();
if (NumVecElts != 4 && NumVecElts != 2) if (!(((NumVecElts == 2 || NumVecElts == 3) &&
VT.getVectorElementType().getSizeInBits() == 64) ||
((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
VT.getVectorElementType().getSizeInBits() == 32)))
return SDValue(); return SDValue();
if (StVal.getOpcode() != ISD::BUILD_VECTOR) if (StVal.getOpcode() != ISD::BUILD_VECTOR)
@ -8917,16 +8919,16 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
for (int I = 0; I < NumVecElts; ++I) { for (int I = 0; I < NumVecElts; ++I) {
SDValue EltVal = StVal.getOperand(I); SDValue EltVal = StVal.getOperand(I);
if (!isa<ConstantSDNode>(EltVal) || if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
!cast<ConstantSDNode>(EltVal)->isNullValue())
return SDValue(); return SDValue();
} }
// Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
// undoing this transformation. // undoing this transformation.
return split16BStoreSplat( SDValue SplatVal = VT.getVectorElementType().getSizeInBits() == 32
DAG, St, NumVecElts == 4 ? DAG.getRegister(AArch64::WZR, MVT::i32) ? DAG.getRegister(AArch64::WZR, MVT::i32)
: DAG.getRegister(AArch64::XZR, MVT::i64), : DAG.getRegister(AArch64::XZR, MVT::i64);
NumVecElts); return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
} }
/// Replace a splat of a scalar to a vector store by scalar stores of the scalar /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
@ -8979,12 +8981,12 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
if (IndexNotInserted.any()) if (IndexNotInserted.any())
return SDValue(); return SDValue();
return split16BStoreSplat(DAG, St, SplatVal, NumVecElts); return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
} }
static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) { const AArch64Subtarget *Subtarget) {
if (!DCI.isBeforeLegalize()) if (!DCI.isBeforeLegalize())
return SDValue(); return SDValue();
@ -9174,7 +9176,7 @@ static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) { const AArch64Subtarget *Subtarget) {
if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget)) if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
return Split; return Split;
if (Subtarget->supportsAddressTopByteIgnored() && if (Subtarget->supportsAddressTopByteIgnored() &&

View File

@ -6174,11 +6174,10 @@ define <2 x double> @test_v2f64_post_reg_ld1lane(double* %bar, double** %ptr, i6
} }
; Check for dependencies between the vector and the scalar load. ; Check for dependencies between the vector and the scalar load.
define <4 x float> @test_v4f32_post_reg_ld1lane_dep_vec_on_load(float* %bar, float** %ptr, i64 %inc, <4 x float>* %dep_ptr_1, <4 x float>* %dep_ptr_2) { define <4 x float> @test_v4f32_post_reg_ld1lane_dep_vec_on_load(float* %bar, float** %ptr, i64 %inc, <4 x float>* %dep_ptr_1, <4 x float>* %dep_ptr_2, <4 x float> %vec) {
; CHECK-LABEL: test_v4f32_post_reg_ld1lane_dep_vec_on_load: ; CHECK-LABEL: test_v4f32_post_reg_ld1lane_dep_vec_on_load:
; CHECK: BB#0: ; CHECK: BB#0:
; CHECK-NEXT: ldr s[[LD:[0-9]+]], [x0] ; CHECK-NEXT: ldr s[[LD:[0-9]+]], [x0]
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: str q0, [x3] ; CHECK-NEXT: str q0, [x3]
; CHECK-NEXT: ldr q0, [x4] ; CHECK-NEXT: ldr q0, [x4]
; CHECK-NEXT: ins.s v0[1], v[[LD]][0] ; CHECK-NEXT: ins.s v0[1], v[[LD]][0]
@ -6186,7 +6185,7 @@ define <4 x float> @test_v4f32_post_reg_ld1lane_dep_vec_on_load(float* %bar, flo
; CHECK-NEXT: str [[POST]], [x1] ; CHECK-NEXT: str [[POST]], [x1]
; CHECK-NEXT: ret ; CHECK-NEXT: ret
%tmp1 = load float, float* %bar %tmp1 = load float, float* %bar
store <4 x float> zeroinitializer, <4 x float>* %dep_ptr_1, align 16 store <4 x float> %vec, <4 x float>* %dep_ptr_1, align 16
%A = load <4 x float>, <4 x float>* %dep_ptr_2, align 16 %A = load <4 x float>, <4 x float>* %dep_ptr_2, align 16
%tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1 %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1
%tmp3 = getelementptr float, float* %bar, i64 %inc %tmp3 = getelementptr float, float* %bar, i64 %inc

View File

@ -1433,6 +1433,62 @@ entry:
ret void ret void
} }
; Like merge_zr32, but with 2-vector type.
; Stores an all-zero <2 x i32> (64 bits in total) through %p. The expected
; output is exactly one 64-bit store of xzr followed by ret, so no vector
; register is used to materialize the zero.
define void @merge_zr32_2vec(<2 x i32>* %p) {
; CHECK-LABEL: merge_zr32_2vec:
; CHECK: // %entry
; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store <2 x i32> zeroinitializer, <2 x i32>* %p
ret void
}
; Like merge_zr32, but with 3-vector type.
; Stores an all-zero <3 x i32> through %p. The expected output is a 64-bit
; store of xzr at the base address (covering the first two elements) followed
; by a 32-bit store of wzr at byte offset 8 (the third element).
define void @merge_zr32_3vec(<3 x i32>* %p) {
; CHECK-LABEL: merge_zr32_3vec:
; CHECK: // %entry
; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
; CHECK-NEXT: str wzr, [x{{[0-9]+}}, #8]
; CHECK-NEXT: ret
entry:
store <3 x i32> zeroinitializer, <3 x i32>* %p
ret void
}
; Like merge_zr32, but with 4-vector type.
; Stores an all-zero <4 x i32> (128 bits in total) through %p. The expected
; output is a single store-pair of xzr, xzr at the base address, i.e. two
; 64-bit zero stores covering all four elements.
define void @merge_zr32_4vec(<4 x i32>* %p) {
; CHECK-LABEL: merge_zr32_4vec:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store <4 x i32> zeroinitializer, <4 x i32>* %p
ret void
}
; Like merge_zr32, but with 2-vector float type.
; Floating-point counterpart of the <2 x i32> case: stores an all-zero
; <2 x float> through %p and expects the same single 64-bit store of xzr.
define void @merge_zr32_2vecf(<2 x float>* %p) {
; CHECK-LABEL: merge_zr32_2vecf:
; CHECK: // %entry
; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store <2 x float> zeroinitializer, <2 x float>* %p
ret void
}
; Like merge_zr32, but with 4-vector float type.
; Floating-point counterpart of the <4 x i32> case: stores an all-zero
; <4 x float> through %p and expects the same single store-pair of xzr, xzr.
define void @merge_zr32_4vecf(<4 x float>* %p) {
; CHECK-LABEL: merge_zr32_4vecf:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store <4 x float> zeroinitializer, <4 x float>* %p
ret void
}
; Similar to merge_zr32, but for 64-bit values. ; Similar to merge_zr32, but for 64-bit values.
define void @merge_zr64(i64* %p) { define void @merge_zr64(i64* %p) {
; CHECK-LABEL: merge_zr64: ; CHECK-LABEL: merge_zr64:
@ -1464,3 +1520,38 @@ entry:
store i64 0, i64* %p3 store i64 0, i64* %p3
ret void ret void
} }
; Like merge_zr64, but with 2-vector double type.
; Stores an all-zero <2 x double> (128 bits in total) through %p. The
; expected output is a single store-pair of xzr, xzr at the base address,
; one 64-bit zero register per element.
define void @merge_zr64_2vecd(<2 x double>* %p) {
; CHECK-LABEL: merge_zr64_2vecd:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store <2 x double> zeroinitializer, <2 x double>* %p
ret void
}
; Like merge_zr64, but with 3-vector i64 type.
; Stores an all-zero <3 x i64> through %p. The expected output is a
; store-pair of xzr, xzr at the base address (first two elements) followed by
; a single 64-bit store of xzr at byte offset 16 (the third element).
define void @merge_zr64_3vec(<3 x i64>* %p) {
; CHECK-LABEL: merge_zr64_3vec:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
; CHECK-NEXT: str xzr, [x{{[0-9]+}}, #16]
; CHECK-NEXT: ret
entry:
store <3 x i64> zeroinitializer, <3 x i64>* %p
ret void
}
; Like merge_zr64_2, but with 4-vector double type.
; Stores an all-zero <4 x double> (256 bits in total) through %p. Unlike the
; 2- and 3-element 64-bit cases, the expected output does not use xzr stores:
; a zero vector is materialized with movi and the same q register is stored
; twice by one store-pair. REG captures the vector register number so that
; both stp operands are required to be that register.
define void @merge_zr64_4vecd(<4 x double>* %p) {
; CHECK-LABEL: merge_zr64_4vecd:
; CHECK: // %entry
; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store <4 x double> zeroinitializer, <4 x double>* %p
ret void
}