[AArch64] Split 0 vector stores into scalar store pairs.

Summary:
Replace a vector store of a zero splat with scalar stores of WZR/XZR.
The load/store optimizer pass will then merge them into store pairs.
This should be better than a movi to create the vector zero followed by
a vector store when the zero constant is not reused, since one
instruction and one register live range are removed.

For example, the final generated code should be:

  stp xzr, xzr, [x0]

instead of:

  movi v0.2d, #0
  str q0, [x0]
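
For reference, a minimal IR input that should exercise this combine (the
function name is illustrative, not taken from the patch):

  define void @store_zero(<2 x i64>* %p) {
    store <2 x i64> zeroinitializer, <2 x i64>* %p, align 8
    ret void
  }

The zeroinitializer is built as a BUILD_VECTOR of zero constants during
selection, so the new combine should rewrite the vector store into two XZR
stores that the load/store optimizer then pairs into the stp shown above.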

Reviewers: t.p.northover, mcrosier, MatzeB, jmolloy

Subscribers: aemerson, rengolin, llvm-commits

Differential Revision: https://reviews.llvm.org/D26561

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286875 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Geoff Berry
Date:   2016-11-14 19:39:04 +0000
Commit: 9640691bf4 (parent 0763004de3)
4 changed files with 206 additions and 12 deletions


@@ -8799,6 +8799,61 @@ static SDValue split16BStoreSplat(SelectionDAG &DAG, StoreSDNode *St,
  return NewST1;
}

/// Replace a vector store of a zero splat with scalar stores of WZR/XZR. The
/// load/store optimizer pass will merge them into store pairs. This should be
/// better than a movi to create the vector zero followed by a vector store
/// when the zero constant is not reused, since one instruction and one
/// register live range are removed.
///
/// For example, the final generated code should be:
///
/// stp xzr, xzr, [x0]
///
/// instead of:
///
/// movi v0.2d, #0
/// str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  // We can express a splat as store pair(s) for 2 or 4 elements.
  int NumVecElts = VT.getVectorNumElements();
  if (NumVecElts != 4 && NumVecElts != 2)
    return SDValue();
  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();
  // If the zero constant has more than one use then the vector store could be
  // better since the constant mov will be amortized and stp q instructions
  // should be able to be formed.
  if (!StVal.hasOneUse())
    return SDValue();
  // If the immediate offset of the address operand is too large for the stp
  // instruction, then bail out.
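  // STP of X registers takes a signed 7-bit immediate scaled by 8, so the
  // reachable offset range is [-512, 504].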
  if (DAG.isBaseWithConstantOffset(St->getBasePtr())) {
    int64_t Offset = St->getBasePtr()->getConstantOperandVal(1);
    if (Offset < -512 || Offset > 504)
      return SDValue();
  }
  for (int I = 0; I < NumVecElts; ++I) {
    SDValue EltVal = StVal.getOperand(I);
    if (!isa<ConstantSDNode>(EltVal) ||
        !cast<ConstantSDNode>(EltVal)->isNullValue())
      return SDValue();
  }
  // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
  // undoing this transformation.
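  // (That combine folds consecutive constant stores back into a vector store;
  // values copied from WZR/XZR are not constants, so the scalar stores should
  // be left alone.)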
  return split16BStoreSplat(
      DAG, St, NumVecElts == 4 ? DAG.getRegister(AArch64::WZR, MVT::i32)
                               : DAG.getRegister(AArch64::XZR, MVT::i64),
      NumVecElts);
}

/// Replace a vector store of a splatted scalar with scalar stores of the scalar
/// value. The load/store optimizer pass will merge them into store pairs.
/// This has better performance than a splat of the scalar followed by a split
@@ -8862,6 +8917,17 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  if (S->isVolatile())
    return SDValue();
  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();
  if (!VT.isVector())
    return SDValue();
  // If we get a splat of zeros, convert this vector store to a store of
  // scalars. They will be merged into store pairs of xzr thereby removing one
  // instruction and one register.
  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, S))
    return ReplacedZeroSplat;
  // FIXME: The logic for deciding if an unaligned store should be split should
  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
  // a call to that function here.
@@ -8873,12 +8939,9 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  if (DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();
  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();
  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
  // those up regresses performance on micro-benchmarks and olden/bh.
  if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();
  // Split unaligned 16B stores. They are terrible for performance.


@@ -182,22 +182,22 @@ declare void @llvm.trap()
; CHECK: LD4Fourv2d
; CHECK: STRQui
; CHECK: ********** INTERVALS **********
define void @testLdStConflict() {
define void @testLdStConflict(<2 x i64> %v) {
entry:
br label %loop
loop:
%0 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8* null)
%ptr = bitcast i8* undef to <2 x i64>*
store <2 x i64> zeroinitializer, <2 x i64>* %ptr, align 4
store <2 x i64> %v, <2 x i64>* %ptr, align 4
%ptr1 = bitcast i8* undef to <2 x i64>*
store <2 x i64> zeroinitializer, <2 x i64>* %ptr1, align 4
store <2 x i64> %v, <2 x i64>* %ptr1, align 4
%ptr2 = bitcast i8* undef to <2 x i64>*
store <2 x i64> zeroinitializer, <2 x i64>* %ptr2, align 4
store <2 x i64> %v, <2 x i64>* %ptr2, align 4
%ptr3 = bitcast i8* undef to <2 x i64>*
store <2 x i64> zeroinitializer, <2 x i64>* %ptr3, align 4
store <2 x i64> %v, <2 x i64>* %ptr3, align 4
%ptr4 = bitcast i8* undef to <2 x i64>*
store <2 x i64> zeroinitializer, <2 x i64>* %ptr4, align 4
store <2 x i64> %v, <2 x i64>* %ptr4, align 4
br label %loop
}


@@ -1333,3 +1333,134 @@ for.body:
end:
ret void
}
; DAGCombiner::MergeConsecutiveStores merges this into a vector store,
; replaceZeroVectorStore should split the vector store back into
; scalar stores which should get merged by AArch64LoadStoreOptimizer.
define void @merge_zr32(i32* %p) {
; CHECK-LABEL: merge_zr32:
; CHECK: // %entry
; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store i32 0, i32* %p
%p1 = getelementptr i32, i32* %p, i32 1
store i32 0, i32* %p1
ret void
}
; Same as merge_zr32, but the merged stores should also get paired.
define void @merge_zr32_2(i32* %p) {
; CHECK-LABEL: merge_zr32_2:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store i32 0, i32* %p
%p1 = getelementptr i32, i32* %p, i32 1
store i32 0, i32* %p1
%p2 = getelementptr i32, i32* %p, i64 2
store i32 0, i32* %p2
%p3 = getelementptr i32, i32* %p, i64 3
store i32 0, i32* %p3
ret void
}
; Like merge_zr32_2, but checking the largest allowed stp immediate offset.
define void @merge_zr32_2_offset(i32* %p) {
; CHECK-LABEL: merge_zr32_2_offset:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
; CHECK-NEXT: ret
entry:
%p0 = getelementptr i32, i32* %p, i32 126
store i32 0, i32* %p0
%p1 = getelementptr i32, i32* %p, i32 127
store i32 0, i32* %p1
%p2 = getelementptr i32, i32* %p, i64 128
store i32 0, i32* %p2
%p3 = getelementptr i32, i32* %p, i64 129
store i32 0, i32* %p3
ret void
}
; Like merge_zr32, but replaceZeroVectorStore should not split this
; vector store since the address offset is too large for the stp
; instruction.
define void @no_merge_zr32_2_offset(i32* %p) {
; CHECK-LABEL: no_merge_zr32_2_offset:
; CHECK: // %entry
; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
; CHECK-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
; CHECK-NEXT: ret
entry:
%p0 = getelementptr i32, i32* %p, i32 1024
store i32 0, i32* %p0
%p1 = getelementptr i32, i32* %p, i32 1025
store i32 0, i32* %p1
%p2 = getelementptr i32, i32* %p, i64 1026
store i32 0, i32* %p2
%p3 = getelementptr i32, i32* %p, i64 1027
store i32 0, i32* %p3
ret void
}
; Like merge_zr32, but replaceZeroVectorStore should not split the
; vector store since the zero constant vector has multiple uses, so we
; err on the side that allows for stp q instruction generation.
define void @merge_zr32_3(i32* %p) {
; CHECK-LABEL: merge_zr32_3:
; CHECK: // %entry
; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store i32 0, i32* %p
%p1 = getelementptr i32, i32* %p, i32 1
store i32 0, i32* %p1
%p2 = getelementptr i32, i32* %p, i64 2
store i32 0, i32* %p2
%p3 = getelementptr i32, i32* %p, i64 3
store i32 0, i32* %p3
%p4 = getelementptr i32, i32* %p, i64 4
store i32 0, i32* %p4
%p5 = getelementptr i32, i32* %p, i64 5
store i32 0, i32* %p5
%p6 = getelementptr i32, i32* %p, i64 6
store i32 0, i32* %p6
%p7 = getelementptr i32, i32* %p, i64 7
store i32 0, i32* %p7
ret void
}
; Similar to merge_zr32, but for 64-bit values.
define void @merge_zr64(i64* %p) {
; CHECK-LABEL: merge_zr64:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store i64 0, i64* %p
%p1 = getelementptr i64, i64* %p, i64 1
store i64 0, i64* %p1
ret void
}
; Similar to merge_zr32_3, replaceZeroVectorStore should not split the
; vector store since the zero constant vector has multiple uses.
define void @merge_zr64_2(i64* %p) {
; CHECK-LABEL: merge_zr64_2:
; CHECK: // %entry
; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store i64 0, i64* %p
%p1 = getelementptr i64, i64* %p, i64 1
store i64 0, i64* %p1
%p2 = getelementptr i64, i64* %p, i64 2
store i64 0, i64* %p2
%p3 = getelementptr i64, i64* %p, i64 3
store i64 0, i64* %p3
ret void
}


@@ -10,11 +10,11 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3
define i32 @main() local_unnamed_addr #1 {
; Make sure the stores happen in the correct order (the exact instructions could change).
; CHECK-LABEL: main:
; CHECK: stp xzr, xzr, [sp, #72]
; CHECK: str w9, [sp, #80]
; CHECK: str q0, [sp, #48]
; CHECK: ldr w8, [sp, #48]
; CHECK: stur q1, [sp, #72]
; CHECK: str q0, [sp, #64]
; CHECK: str w9, [sp, #80]
for.body.lr.ph.i.i.i.i.i.i63:
%b1 = alloca [10 x i32], align 16