[AArch64] Split 0 vector stores into scalar store pairs.

Summary:
Replace a vector store of a zero splat with scalar stores of WZR/XZR.
The load/store optimizer pass will then merge them into store pairs.
This should be better than a movi to create the vector zero followed by
a vector store when the zero constant is not reused, since one
instruction and one register live range are removed.

For example, the final generated code should be:

  stp xzr, xzr, [x0]

instead of:

  movi v0.2d, #0
  str q0, [x0]
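
For reference, a minimal IR input that should exercise this combine (the
function name is illustrative, not taken from the patch):

  define void @store_zero(<2 x i64>* %p) {
    store <2 x i64> zeroinitializer, <2 x i64>* %p, align 8
    ret void
  }

The zeroinitializer is built as a BUILD_VECTOR of zero constants during
selection, so the new combine should rewrite the vector store into two XZR
stores that the load/store optimizer then pairs into the stp shown above.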

Reviewers: t.p.northover, mcrosier, MatzeB, jmolloy

Subscribers: aemerson, rengolin, llvm-commits

Differential Revision: https://reviews.llvm.org/D26561

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286875 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Geoff Berry
Date:   2016-11-14 19:39:04 +0000
Commit: 9640691bf4 (parent 0763004de3)
4 changed files with 206 additions and 12 deletions


@@ -8799,6 +8799,61 @@ static SDValue split16BStoreSplat(SelectionDAG &DAG, StoreSDNode *St,
  return NewST1;
}

/// Replace a vector store of a zero splat with scalar stores of WZR/XZR. The
/// load/store optimizer pass will merge them into store pairs. This should be
/// better than a movi to create the vector zero followed by a vector store
/// when the zero constant is not reused, since one instruction and one
/// register live range are removed.
///
/// For example, the final generated code should be:
///
/// stp xzr, xzr, [x0]
///
/// instead of:
///
/// movi v0.2d, #0
/// str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();
  // We can express a splat as store pair(s) for 2 or 4 elements.
  int NumVecElts = VT.getVectorNumElements();
  if (NumVecElts != 4 && NumVecElts != 2)
    return SDValue();
  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();
  // If the zero constant has more than one use then the vector store could be
  // better since the constant mov will be amortized and stp q instructions
  // should be able to be formed.
  if (!StVal.hasOneUse())
    return SDValue();
  // If the immediate offset of the address operand is too large for the stp
  // instruction, then bail out.
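  // STP of X registers takes a signed 7-bit immediate scaled by 8, so the
  // reachable offset range is [-512, 504].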
  if (DAG.isBaseWithConstantOffset(St->getBasePtr())) {
    int64_t Offset = St->getBasePtr()->getConstantOperandVal(1);
    if (Offset < -512 || Offset > 504)
      return SDValue();
  }
  for (int I = 0; I < NumVecElts; ++I) {
    SDValue EltVal = StVal.getOperand(I);
    if (!isa<ConstantSDNode>(EltVal) ||
        !cast<ConstantSDNode>(EltVal)->isNullValue())
      return SDValue();
  }
  // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
  // undoing this transformation.
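  // (That combine folds consecutive constant stores back into a vector store;
  // values copied from WZR/XZR are not constants, so the scalar stores should
  // be left alone.)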
  return split16BStoreSplat(
      DAG, St, NumVecElts == 4 ? DAG.getRegister(AArch64::WZR, MVT::i32)
                               : DAG.getRegister(AArch64::XZR, MVT::i64),
      NumVecElts);
}

/// Replace a vector store of a splatted scalar with scalar stores of the scalar
/// value. The load/store optimizer pass will merge them into store pairs.
/// This has better performance than a splat of the scalar followed by a split
@@ -8862,6 +8917,17 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  if (S->isVolatile())
    return SDValue();
  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();
  if (!VT.isVector())
    return SDValue();
  // If we get a splat of zeros, convert this vector store to a store of
  // scalars. They will be merged into store pairs of xzr thereby removing one
  // instruction and one register.
  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, S))
    return ReplacedZeroSplat;
  // FIXME: The logic for deciding if an unaligned store should be split should
  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
  // a call to that function here.
@@ -8873,12 +8939,9 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  if (DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();
  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();
  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
  // those up regresses performance on micro-benchmarks and olden/bh.
  if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();
  // Split unaligned 16B stores. They are terrible for performance.


@@ -182,22 +182,22 @@ declare void @llvm.trap()
; CHECK: LD4Fourv2d
; CHECK: STRQui
; CHECK: ********** INTERVALS **********
define void @testLdStConflict() {
define void @testLdStConflict(<2 x i64> %v) {
entry:
br label %loop
loop:
%0 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8* null)
%ptr = bitcast i8* undef to <2 x i64>*
store <2 x i64> zeroinitializer, <2 x i64>* %ptr, align 4
store <2 x i64> %v, <2 x i64>* %ptr, align 4
%ptr1 = bitcast i8* undef to <2 x i64>*
store <2 x i64> zeroinitializer, <2 x i64>* %ptr1, align 4
store <2 x i64> %v, <2 x i64>* %ptr1, align 4
%ptr2 = bitcast i8* undef to <2 x i64>*
store <2 x i64> zeroinitializer, <2 x i64>* %ptr2, align 4
store <2 x i64> %v, <2 x i64>* %ptr2, align 4
%ptr3 = bitcast i8* undef to <2 x i64>*
store <2 x i64> zeroinitializer, <2 x i64>* %ptr3, align 4
store <2 x i64> %v, <2 x i64>* %ptr3, align 4
%ptr4 = bitcast i8* undef to <2 x i64>*
store <2 x i64> zeroinitializer, <2 x i64>* %ptr4, align 4
store <2 x i64> %v, <2 x i64>* %ptr4, align 4
br label %loop
}


@@ -1333,3 +1333,134 @@ for.body:
end:
ret void
}
; DAGCombiner::MergeConsecutiveStores merges this into a vector store,
; replaceZeroVectorStore should split the vector store back into
; scalar stores which should get merged by AArch64LoadStoreOptimizer.
define void @merge_zr32(i32* %p) {
; CHECK-LABEL: merge_zr32:
; CHECK: // %entry
; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store i32 0, i32* %p
%p1 = getelementptr i32, i32* %p, i32 1
store i32 0, i32* %p1
ret void
}
; Same as merge_zr32, but the merged stores should also get paired.
define void @merge_zr32_2(i32* %p) {
; CHECK-LABEL: merge_zr32_2:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store i32 0, i32* %p
%p1 = getelementptr i32, i32* %p, i32 1
store i32 0, i32* %p1
%p2 = getelementptr i32, i32* %p, i64 2
store i32 0, i32* %p2
%p3 = getelementptr i32, i32* %p, i64 3
store i32 0, i32* %p3
ret void
}
; Like merge_zr32_2, but checking the largest allowed stp immediate offset.
define void @merge_zr32_2_offset(i32* %p) {
; CHECK-LABEL: merge_zr32_2_offset:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
; CHECK-NEXT: ret
entry:
%p0 = getelementptr i32, i32* %p, i32 126
store i32 0, i32* %p0
%p1 = getelementptr i32, i32* %p, i32 127
store i32 0, i32* %p1
%p2 = getelementptr i32, i32* %p, i64 128
store i32 0, i32* %p2
%p3 = getelementptr i32, i32* %p, i64 129
store i32 0, i32* %p3
ret void
}
; Like merge_zr32, but replaceZeroVectorStore should not split this
; vector store since the address offset is too large for the stp
; instruction.
define void @no_merge_zr32_2_offset(i32* %p) {
; CHECK-LABEL: no_merge_zr32_2_offset:
; CHECK: // %entry
; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
; CHECK-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
; CHECK-NEXT: ret
entry:
%p0 = getelementptr i32, i32* %p, i32 1024
store i32 0, i32* %p0
%p1 = getelementptr i32, i32* %p, i32 1025
store i32 0, i32* %p1
%p2 = getelementptr i32, i32* %p, i64 1026
store i32 0, i32* %p2
%p3 = getelementptr i32, i32* %p, i64 1027
store i32 0, i32* %p3
ret void
}
; Like merge_zr32, but replaceZeroVectorStore should not split the
; vector store since the zero constant vector has multiple uses, so we
; err on the side that allows for stp q instruction generation.
define void @merge_zr32_3(i32* %p) {
; CHECK-LABEL: merge_zr32_3:
; CHECK: // %entry
; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store i32 0, i32* %p
%p1 = getelementptr i32, i32* %p, i32 1
store i32 0, i32* %p1
%p2 = getelementptr i32, i32* %p, i64 2
store i32 0, i32* %p2
%p3 = getelementptr i32, i32* %p, i64 3
store i32 0, i32* %p3
%p4 = getelementptr i32, i32* %p, i64 4
store i32 0, i32* %p4
%p5 = getelementptr i32, i32* %p, i64 5
store i32 0, i32* %p5
%p6 = getelementptr i32, i32* %p, i64 6
store i32 0, i32* %p6
%p7 = getelementptr i32, i32* %p, i64 7
store i32 0, i32* %p7
ret void
}
; Similar to merge_zr32, but for 64-bit values.
define void @merge_zr64(i64* %p) {
; CHECK-LABEL: merge_zr64:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store i64 0, i64* %p
%p1 = getelementptr i64, i64* %p, i64 1
store i64 0, i64* %p1
ret void
}
; Similar to merge_zr32_3, replaceZeroVectorStore should not split the
; vector store since the zero constant vector has multiple uses.
define void @merge_zr64_2(i64* %p) {
; CHECK-LABEL: merge_zr64_2:
; CHECK: // %entry
; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
store i64 0, i64* %p
%p1 = getelementptr i64, i64* %p, i64 1
store i64 0, i64* %p1
%p2 = getelementptr i64, i64* %p, i64 2
store i64 0, i64* %p2
%p3 = getelementptr i64, i64* %p, i64 3
store i64 0, i64* %p3
ret void
}


@@ -10,11 +10,11 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3
define i32 @main() local_unnamed_addr #1 {
; Make sure the stores happen in the correct order (the exact instructions could change).
; CHECK-LABEL: main:
; CHECK: stp xzr, xzr, [sp, #72]
; CHECK: str w9, [sp, #80]
; CHECK: str q0, [sp, #48]
; CHECK: ldr w8, [sp, #48]
; CHECK: stur q1, [sp, #72]
; CHECK: str q0, [sp, #64]
; CHECK: str w9, [sp, #80]
for.body.lr.ph.i.i.i.i.i.i63:
%b1 = alloca [10 x i32], align 16