[AArch64] Split 0 vector stores into scalar store pairs.
Summary:
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
The load store optimizer pass will merge them to store pair stores.
This should be better than a movi to create the vector zero followed by
a vector store if the zero constant is not re-used, since one
instruction and one register live range will be removed.

For example, the final generated code should be:

  stp xzr, xzr, [x0]

instead of:

  movi v0.2d, #0
  str q0, [x0]

Reviewers: t.p.northover, mcrosier, MatzeB, jmolloy

Subscribers: aemerson, rengolin, llvm-commits

Differential Revision: https://reviews.llvm.org/D26561

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286875 91177308-0d34-0410-b5e6-96231b3b80d8
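As a concrete illustration (not part of the commit; the function name and the -O2/AArch64 build setup are assumed), source along the following lines should now produce the store pair shown above, mirroring the merge_zr64 test added further down:

  // Hypothetical source-level example. The two adjacent 64-bit zero stores are
  // merged by DAGCombiner::MergeConsecutiveStores into a single v2i64 zero
  // store, which replaceZeroVectorStore then splits back into xzr stores so
  // the AArch64 load/store optimizer can pair them.
  void zero_pair(long long *p) {
    p[0] = 0;
    p[1] = 0;
  }
  // Expected result (roughly): stp xzr, xzr, [x0]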
@@ -8799,6 +8799,61 @@ static SDValue split16BStoreSplat(SelectionDAG &DAG, StoreSDNode *St,
  return NewST1;
}

/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
/// if the zero constant is not re-used, since one instruction and one register
/// live range will be removed.
///
/// For example, the final generated code should be:
///
///   stp xzr, xzr, [x0]
///
/// instead of:
///
///   movi v0.2d, #0
///   str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
  SDValue StVal = St->getValue();
  EVT VT = StVal.getValueType();

  // We can express a splat as store pair(s) for 2 or 4 elements.
  int NumVecElts = VT.getVectorNumElements();
  if (NumVecElts != 4 && NumVecElts != 2)
    return SDValue();

  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // If the zero constant has more than one use then the vector store could be
  // better since the constant mov will be amortized and stp q instructions
  // should be able to be formed.
  if (!StVal.hasOneUse())
    return SDValue();

  // If the immediate offset of the address operand is too large for the stp
  // instruction, then bail out.
  if (DAG.isBaseWithConstantOffset(St->getBasePtr())) {
    int64_t Offset = St->getBasePtr()->getConstantOperandVal(1);
    if (Offset < -512 || Offset > 504)
      return SDValue();
  }

  for (int I = 0; I < NumVecElts; ++I) {
    SDValue EltVal = StVal.getOperand(I);
    if (!isa<ConstantSDNode>(EltVal) ||
        !cast<ConstantSDNode>(EltVal)->isNullValue())
      return SDValue();
  }
  // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
  // undoing this transformation.
  return split16BStoreSplat(
      DAG, St, NumVecElts == 4 ? DAG.getRegister(AArch64::WZR, MVT::i32)
                               : DAG.getRegister(AArch64::XZR, MVT::i64),
      NumVecElts);
}

/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
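A side note on the offset guard in replaceZeroVectorStore above (the encoding detail is stated here as an assumption, not taken from the patch): the -512..504 bounds line up with the signed 7-bit immediate of a 64-bit stp, which is scaled by 8. A minimal sketch of an equivalent standalone check:

  // Hypothetical helper, not part of the patch: mirrors the bail-out range used
  // by replaceZeroVectorStore. The stp encoding additionally requires the
  // offset to be a multiple of 8; whether a pair is actually formed is left to
  // the AArch64 load/store optimizer.
  bool isCandidateXzrPairOffset(long long Offset) {
    return Offset >= -512 && Offset <= 504;
  }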
@@ -8862,6 +8917,17 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  if (S->isVolatile())
    return SDValue();

  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();
  if (!VT.isVector())
    return SDValue();

  // If we get a splat of zeros, convert this vector store to a store of
  // scalars. They will be merged into store pairs of xzr thereby removing one
  // instruction and one register.
  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, S))
    return ReplacedZeroSplat;

  // FIXME: The logic for deciding if an unaligned store should be split should
  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
  // a call to that function here.
@@ -8873,12 +8939,9 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
  if (DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();

  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();

  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
  // those up regresses performance on micro-benchmarks and olden/bh.
  if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();

  // Split unaligned 16B stores. They are terrible for performance.
@@ -182,22 +182,22 @@ declare void @llvm.trap()
; CHECK: LD4Fourv2d
; CHECK: STRQui
; CHECK: ********** INTERVALS **********
define void @testLdStConflict() {
define void @testLdStConflict(<2 x i64> %v) {
entry:
  br label %loop

loop:
  %0 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8* null)
  %ptr = bitcast i8* undef to <2 x i64>*
  store <2 x i64> zeroinitializer, <2 x i64>* %ptr, align 4
  store <2 x i64> %v, <2 x i64>* %ptr, align 4
  %ptr1 = bitcast i8* undef to <2 x i64>*
  store <2 x i64> zeroinitializer, <2 x i64>* %ptr1, align 4
  store <2 x i64> %v, <2 x i64>* %ptr1, align 4
  %ptr2 = bitcast i8* undef to <2 x i64>*
  store <2 x i64> zeroinitializer, <2 x i64>* %ptr2, align 4
  store <2 x i64> %v, <2 x i64>* %ptr2, align 4
  %ptr3 = bitcast i8* undef to <2 x i64>*
  store <2 x i64> zeroinitializer, <2 x i64>* %ptr3, align 4
  store <2 x i64> %v, <2 x i64>* %ptr3, align 4
  %ptr4 = bitcast i8* undef to <2 x i64>*
  store <2 x i64> zeroinitializer, <2 x i64>* %ptr4, align 4
  store <2 x i64> %v, <2 x i64>* %ptr4, align 4
  br label %loop
}

@@ -1333,3 +1333,134 @@ for.body:
end:
  ret void
}

; DAGCombiner::MergeConsecutiveStores merges this into a vector store,
; replaceZeroVectorStore should split the vector store back into
; scalar stores which should get merged by AArch64LoadStoreOptimizer.
define void @merge_zr32(i32* %p) {
; CHECK-LABEL: merge_zr32:
; CHECK: // %entry
; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  store i32 0, i32* %p
  %p1 = getelementptr i32, i32* %p, i32 1
  store i32 0, i32* %p1
  ret void
}

; Same as merge_zr32 but the merged stores should also get paired.
define void @merge_zr32_2(i32* %p) {
; CHECK-LABEL: merge_zr32_2:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  store i32 0, i32* %p
  %p1 = getelementptr i32, i32* %p, i32 1
  store i32 0, i32* %p1
  %p2 = getelementptr i32, i32* %p, i64 2
  store i32 0, i32* %p2
  %p3 = getelementptr i32, i32* %p, i64 3
  store i32 0, i32* %p3
  ret void
}

; Like merge_zr32_2, but checking the largest allowed stp immediate offset.
define void @merge_zr32_2_offset(i32* %p) {
; CHECK-LABEL: merge_zr32_2_offset:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
; CHECK-NEXT: ret
entry:
  %p0 = getelementptr i32, i32* %p, i32 126
  store i32 0, i32* %p0
  %p1 = getelementptr i32, i32* %p, i32 127
  store i32 0, i32* %p1
  %p2 = getelementptr i32, i32* %p, i64 128
  store i32 0, i32* %p2
  %p3 = getelementptr i32, i32* %p, i64 129
  store i32 0, i32* %p3
  ret void
}

; Like merge_zr32, but replaceZeroVectorStore should not split this
; vector store since the address offset is too large for the stp
; instruction.
define void @no_merge_zr32_2_offset(i32* %p) {
; CHECK-LABEL: no_merge_zr32_2_offset:
; CHECK: // %entry
; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
; CHECK-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
; CHECK-NEXT: ret
entry:
  %p0 = getelementptr i32, i32* %p, i32 1024
  store i32 0, i32* %p0
  %p1 = getelementptr i32, i32* %p, i32 1025
  store i32 0, i32* %p1
  %p2 = getelementptr i32, i32* %p, i64 1026
  store i32 0, i32* %p2
  %p3 = getelementptr i32, i32* %p, i64 1027
  store i32 0, i32* %p3
  ret void
}

; Like merge_zr32, but replaceZeroVectorStore should not split the
; vector store since the zero constant vector has multiple uses, so we
; err on the side that allows for stp q instruction generation.
define void @merge_zr32_3(i32* %p) {
; CHECK-LABEL: merge_zr32_3:
; CHECK: // %entry
; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  store i32 0, i32* %p
  %p1 = getelementptr i32, i32* %p, i32 1
  store i32 0, i32* %p1
  %p2 = getelementptr i32, i32* %p, i64 2
  store i32 0, i32* %p2
  %p3 = getelementptr i32, i32* %p, i64 3
  store i32 0, i32* %p3
  %p4 = getelementptr i32, i32* %p, i64 4
  store i32 0, i32* %p4
  %p5 = getelementptr i32, i32* %p, i64 5
  store i32 0, i32* %p5
  %p6 = getelementptr i32, i32* %p, i64 6
  store i32 0, i32* %p6
  %p7 = getelementptr i32, i32* %p, i64 7
  store i32 0, i32* %p7
  ret void
}

; Similar to merge_zr32, but for 64-bit values.
define void @merge_zr64(i64* %p) {
; CHECK-LABEL: merge_zr64:
; CHECK: // %entry
; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  store i64 0, i64* %p
  %p1 = getelementptr i64, i64* %p, i64 1
  store i64 0, i64* %p1
  ret void
}

; Similar to merge_zr32_3, replaceZeroVectorStore should not split the
; vector store since the zero constant vector has multiple uses.
define void @merge_zr64_2(i64* %p) {
; CHECK-LABEL: merge_zr64_2:
; CHECK: // %entry
; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  store i64 0, i64* %p
  %p1 = getelementptr i64, i64* %p, i64 1
  store i64 0, i64* %p1
  %p2 = getelementptr i64, i64* %p, i64 2
  store i64 0, i64* %p2
  %p3 = getelementptr i64, i64* %p, i64 3
  store i64 0, i64* %p3
  ret void
}
@@ -10,11 +10,11 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3
define i32 @main() local_unnamed_addr #1 {
; Make sure the stores happen in the correct order (the exact instructions could change).
; CHECK-LABEL: main:
; CHECK: stp xzr, xzr, [sp, #72]
; CHECK: str w9, [sp, #80]
; CHECK: str q0, [sp, #48]
; CHECK: ldr w8, [sp, #48]
; CHECK: stur q1, [sp, #72]
; CHECK: str q0, [sp, #64]
; CHECK: str w9, [sp, #80]

for.body.lr.ph.i.i.i.i.i.i63:
  %b1 = alloca [10 x i32], align 16