From 4ef753a11895d580c095a73de6039e6b9fbda064 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 23 Feb 2017 10:57:15 +0000 Subject: [PATCH] [SLP] Fix for PR32036: Vectorized horizontal reduction returning wrong result Summary: If the same value is used several times as an extra value, SLP vectorizer takes it into account only once instead of actual number of using. For example: ``` int val = 1; for (int y = 0; y < 8; y++) { for (int x = 0; x < 8; x++) { val = val + input[y * 8 + x] + 3; } } ``` We have 2 extra rguments: `1` - initial value of horizontal reduction and `3`, which is added 8*8 times to the reduction. Before the patch we added `1` to the reduction value and added once `3`, though it must be added 64 times. Reviewers: mkuper, mzolotukhin Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D30262 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295956 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 35 +++++++++++-------- .../SLPVectorizer/X86/horizontal-list.ll | 10 +++--- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index cbb0e7a0c31..925f5ebbbb7 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -304,6 +304,7 @@ public: typedef SmallVector InstrList; typedef SmallPtrSet ValueSet; typedef SmallVector StoreList; + typedef MapVector> ExtraValueToDebugLocsMap; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, @@ -333,7 +334,7 @@ public: /// Vectorize the tree but with the list of externally used values \p /// ExternallyUsedValues. Values in this MapVector can be replaced but the /// generated extractvalue instructions. - Value *vectorizeTree(MapVector &ExternallyUsedValues); + Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues); /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. @@ -352,7 +353,7 @@ public: /// into account (anf updating it, if required) list of externally used /// values stored in \p ExternallyUsedValues. void buildTree(ArrayRef Roots, - MapVector &ExternallyUsedValues, + ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. @@ -953,11 +954,11 @@ private: void BoUpSLP::buildTree(ArrayRef Roots, ArrayRef UserIgnoreLst) { - MapVector ExternallyUsedValues; + ExtraValueToDebugLocsMap ExternallyUsedValues; buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); } void BoUpSLP::buildTree(ArrayRef Roots, - MapVector &ExternallyUsedValues, + ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst) { deleteTree(); UserIgnoreList = UserIgnoreLst; @@ -2801,12 +2802,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL, TreeEntry *E) { } Value *BoUpSLP::vectorizeTree() { - MapVector ExternallyUsedValues; + ExtraValueToDebugLocsMap ExternallyUsedValues; return vectorizeTree(ExternallyUsedValues); } Value * -BoUpSLP::vectorizeTree(MapVector &ExternallyUsedValues) { +BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { @@ -2868,7 +2869,6 @@ BoUpSLP::vectorizeTree(MapVector &ExternallyUsedValues) { assert(ExternallyUsedValues.count(Scalar) && "Scalar with nullptr as an external user must be registered in " "ExternallyUsedValues map"); - DebugLoc DL = ExternallyUsedValues[Scalar]; if (auto *VecI = dyn_cast(Vec)) { Builder.SetInsertPoint(VecI->getParent(), std::next(VecI->getIterator())); @@ -2878,8 +2878,8 @@ BoUpSLP::vectorizeTree(MapVector &ExternallyUsedValues) { Value *Ex = Builder.CreateExtractElement(Vec, Lane); Ex = extend(ScalarRoot, Ex, Scalar->getType()); CSEBlocks.insert(cast(Scalar)->getParent()); - ExternallyUsedValues.erase(Scalar); - ExternallyUsedValues[Ex] = DL; + std::swap(ExternallyUsedValues[Ex], ExternallyUsedValues[Scalar]); + assert(ExternallyUsedValues[Scalar].empty()); continue; } @@ -4439,9 +4439,11 @@ public: Builder.setFastMathFlags(Unsafe); unsigned i = 0; - MapVector ExternallyUsedValues; + BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; + // The same extra argument may be used several time, so log each attempt + // to use it. for (auto &Pair : ExtraArgs) - ExternallyUsedValues[Pair.second] = Pair.first->getDebugLoc(); + ExternallyUsedValues[Pair.second].push_back(Pair.first->getDebugLoc()); while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, ReductionOps); @@ -4489,9 +4491,14 @@ public: Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I); } for (auto &Pair : ExternallyUsedValues) { - Builder.SetCurrentDebugLocation(Pair.second); - VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, - Pair.first, "bin.extra"); + if (Pair.second.empty()) + continue; + // Add each externally used value to the final reduction. + for (auto &DL : Pair.second) { + Builder.SetCurrentDebugLocation(DL); + VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, + Pair.first, "bin.extra"); + } } // Update users. if (ReductionPHI && !isa(ReductionPHI)) { diff --git a/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 814c3a60f56..2eb0b2234ef 100644 --- a/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -1473,9 +1473,10 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 ; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00 -; CHECK-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], [[CONV]] +; CHECK-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00 +; CHECK-NEXT: [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]] ; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; CHECK-NEXT: ret float [[BIN_EXTRA6]] +; CHECK-NEXT: ret float [[BIN_EXTRA7]] ; ; THRESHOLD-LABEL: @extra_args_same_several_times( ; THRESHOLD-NEXT: entry: @@ -1510,9 +1511,10 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 ; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00 -; THRESHOLD-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], [[CONV]] +; THRESHOLD-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00 +; THRESHOLD-NEXT: [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]] ; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] -; THRESHOLD-NEXT: ret float [[BIN_EXTRA6]] +; THRESHOLD-NEXT: ret float [[BIN_EXTRA7]] ; entry: %mul = mul nsw i32 %b, %a