mirror of
https://github.com/RPCSX/llvm.git
synced 2024-12-12 14:17:59 +00:00
[SLP] Emit optimization remarks
The approach I followed was to emit the remark after getTreeCost concludes that SLP is profitable. I initially tried emitting them after the vectorizeRootInstruction calls in vectorizeChainsInBlock, but I vaguely remember missing a few cases, for example in HorizontalReduction::tryToReduce. ORE is placed in BoUpSLP so that it's available from everywhere (notably HorizontalReduction::tryToReduce). We use the first instruction in the root bundle as the locator for the remark. In order to get a sense of how far the tree spans, I've included the size of the tree in the remark. This is not perfect, of course, but it gives you at least a rough idea about the tree. Then you can follow up with -view-slp-tree to really see the actual tree. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@302811 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
0470a16690
commit
65ad27f81e
@ -24,6 +24,7 @@
|
||||
#include "llvm/Analysis/AssumptionCache.h"
|
||||
#include "llvm/Analysis/DemandedBits.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
|
||||
#include "llvm/Analysis/ScalarEvolution.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
@ -59,7 +60,8 @@ public:
|
||||
// Glue for old PM.
|
||||
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
|
||||
TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_,
|
||||
DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_);
|
||||
DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_,
|
||||
OptimizationRemarkEmitter *ORE_);
|
||||
|
||||
private:
|
||||
/// \brief Collect store and getelementptr instructions and organize them
|
||||
|
@ -299,10 +299,10 @@ public:
|
||||
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
|
||||
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
|
||||
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
|
||||
const DataLayout *DL)
|
||||
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
|
||||
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
|
||||
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
|
||||
DL(DL), Builder(Se->getContext()) {
|
||||
DL(DL), ORE(ORE), Builder(Se->getContext()) {
|
||||
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
|
||||
// Use the vector register size specified by the target unless overridden
|
||||
// by a command-line option.
|
||||
@ -361,6 +361,8 @@ public:
|
||||
MinBWs.clear();
|
||||
}
|
||||
|
||||
/// \brief Return the number of entries currently held in VectorizableTree.
/// The remark-emission sites in this change report this value as "TreeSize".
unsigned getTreeSize() const { return VectorizableTree.size(); }
|
||||
|
||||
/// \brief Perform LICM and CSE on the newly generated gather sequences.
|
||||
void optimizeGatherSequence();
|
||||
|
||||
@ -399,6 +401,8 @@ public:
|
||||
/// vectorizable. We do not vectorize such trees.
|
||||
bool isTreeTinyAndNotFullyVectorizable();
|
||||
|
||||
/// \brief Return the OptimizationRemarkEmitter pointer stored in this object
/// (the ORE member), so callers holding a BoUpSLP can emit remarks through it.
OptimizationRemarkEmitter *getORE() { return ORE; }
|
||||
|
||||
private:
|
||||
struct TreeEntry;
|
||||
|
||||
@ -928,6 +932,8 @@ private:
|
||||
AssumptionCache *AC;
|
||||
DemandedBits *DB;
|
||||
const DataLayout *DL;
|
||||
OptimizationRemarkEmitter *ORE;
|
||||
|
||||
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
|
||||
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
|
||||
/// Instruction builder to construct the vectorized tree.
|
||||
@ -3772,8 +3778,9 @@ struct SLPVectorizer : public FunctionPass {
|
||||
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
|
||||
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
|
||||
auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
|
||||
|
||||
return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
|
||||
return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
|
||||
}
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
@ -3785,6 +3792,7 @@ struct SLPVectorizer : public FunctionPass {
|
||||
AU.addRequired<LoopInfoWrapperPass>();
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addRequired<DemandedBitsWrapperPass>();
|
||||
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
|
||||
AU.addPreserved<LoopInfoWrapperPass>();
|
||||
AU.addPreserved<DominatorTreeWrapperPass>();
|
||||
AU.addPreserved<AAResultsWrapperPass>();
|
||||
@ -3803,8 +3811,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
|
||||
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
|
||||
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
|
||||
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
|
||||
auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
|
||||
|
||||
bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
|
||||
bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
|
||||
if (!Changed)
|
||||
return PreservedAnalyses::all();
|
||||
|
||||
@ -3819,7 +3828,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
|
||||
TargetTransformInfo *TTI_,
|
||||
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
|
||||
LoopInfo *LI_, DominatorTree *DT_,
|
||||
AssumptionCache *AC_, DemandedBits *DB_) {
|
||||
AssumptionCache *AC_, DemandedBits *DB_,
|
||||
OptimizationRemarkEmitter *ORE_) {
|
||||
SE = SE_;
|
||||
TTI = TTI_;
|
||||
TLI = TLI_;
|
||||
@ -3847,7 +3857,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
|
||||
|
||||
// Use the bottom up slp vectorizer to construct chains that start with
|
||||
// store instructions.
|
||||
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
|
||||
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
|
||||
|
||||
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
|
||||
// delete instructions.
|
||||
@ -3936,6 +3946,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
|
||||
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
|
||||
if (Cost < -SLPCostThreshold) {
|
||||
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
|
||||
using namespace ore;
|
||||
R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
|
||||
cast<StoreInst>(Chain[i]))
|
||||
<< "Stores SLP vectorized with cost " << NV("Cost", Cost)
|
||||
<< " and with tree size "
|
||||
<< NV("TreeSize", R.getTreeSize()));
|
||||
|
||||
R.vectorizeTree();
|
||||
|
||||
// Move to the next bundle.
|
||||
@ -4149,6 +4166,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
|
||||
|
||||
if (Cost < -SLPCostThreshold) {
|
||||
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
|
||||
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
|
||||
cast<Instruction>(Ops[0]))
|
||||
<< "SLP vectorized with cost " << ore::NV("Cost", Cost)
|
||||
<< " and with tree size "
|
||||
<< ore::NV("TreeSize", R.getTreeSize()));
|
||||
|
||||
Value *VectorizedRoot = R.vectorizeTree();
|
||||
|
||||
// Reconstruct the build vector by extracting the vectorized root. This
|
||||
@ -4492,6 +4515,12 @@ public:
|
||||
|
||||
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
|
||||
<< ". (HorRdx)\n");
|
||||
auto *I0 = cast<Instruction>(VL[0]);
|
||||
V.getORE()->emit(
|
||||
OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
|
||||
<< "Vectorized horizontal reduction with cost "
|
||||
<< ore::NV("Cost", Cost) << " and with tree size "
|
||||
<< ore::NV("TreeSize", V.getTreeSize()));
|
||||
|
||||
// Vectorize a tree.
|
||||
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
|
||||
@ -5146,6 +5175,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
|
||||
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
|
||||
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
|
||||
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
|
||||
|
||||
namespace llvm {
|
||||
|
@ -1,4 +1,5 @@
|
||||
; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s
|
||||
; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s
|
||||
; RUN: cat %t | FileCheck -check-prefix=YAML %s
|
||||
|
||||
target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
|
||||
target triple = "aarch64--linux-gnu"
|
||||
@ -23,7 +24,25 @@ target triple = "aarch64--linux-gnu"
|
||||
; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32>
|
||||
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]]
|
||||
; CHECK: sext i32 [[X]] to i64
|
||||
;
|
||||
|
||||
; YAML: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedList
|
||||
; YAML-NEXT: Function: getelementptr_4x32
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'SLP vectorized with cost '
|
||||
; YAML-NEXT: - Cost: '11'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '5'
|
||||
|
||||
; YAML: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedList
|
||||
; YAML-NEXT: Function: getelementptr_4x32
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'SLP vectorized with cost '
|
||||
; YAML-NEXT: - Cost: '16'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '3'
|
||||
|
||||
define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
|
||||
entry:
|
||||
%cmp31 = icmp sgt i32 %n, 0
|
||||
@ -69,7 +88,25 @@ for.body:
|
||||
; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32>
|
||||
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]]
|
||||
; CHECK: sext i32 [[X]] to i64
|
||||
;
|
||||
|
||||
; YAML: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedList
|
||||
; YAML-NEXT: Function: getelementptr_2x32
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'SLP vectorized with cost '
|
||||
; YAML-NEXT: - Cost: '11'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '5'
|
||||
|
||||
; YAML: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedList
|
||||
; YAML-NEXT: Function: getelementptr_2x32
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'SLP vectorized with cost '
|
||||
; YAML-NEXT: - Cost: '6'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '3'
|
||||
|
||||
define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
|
||||
entry:
|
||||
%cmp31 = icmp sgt i32 %n, 0
|
||||
|
@ -1,4 +1,5 @@
|
||||
; RUN: opt -slp-vectorizer -slp-threshold=-6 -S < %s | FileCheck %s
|
||||
; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s
|
||||
; RUN: cat %t | FileCheck -check-prefix=YAML %s
|
||||
|
||||
; FIXME: The threshold is changed to keep this test case a bit smaller.
|
||||
; The AArch64 cost model should not give such high costs to select statements.
|
||||
@ -10,6 +11,16 @@ target triple = "aarch64--linux"
|
||||
; CHECK: load <4 x i32>
|
||||
; CHECK: load <4 x i32>
|
||||
; CHECK: select <4 x i1>
|
||||
|
||||
; YAML: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedHorizontalReduction
|
||||
; YAML-NEXT: Function: test_select
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
||||
; YAML-NEXT: - Cost: '4'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '8'
|
||||
|
||||
define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) {
|
||||
entry:
|
||||
%cmp.22 = icmp sgt i32 %h, 0
|
||||
@ -93,6 +104,16 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia
|
||||
; CHECK: load <4 x i32>
|
||||
; CHECK: load <4 x i32>
|
||||
; CHECK: mul nsw <4 x i32>
|
||||
|
||||
; YAML: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedHorizontalReduction
|
||||
; YAML-NEXT: Function: reduction_with_br
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
||||
; YAML-NEXT: - Cost: '1'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '3'
|
||||
|
||||
entry:
|
||||
%cmp.16 = icmp sgt i32 %h, 0
|
||||
br i1 %cmp.16, label %for.body.lr.ph, label %for.end
|
||||
@ -150,6 +171,16 @@ for.end: ; preds = %for.end.loopexit, %
|
||||
; CHECK: load <8 x i8>
|
||||
; CHECK: load <8 x i8>
|
||||
; CHECK: select <8 x i1>
|
||||
|
||||
; YAML: Pass: slp-vectorizer
|
||||
; YAML-NEXT: Name: VectorizedHorizontalReduction
|
||||
; YAML-NEXT: Function: test_unrolled_select
|
||||
; YAML-NEXT: Args:
|
||||
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
|
||||
; YAML-NEXT: - Cost: '-33'
|
||||
; YAML-NEXT: - String: ' and with tree size '
|
||||
; YAML-NEXT: - TreeSize: '10'
|
||||
|
||||
define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 {
|
||||
entry:
|
||||
%cmp.43 = icmp sgt i32 %h, 0
|
||||
|
32
test/Transforms/SLPVectorizer/AArch64/remarks.ll
Normal file
32
test/Transforms/SLPVectorizer/AArch64/remarks.ll
Normal file
@ -0,0 +1,32 @@
|
||||
; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -pass-remarks=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s
|
||||
|
||||
define void @f(double* %r, double* %w) {
|
||||
%r0 = getelementptr inbounds double, double* %r, i64 0
|
||||
%r1 = getelementptr inbounds double, double* %r, i64 1
|
||||
%f0 = load double, double* %r0
|
||||
%f1 = load double, double* %r1
|
||||
%add0 = fadd double %f0, %f0
|
||||
%add1 = fadd double %f1, %f1
|
||||
%w0 = getelementptr inbounds double, double* %w, i64 0
|
||||
%w1 = getelementptr inbounds double, double* %w, i64 1
|
||||
; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3
|
||||
store double %add0, double* %w0, !dbg !9
|
||||
store double %add1, double* %w1
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
!llvm.dbg.cu = !{!0}
|
||||
!llvm.module.flags = !{!3, !4, !5}
|
||||
!llvm.ident = !{!6}
|
||||
|
||||
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
|
||||
!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
|
||||
!2 = !{}
|
||||
!3 = !{i32 2, !"Dwarf Version", i32 4}
|
||||
!4 = !{i32 2, !"Debug Info Version", i32 3}
|
||||
!5 = !{i32 1, !"PIC Level", i32 2}
|
||||
!6 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"}
|
||||
!7 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, isOptimized: true, unit: !0, variables: !2)
|
||||
!8 = !DISubroutineType(types: !2)
|
||||
!9 = !DILocation(line: 5, column: 10, scope: !7)
|
Loading…
Reference in New Issue
Block a user