mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-12-01 07:10:37 +00:00
Temporairly revert "[SimplifyCFG][LoopRotate] SimplifyCFG: disable common instruction hoisting by default, enable late in pipeline"
As disscussed in post-commit review starting with https://reviews.llvm.org/D84108#2227365 while this appears to be mostly a win overall, especially code-size-wise, this appears to shake //certain// code pattens in a way that is extremely unfavorable for performance (+30% runtime regression) on certain CPU's (i personally can't reproduce). So until the behaviour is better understood, and a path forward is mapped, let's back this out for now. This reverts commit 1d51dc38d89bd33fb8874e242ab87b265b4dec1c.
This commit is contained in:
parent
33479a624a
commit
29a87631f2
@ -25,7 +25,7 @@ struct SimplifyCFGOptions {
|
||||
bool ForwardSwitchCondToPhi = false;
|
||||
bool ConvertSwitchToLookupTable = false;
|
||||
bool NeedCanonicalLoop = true;
|
||||
bool HoistCommonInsts = false;
|
||||
bool HoistCommonInsts = true;
|
||||
bool SinkCommonInsts = false;
|
||||
bool SimplifyCondBranch = true;
|
||||
bool FoldTwoEntryPHINode = true;
|
||||
|
@ -1147,14 +1147,11 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
|
||||
// convert to more optimized IR using more aggressive simplify CFG options.
|
||||
// The extra sinking transform can create larger basic blocks, so do this
|
||||
// before SLP vectorization.
|
||||
// FIXME: study whether hoisting and/or sinking of common instructions should
|
||||
// be delayed until after SLP vectorizer.
|
||||
OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
|
||||
.forwardSwitchCondToPhi(true)
|
||||
.convertSwitchToLookupTable(true)
|
||||
.needCanonicalLoops(false)
|
||||
.hoistCommonInsts(true)
|
||||
.sinkCommonInsts(true)));
|
||||
OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions().
|
||||
forwardSwitchCondToPhi(true).
|
||||
convertSwitchToLookupTable(true).
|
||||
needCanonicalLoops(false).
|
||||
sinkCommonInsts(true)));
|
||||
|
||||
// Optimize parallel scalar instruction chains into SIMD instructions.
|
||||
if (PTO.SLPVectorization)
|
||||
|
@ -457,7 +457,6 @@ void AArch64PassConfig::addIRPasses() {
|
||||
.forwardSwitchCondToPhi(true)
|
||||
.convertSwitchToLookupTable(true)
|
||||
.needCanonicalLoops(false)
|
||||
.hoistCommonInsts(true)
|
||||
.sinkCommonInsts(true)));
|
||||
|
||||
// Run LoopDataPrefetch
|
||||
|
@ -409,8 +409,7 @@ void ARMPassConfig::addIRPasses() {
|
||||
// ldrex/strex loops to simplify this, but it needs tidying up.
|
||||
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
|
||||
addPass(createCFGSimplificationPass(
|
||||
SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true),
|
||||
[this](const Function &F) {
|
||||
SimplifyCFGOptions().sinkCommonInsts(true), [this](const Function &F) {
|
||||
const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F);
|
||||
return ST.hasAnyDataBarrier() && !ST.isThumb1Only();
|
||||
}));
|
||||
|
@ -329,7 +329,6 @@ void HexagonPassConfig::addIRPasses() {
|
||||
.forwardSwitchCondToPhi(true)
|
||||
.convertSwitchToLookupTable(true)
|
||||
.needCanonicalLoops(false)
|
||||
.hoistCommonInsts(true)
|
||||
.sinkCommonInsts(true)));
|
||||
if (EnableLoopPrefetch)
|
||||
addPass(createLoopDataPrefetchPass());
|
||||
|
@ -784,13 +784,10 @@ void PassManagerBuilder::populateModulePassManager(
|
||||
// convert to more optimized IR using more aggressive simplify CFG options.
|
||||
// The extra sinking transform can create larger basic blocks, so do this
|
||||
// before SLP vectorization.
|
||||
// FIXME: study whether hoisting and/or sinking of common instructions should
|
||||
// be delayed until after SLP vectorizer.
|
||||
MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
|
||||
.forwardSwitchCondToPhi(true)
|
||||
.convertSwitchToLookupTable(true)
|
||||
.needCanonicalLoops(false)
|
||||
.hoistCommonInsts(true)
|
||||
.sinkCommonInsts(true)));
|
||||
|
||||
if (SLPVectorize) {
|
||||
|
@ -63,8 +63,8 @@ static cl::opt<bool> UserForwardSwitchCond(
|
||||
cl::desc("Forward switch condition to phi ops (default = false)"));
|
||||
|
||||
static cl::opt<bool> UserHoistCommonInsts(
|
||||
"hoist-common-insts", cl::Hidden, cl::init(false),
|
||||
cl::desc("hoist common instructions (default = false)"));
|
||||
"hoist-common-insts", cl::Hidden, cl::init(true),
|
||||
cl::desc("hoist common instructions (default = true)"));
|
||||
|
||||
static cl::opt<bool> UserSinkCommonInsts(
|
||||
"sink-common-insts", cl::Hidden, cl::init(false),
|
||||
|
@ -2008,16 +2008,9 @@ define i64 @test_chr_22(i1 %i, i64* %j, i64 %v0) !prof !14 {
|
||||
; CHECK-NEXT: bb0:
|
||||
; CHECK-NEXT: [[REASS_ADD:%.*]] = shl i64 [[V0:%.*]], 1
|
||||
; CHECK-NEXT: [[V2:%.*]] = add i64 [[REASS_ADD]], 3
|
||||
; CHECK-NEXT: [[C1:%.*]] = icmp slt i64 [[V2]], 100
|
||||
; CHECK-NEXT: br i1 [[C1]], label [[BB0_SPLIT:%.*]], label [[BB0_SPLIT_NONCHR:%.*]], !prof !15
|
||||
; CHECK: bb0.split:
|
||||
; CHECK-NEXT: [[V299:%.*]] = mul i64 [[V2]], 7860086430977039991
|
||||
; CHECK-NEXT: store i64 [[V299]], i64* [[J:%.*]], align 4
|
||||
; CHECK-NEXT: ret i64 99
|
||||
; CHECK: bb0.split.nonchr:
|
||||
; CHECK-NEXT: [[V299_NONCHR:%.*]] = mul i64 [[V2]], 7860086430977039991
|
||||
; CHECK-NEXT: store i64 [[V299_NONCHR]], i64* [[J]], align 4
|
||||
; CHECK-NEXT: ret i64 99
|
||||
;
|
||||
bb0:
|
||||
%v1 = add i64 %v0, 3
|
||||
|
@ -5,11 +5,14 @@
|
||||
; RUN: opt -O3 -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK2
|
||||
; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK3
|
||||
|
||||
; RUN: opt -O3 -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_OLDPM,FALLBACK4
|
||||
; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_NEWPM,FALLBACK5
|
||||
; RUN: opt -O3 -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=HOIST,THR2,FALLBACK4
|
||||
; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=HOIST,THR2,FALLBACK5
|
||||
|
||||
; RUN: opt -O3 -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_OLDPM,FALLBACK6
|
||||
; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_NEWPM,FALLBACK7
|
||||
; RUN: opt -O3 -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_OLDPM,FALLBACK6
|
||||
; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_NEWPM,FALLBACK7
|
||||
|
||||
; RUN: opt -O3 -rotation-max-header-size=4 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_OLDPM,FALLBACK8
|
||||
; RUN: opt -passes='default<O3>' -rotation-max-header-size=4 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_NEWPM,FALLBACK9
|
||||
|
||||
; This example is produced from a very basic C code:
|
||||
;
|
||||
@ -58,8 +61,8 @@ define void @_Z4loopi(i32 %width) {
|
||||
; HOIST-NEXT: br label [[FOR_COND:%.*]]
|
||||
; HOIST: for.cond:
|
||||
; HOIST-NEXT: [[I_0:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ]
|
||||
; HOIST-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[TMP0]]
|
||||
; HOIST-NEXT: tail call void @f0()
|
||||
; HOIST-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[TMP0]]
|
||||
; HOIST-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
|
||||
; HOIST: for.cond.cleanup:
|
||||
; HOIST-NEXT: tail call void @f2()
|
||||
@ -77,17 +80,17 @@ define void @_Z4loopi(i32 %width) {
|
||||
; ROTATED_LATER_OLDPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
|
||||
; ROTATED_LATER_OLDPM: for.cond.preheader:
|
||||
; ROTATED_LATER_OLDPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
|
||||
; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
|
||||
; ROTATED_LATER_OLDPM-NEXT: [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0
|
||||
; ROTATED_LATER_OLDPM-NEXT: br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
|
||||
; ROTATED_LATER_OLDPM: for.cond.cleanup:
|
||||
; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
|
||||
; ROTATED_LATER_OLDPM-NEXT: tail call void @f2()
|
||||
; ROTATED_LATER_OLDPM-NEXT: br label [[RETURN]]
|
||||
; ROTATED_LATER_OLDPM: for.body:
|
||||
; ROTATED_LATER_OLDPM-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ]
|
||||
; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
|
||||
; ROTATED_LATER_OLDPM-NEXT: tail call void @f1()
|
||||
; ROTATED_LATER_OLDPM-NEXT: [[INC]] = add nuw i32 [[I_04]], 1
|
||||
; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
|
||||
; ROTATED_LATER_OLDPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
|
||||
; ROTATED_LATER_OLDPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
||||
; ROTATED_LATER_OLDPM: return:
|
||||
@ -99,19 +102,19 @@ define void @_Z4loopi(i32 %width) {
|
||||
; ROTATED_LATER_NEWPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
|
||||
; ROTATED_LATER_NEWPM: for.cond.preheader:
|
||||
; ROTATED_LATER_NEWPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
|
||||
; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
|
||||
; ROTATED_LATER_NEWPM-NEXT: [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0
|
||||
; ROTATED_LATER_NEWPM-NEXT: br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE:%.*]]
|
||||
; ROTATED_LATER_NEWPM: for.cond.preheader.for.body_crit_edge:
|
||||
; ROTATED_LATER_NEWPM-NEXT: [[INC_1:%.*]] = add nuw i32 0, 1
|
||||
; ROTATED_LATER_NEWPM-NEXT: br label [[FOR_BODY:%.*]]
|
||||
; ROTATED_LATER_NEWPM: for.cond.cleanup:
|
||||
; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
|
||||
; ROTATED_LATER_NEWPM-NEXT: tail call void @f2()
|
||||
; ROTATED_LATER_NEWPM-NEXT: br label [[RETURN]]
|
||||
; ROTATED_LATER_NEWPM: for.body:
|
||||
; ROTATED_LATER_NEWPM-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE]] ]
|
||||
; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
|
||||
; ROTATED_LATER_NEWPM-NEXT: tail call void @f1()
|
||||
; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
|
||||
; ROTATED_LATER_NEWPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]]
|
||||
; ROTATED_LATER_NEWPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]]
|
||||
; ROTATED_LATER_NEWPM: for.body.for.body_crit_edge:
|
||||
@ -126,19 +129,19 @@ define void @_Z4loopi(i32 %width) {
|
||||
; ROTATE_OLDPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
|
||||
; ROTATE_OLDPM: for.cond.preheader:
|
||||
; ROTATE_OLDPM-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
|
||||
; ROTATE_OLDPM-NEXT: tail call void @f0()
|
||||
; ROTATE_OLDPM-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
|
||||
; ROTATE_OLDPM: for.body.preheader:
|
||||
; ROTATE_OLDPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
|
||||
; ROTATE_OLDPM-NEXT: br label [[FOR_BODY:%.*]]
|
||||
; ROTATE_OLDPM: for.cond.cleanup:
|
||||
; ROTATE_OLDPM-NEXT: tail call void @f0()
|
||||
; ROTATE_OLDPM-NEXT: tail call void @f2()
|
||||
; ROTATE_OLDPM-NEXT: br label [[RETURN]]
|
||||
; ROTATE_OLDPM: for.body:
|
||||
; ROTATE_OLDPM-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
|
||||
; ROTATE_OLDPM-NEXT: tail call void @f0()
|
||||
; ROTATE_OLDPM-NEXT: tail call void @f1()
|
||||
; ROTATE_OLDPM-NEXT: [[INC]] = add nuw nsw i32 [[I_04]], 1
|
||||
; ROTATE_OLDPM-NEXT: tail call void @f0()
|
||||
; ROTATE_OLDPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
|
||||
; ROTATE_OLDPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
|
||||
; ROTATE_OLDPM: return:
|
||||
@ -150,19 +153,19 @@ define void @_Z4loopi(i32 %width) {
|
||||
; ROTATE_NEWPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
|
||||
; ROTATE_NEWPM: for.cond.preheader:
|
||||
; ROTATE_NEWPM-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
|
||||
; ROTATE_NEWPM-NEXT: tail call void @f0()
|
||||
; ROTATE_NEWPM-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
|
||||
; ROTATE_NEWPM: for.body.preheader:
|
||||
; ROTATE_NEWPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
|
||||
; ROTATE_NEWPM-NEXT: [[INC_1:%.*]] = add nuw nsw i32 0, 1
|
||||
; ROTATE_NEWPM-NEXT: br label [[FOR_BODY:%.*]]
|
||||
; ROTATE_NEWPM: for.cond.cleanup:
|
||||
; ROTATE_NEWPM-NEXT: tail call void @f0()
|
||||
; ROTATE_NEWPM-NEXT: tail call void @f2()
|
||||
; ROTATE_NEWPM-NEXT: br label [[RETURN]]
|
||||
; ROTATE_NEWPM: for.body:
|
||||
; ROTATE_NEWPM-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_BODY_PREHEADER]] ]
|
||||
; ROTATE_NEWPM-NEXT: tail call void @f0()
|
||||
; ROTATE_NEWPM-NEXT: tail call void @f1()
|
||||
; ROTATE_NEWPM-NEXT: tail call void @f0()
|
||||
; ROTATE_NEWPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]]
|
||||
; ROTATE_NEWPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]]
|
||||
; ROTATE_NEWPM: for.body.for.body_crit_edge:
|
||||
|
@ -1,7 +1,7 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt -simplifycfg -hoist-common-insts=1 -S < %s | FileCheck %s --check-prefixes=HOIST
|
||||
; RUN: opt -simplifycfg -hoist-common-insts=0 -S < %s | FileCheck %s --check-prefixes=NOHOIST
|
||||
; RUN: opt -simplifycfg -S < %s | FileCheck %s --check-prefixes=NOHOIST,DEFAULT
|
||||
; RUN: opt -simplifycfg -S < %s | FileCheck %s --check-prefixes=HOIST,DEFAULT
|
||||
|
||||
; This example is produced from a very basic C code:
|
||||
;
|
||||
|
Loading…
Reference in New Issue
Block a user