mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-24 21:25:41 +00:00
Use continuous boosting factor for complete unroll.
Summary: The current loop complete unroll algorithm checks if unrolling complete will reduce the runtime by a certain percentage. If yes, it will apply a fixed boosting factor to the threshold (by discounting cost). The problem for this approach is that the threshold abruptly. This patch makes the boosting factor a function of runtime reduction percentage, capped by a fixed threshold. In this way, the threshold changes continuously. The patch also simplified the code by reducing one parameter in UP. The patch only affects code-gen of two speccpu2006 benchmark: 445.gobmk binary size decreases 0.08%, no performance change. 464.h264ref binary size increases 0.24%, no performance change. Reviewers: mzolotukhin, chandlerc Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D26989 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@290737 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
07ddca829a
commit
ddd3bb716c
@ -243,13 +243,17 @@ public:
|
||||
/// profitable. Set this to UINT_MAX to disable the loop body cost
|
||||
/// restriction.
|
||||
unsigned Threshold;
|
||||
/// If complete unrolling will reduce the cost of the loop below its
|
||||
/// expected dynamic cost while rolled by this percentage, apply a discount
|
||||
/// (below) to its unrolled cost.
|
||||
unsigned PercentDynamicCostSavedThreshold;
|
||||
/// The discount applied to the unrolled cost when the *dynamic* cost
|
||||
/// savings of unrolling exceed the \c PercentDynamicCostSavedThreshold.
|
||||
unsigned DynamicCostSavingsDiscount;
|
||||
/// If complete unrolling will reduce the cost of the loop, we will boost
|
||||
/// the Threshold by a certain percent to allow more aggressive complete
|
||||
/// unrolling. This value provides the maximum boost percentage that we
|
||||
/// can apply to Threshold (The value should be no less than 100).
|
||||
/// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost,
|
||||
/// MaxPercentThresholdBoost / 100)
|
||||
/// E.g. if complete unrolling reduces the loop execution time by 50%
|
||||
/// then we boost the threshold by the factor of 2x. If unrolling is not
|
||||
/// expected to reduce the running time, then we do not increase the
|
||||
/// threshold.
|
||||
unsigned MaxPercentThresholdBoost;
|
||||
/// The cost threshold for the unrolled loop when optimizing for size (set
|
||||
/// to UINT_MAX to disable).
|
||||
unsigned OptSizeThreshold;
|
||||
|
@ -46,16 +46,14 @@ static cl::opt<unsigned>
|
||||
UnrollThreshold("unroll-threshold", cl::Hidden,
|
||||
cl::desc("The baseline cost threshold for loop unrolling"));
|
||||
|
||||
static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold(
|
||||
"unroll-percent-dynamic-cost-saved-threshold", cl::init(50), cl::Hidden,
|
||||
cl::desc("The percentage of estimated dynamic cost which must be saved by "
|
||||
"unrolling to allow unrolling up to the max threshold."));
|
||||
|
||||
static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount(
|
||||
"unroll-dynamic-cost-savings-discount", cl::init(100), cl::Hidden,
|
||||
cl::desc("This is the amount discounted from the total unroll cost when "
|
||||
"the unrolled form has a high dynamic cost savings (triggered by "
|
||||
"the '-unroll-perecent-dynamic-cost-saved-threshold' flag)."));
|
||||
static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
|
||||
"unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
|
||||
cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
|
||||
"to the threshold when aggressively unrolling a loop due to the "
|
||||
"dynamic cost savings. If completely unrolling a loop will reduce "
|
||||
"the total runtime from X to Y, we boost the loop unroll "
|
||||
"threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
|
||||
"X/Y). This limit avoids excessive code bloat."));
|
||||
|
||||
static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
|
||||
"unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
|
||||
@ -127,8 +125,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
|
||||
|
||||
// Set up the defaults
|
||||
UP.Threshold = 150;
|
||||
UP.PercentDynamicCostSavedThreshold = 50;
|
||||
UP.DynamicCostSavingsDiscount = 100;
|
||||
UP.MaxPercentThresholdBoost = 400;
|
||||
UP.OptSizeThreshold = 0;
|
||||
UP.PartialThreshold = UP.Threshold;
|
||||
UP.PartialOptSizeThreshold = 0;
|
||||
@ -160,11 +157,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
|
||||
UP.Threshold = UnrollThreshold;
|
||||
UP.PartialThreshold = UnrollThreshold;
|
||||
}
|
||||
if (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0)
|
||||
UP.PercentDynamicCostSavedThreshold =
|
||||
UnrollPercentDynamicCostSavedThreshold;
|
||||
if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0)
|
||||
UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount;
|
||||
if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
|
||||
UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
|
||||
if (UnrollMaxCount.getNumOccurrences() > 0)
|
||||
UP.MaxCount = UnrollMaxCount;
|
||||
if (UnrollFullMaxCount.getNumOccurrences() > 0)
|
||||
@ -662,56 +656,21 @@ static void SetLoopAlreadyUnrolled(Loop *L) {
|
||||
L->setLoopID(NewLoopID);
|
||||
}
|
||||
|
||||
static bool canUnrollCompletely(Loop *L, unsigned Threshold,
|
||||
unsigned PercentDynamicCostSavedThreshold,
|
||||
unsigned DynamicCostSavingsDiscount,
|
||||
uint64_t UnrolledCost,
|
||||
uint64_t RolledDynamicCost) {
|
||||
if (Threshold == NoThreshold) {
|
||||
DEBUG(dbgs() << " Can fully unroll, because no threshold is set.\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
if (UnrolledCost <= Threshold) {
|
||||
DEBUG(dbgs() << " Can fully unroll, because unrolled cost: "
|
||||
<< UnrolledCost << "<=" << Threshold << "\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
assert(UnrolledCost && "UnrolledCost can't be 0 at this point.");
|
||||
assert(RolledDynamicCost >= UnrolledCost &&
|
||||
"Cannot have a higher unrolled cost than a rolled cost!");
|
||||
|
||||
// Compute the percentage of the dynamic cost in the rolled form that is
|
||||
// saved when unrolled. If unrolling dramatically reduces the estimated
|
||||
// dynamic cost of the loop, we use a higher threshold to allow more
|
||||
// unrolling.
|
||||
unsigned PercentDynamicCostSaved =
|
||||
(uint64_t)(RolledDynamicCost - UnrolledCost) * 100ull / RolledDynamicCost;
|
||||
|
||||
if (PercentDynamicCostSaved >= PercentDynamicCostSavedThreshold &&
|
||||
(int64_t)UnrolledCost - (int64_t)DynamicCostSavingsDiscount <=
|
||||
(int64_t)Threshold) {
|
||||
DEBUG(dbgs() << " Can fully unroll, because unrolling will reduce the "
|
||||
"expected dynamic cost by "
|
||||
<< PercentDynamicCostSaved << "% (threshold: "
|
||||
<< PercentDynamicCostSavedThreshold << "%)\n"
|
||||
<< " and the unrolled cost (" << UnrolledCost
|
||||
<< ") is less than the max threshold ("
|
||||
<< DynamicCostSavingsDiscount << ").\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << " Too large to fully unroll:\n");
|
||||
DEBUG(dbgs() << " Threshold: " << Threshold << "\n");
|
||||
DEBUG(dbgs() << " Max threshold: " << DynamicCostSavingsDiscount << "\n");
|
||||
DEBUG(dbgs() << " Percent cost saved threshold: "
|
||||
<< PercentDynamicCostSavedThreshold << "%\n");
|
||||
DEBUG(dbgs() << " Unrolled cost: " << UnrolledCost << "\n");
|
||||
DEBUG(dbgs() << " Rolled dynamic cost: " << RolledDynamicCost << "\n");
|
||||
DEBUG(dbgs() << " Percent cost saved: " << PercentDynamicCostSaved
|
||||
<< "\n");
|
||||
return false;
|
||||
// Computes the boosting factor for complete unrolling.
|
||||
// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
|
||||
// be beneficial to fully unroll the loop even if unrolledcost is large. We
|
||||
// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
|
||||
// the unroll threshold.
|
||||
static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
|
||||
unsigned MaxPercentThresholdBoost) {
|
||||
if (Cost.RolledDynamicCost >= UINT_MAX / 100)
|
||||
return 100;
|
||||
else if (Cost.UnrolledCost != 0)
|
||||
// The boosting factor is RolledDynamicCost / UnrolledCost
|
||||
return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
|
||||
MaxPercentThresholdBoost);
|
||||
else
|
||||
return MaxPercentThresholdBoost;
|
||||
}
|
||||
|
||||
// Returns loop size estimation for unrolled loop.
|
||||
@ -787,9 +746,7 @@ static bool computeUnrollCount(
|
||||
if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
|
||||
// When computing the unrolled size, note that BEInsns are not replicated
|
||||
// like the rest of the loop body.
|
||||
if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount,
|
||||
getUnrolledLoopSize(LoopSize, UP),
|
||||
getUnrolledLoopSize(LoopSize, UP))) {
|
||||
if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
|
||||
UseUpperBound = (MaxTripCount == FullUnrollTripCount);
|
||||
TripCount = FullUnrollTripCount;
|
||||
TripMultiple = UP.UpperBound ? 1 : TripMultiple;
|
||||
@ -800,11 +757,10 @@ static bool computeUnrollCount(
|
||||
// To check that, run additional analysis on the loop.
|
||||
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
|
||||
L, FullUnrollTripCount, DT, *SE, TTI,
|
||||
UP.Threshold + UP.DynamicCostSavingsDiscount))
|
||||
if (canUnrollCompletely(L, UP.Threshold,
|
||||
UP.PercentDynamicCostSavedThreshold,
|
||||
UP.DynamicCostSavingsDiscount,
|
||||
Cost->UnrolledCost, Cost->RolledDynamicCost)) {
|
||||
UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
|
||||
unsigned Boost =
|
||||
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
|
||||
if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
|
||||
UseUpperBound = (MaxTripCount == FullUnrollTripCount);
|
||||
TripCount = FullUnrollTripCount;
|
||||
TripMultiple = UP.UpperBound ? 1 : TripMultiple;
|
||||
@ -812,6 +768,7 @@ static bool computeUnrollCount(
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4rd priority is partial unrolling.
|
||||
// Try partial unroll only when TripCount could be staticaly calculated.
|
||||
|
@ -1,5 +1,5 @@
|
||||
; Check that we don't crash on corner cases.
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=1 -unroll-percent-dynamic-cost-saved-threshold=20 -o /dev/null
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=1 -unroll-max-percent-threshold-boost=200 -o /dev/null
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
@known_constant = internal unnamed_addr constant [10 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=70 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
@unknown_global = internal unnamed_addr global [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=40 | FileCheck %s
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
@known_constant = internal unnamed_addr constant [10 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=12 -unroll-max-percent-threshold-boost=400 | FileCheck %s
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
@known_constant = internal unnamed_addr constant [10 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0], align 16
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
; When examining gep-instructions we shouldn't consider them simplified if the
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=50 | FileCheck %s
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
define i64 @propagate_loop_phis() {
|
||||
|
@ -17,21 +17,18 @@
|
||||
; optimizations to remove ~55% of the instructions, the loop body size is 9,
|
||||
; and unrolled size is 65.
|
||||
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=20 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s -check-prefix=TEST1
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=20 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s -check-prefix=TEST2
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=80 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s -check-prefix=TEST3
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=100 -unroll-percent-dynamic-cost-saved-threshold=80 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s -check-prefix=TEST4
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST1
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=200 | FileCheck %s -check-prefix=TEST2
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST3
|
||||
|
||||
; If the absolute threshold is too low, or if we can't optimize away requested
|
||||
; percent of instructions, we shouldn't unroll:
|
||||
; If the absolute threshold is too low, we should not unroll:
|
||||
; TEST1: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
|
||||
; TEST3: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
|
||||
|
||||
; Otherwise, we should:
|
||||
; TEST2-NOT: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
|
||||
|
||||
; Also, we should unroll if the 'unroll-threshold' is big enough:
|
||||
; TEST4-NOT: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
|
||||
; If we do not boost threshold, the unroll will not happen:
|
||||
; TEST3: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv
|
||||
|
||||
; And check that we don't crash when we're not allowed to do any analysis.
|
||||
; RUN: opt < %s -loop-unroll -unroll-max-iteration-count-to-analyze=0 -disable-output
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: opt < %s -S -unroll-threshold=20 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-dynamic-cost-savings-discount=0 | FileCheck %s
|
||||
; RUN: opt < %s -S -unroll-threshold=20 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-max-percent-threshold-boost=100 | FileCheck %s
|
||||
|
||||
; The Loop TripCount is 9. However unroll factors 3 or 9 exceed given threshold.
|
||||
; The test checks that we choose a smaller, power-of-two, unroll count and do not give up on unrolling.
|
||||
@ -21,7 +21,7 @@ for.body: ; preds = %for.body, %entry
|
||||
%st = trunc i64 %indvars.iv to i32
|
||||
store i32 %st, i32* %arrayidx2, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 10
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 20
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end: ; preds = %for.body
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-threshold=40 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s
|
||||
; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-threshold=40 -unroll-max-percent-threshold-boost=100 | FileCheck %s
|
||||
|
||||
@known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user