[PowerPC] Enable late partial unrolling on the POWER7

The P7 benefits from not having really small loops, so that we have
multiple dispatch groups in the loop and/or the ability to form fuller
dispatch groups during scheduling. Setting the partial unrolling threshold to
44 seems good, empirically, for the P7. Compared to using no late partial
unrolling, this yields the following test-suite speedups:

SingleSource/Benchmarks/Adobe-C++/simple_types_constant_folding
	-66.3253% +/- 24.1975%
SingleSource/Benchmarks/Misc-C++/oopack_v1p8
	-44.0169% +/- 29.4881%
SingleSource/Benchmarks/Misc/pi
	-27.8351% +/- 12.2712%
SingleSource/Benchmarks/Stanford/Bubblesort
	-30.9898% +/- 22.4647%

I've speculatively added a similar setting for the P8. Also, I've noticed that
the unroller does not quite calculate the unrolling factor correctly for really
tiny loops because it neglects to account for the fact that not every loop body
replicant contains an ending branch and counter increment. I'll fix that later.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225522 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Hal Finkel 2015-01-09 15:51:16 +00:00
parent bdab504afd
commit 139bfee84c
4 changed files with 59 additions and 0 deletions

View File

@ -380,6 +380,9 @@ def P7Model : SchedMachineModel {
// Itineraries are queried instead.
let MispredictPenalty = 16;
// Try to make sure we have at least 10-11 dispatch groups in a loop.
let LoopMicroOpBufferSize = 44;
let Itineraries = P7Itineraries;
}

View File

@ -389,6 +389,9 @@ def P8Model : SchedMachineModel {
// Itineraries are queried instead.
let MispredictPenalty = 16;
// Try to make sure we have at least 10-11 dispatch groups in a loop.
let LoopMicroOpBufferSize = 66;
let Itineraries = P8Itineraries;
}

View File

@ -277,6 +277,8 @@ void PPCTTI::getUnrollingPreferences(const Function *F, Loop *L,
// helps expose latency-hiding opportunities to the instruction scheduler.
UP.Partial = UP.Runtime = true;
}
TargetTransformInfo::getUnrollingPreferences(F, L, UP);
}
unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {

View File

@ -0,0 +1,51 @@
; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -loop-unroll | FileCheck %s
; Counted loop whose body runs 1024 times (%iv starts at 0 and the loop exits
; once %inc reaches 1024), in a function carrying the optsize attribute.
; The FileCheck directives that follow this function expect eight consecutive
; add instructions ahead of the icmp in the -loop-unroll output for pwr7.
define void @unroll_opt_for_size() nounwind optsize {
entry:
br label %loop
loop:
; Induction variable: 0 when arriving from %entry, %inc on the back edge.
%iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
%inc = add i32 %iv, 1
; Unsigned comparison of the incremented counter against the constant bound.
%exitcnd = icmp uge i32 %inc, 1024
br i1 %exitcnd, label %exit, label %loop
exit:
ret void
}
; CHECK-LABEL: @unroll_opt_for_size
; CHECK: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK: icmp
; Counted loop whose body runs 1024 times (%iv starts at 0 and the loop exits
; once %inc reaches 1024), in a function with only the nounwind attribute
; (no optsize). The FileCheck directives that follow this function expect
; eight consecutive add instructions ahead of the icmp in the output.
define void @unroll_default() nounwind {
entry:
br label %loop
loop:
; Induction variable: 0 when arriving from %entry, %inc on the back edge.
%iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
%inc = add i32 %iv, 1
; Unsigned comparison of the incremented counter against the constant bound.
%exitcnd = icmp uge i32 %inc, 1024
br i1 %exitcnd, label %exit, label %loop
exit:
ret void
}
; CHECK-LABEL: @unroll_default
; CHECK: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK-NEXT: add
; CHECK: icmp