[CodeGen] Track trip counts per-scop for performance measurement.

- Add a counter that is incremented once on exit from a scop.

- Split the test case into two: one to test the cycle counts, and another one
to test the trip counts.

- Sample output:
```name=sample-output.txt
scop function, entry block name, exit block name, total time, trip count
warmup, %entry.split, %polly.merge_new_and_old, 5180, 1
f, %entry.split, %polly.merge_new_and_old, 409944, 500
g, %entry.split, %polly.merge_new_and_old, 1226, 1
```

Differential Revision: https://reviews.llvm.org/D33822

llvm-svn: 304543
This commit is contained in:
Siddharth Bhat 2017-06-02 11:36:52 +00:00
parent 01bf58d6ec
commit 726c28f8c4
5 changed files with 173 additions and 104 deletions

View File

@ -62,6 +62,9 @@ private:
/// The total number of cycles spent in the current scop S.
llvm::Value *CyclesInCurrentScopPtr;
/// The total number of times the current scop S is executed.
llvm::Value *TripCountForCurrentScopPtr;
/// The total number of cycles spent within scops.
llvm::Value *CyclesInScopsPtr;

View File

@ -87,15 +87,18 @@ static std::string GetScopUniqueVarname(const Scop &S) {
std::string EntryString, ExitString;
std::tie(EntryString, ExitString) = S.getEntryExitStr();
Name << "__polly_perf_cycles_in_" << std::string(S.getFunction().getName())
Name << "__polly_perf_in_" << std::string(S.getFunction().getName())
<< "_from__" << EntryString << "__to__" << ExitString;
return Name.str();
}
void PerfMonitor::addScopCounter() {
const std::string varname = GetScopUniqueVarname(S);
TryRegisterGlobal(M, varname.c_str(), Builder.getInt64(0),
TryRegisterGlobal(M, (varname + "_cycles").c_str(), Builder.getInt64(0),
&CyclesInCurrentScopPtr);
TryRegisterGlobal(M, (varname + "_trip_count").c_str(), Builder.getInt64(0),
&TripCountForCurrentScopPtr);
}
void PerfMonitor::addGlobalVariables() {
@ -160,7 +163,7 @@ Function *PerfMonitor::insertFinalReporting() {
RuntimeDebugBuilder::createCPUPrinter(
Builder, "scop function, "
"entry block name, exit block name, total time\n");
"entry block name, exit block name, total time, trip count\n");
ReturnFromFinal = Builder.CreateRetVoid();
return ExitFn;
}
@ -179,13 +182,17 @@ void PerfMonitor::AppendScopReporting() {
Value *CyclesInCurrentScop =
Builder.CreateLoad(this->CyclesInCurrentScopPtr, true);
Value *TripCountForCurrentScop =
Builder.CreateLoad(this->TripCountForCurrentScopPtr, true);
std::string EntryName, ExitName;
std::tie(EntryName, ExitName) = S.getEntryExitStr();
// print in CSV for easy parsing with other tools.
RuntimeDebugBuilder::createCPUPrinter(Builder, S.getFunction().getName(),
", ", EntryName, ", ", ExitName, ", ",
CyclesInCurrentScop, "\n");
RuntimeDebugBuilder::createCPUPrinter(
Builder, S.getFunction().getName(), ", ", EntryName, ", ", ExitName, ", ",
CyclesInCurrentScop, ", ", TripCountForCurrentScop, "\n");
ReturnFromFinal = Builder.CreateRetVoid();
}
@ -288,4 +295,11 @@ void PerfMonitor::insertRegionEnd(Instruction *InsertBefore) {
Value *CyclesInCurrentScop = Builder.CreateLoad(CyclesInCurrentScopPtr, true);
CyclesInCurrentScop = Builder.CreateAdd(CyclesInCurrentScop, CyclesInScop);
Builder.CreateStore(CyclesInCurrentScop, CyclesInCurrentScopPtr, true);
Value *TripCountForCurrentScop =
Builder.CreateLoad(TripCountForCurrentScopPtr, true);
TripCountForCurrentScop =
Builder.CreateAdd(TripCountForCurrentScop, Builder.getInt64(1));
Builder.CreateStore(TripCountForCurrentScop, TripCountForCurrentScopPtr,
true);
}

View File

@ -0,0 +1,75 @@
; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s
; void f(long A[], long N) {
; long i;
; if (true)
; for (i = 0; i < N; ++i)
; A[i] = i;
; }
; void g(long A[], long N) {
; long i;
; if (true)
; for (i = 0; i < N; ++i)
; A[i] = i;
; }
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
define void @f(i64* %A, i64 %N) nounwind {
entry:
fence seq_cst
br label %next
next:
br i1 true, label %for.i, label %return
for.i:
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
%scevgep = getelementptr i64, i64* %A, i64 %indvar
store i64 %indvar, i64* %scevgep
%indvar.next = add nsw i64 %indvar, 1
%exitcond = icmp eq i64 %indvar.next, %N
br i1 %exitcond, label %return, label %for.i
return:
fence seq_cst
ret void
}
define void @g(i64* %A, i64 %N) nounwind {
entry:
fence seq_cst
br label %next
next:
br i1 true, label %for.i, label %return
for.i:
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
%scevgep = getelementptr i64, i64* %A, i64 %indvar
store i64 %indvar, i64* %scevgep
%indvar.next = add nsw i64 %indvar, 1
%exitcond = icmp eq i64 %indvar.next, %N
br i1 %exitcond, label %return, label %for.i
return:
fence seq_cst
ret void
}
; Declaration of globals - Check for cycles declaration.
; @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_cycles" = weak thread_local(initialexec) constant i64 0
; @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_cycles" = weak thread_local(initialexec) constant i64 0
; Bumping up number of cycles in f
; CHECK: %10 = load volatile i64, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_cycles"
; CHECK-NEXT: %11 = add i64 %10, %7
; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_cycles"
; Bumping up number of cycles in g
; CHECK: %10 = load volatile i64, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_cycles"
; CHECK-NEXT: %11 = add i64 %10, %7
; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_cycles"

View File

@ -1,98 +0,0 @@
; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s
; void f(long A[], long N) {
; long i;
; if (true)
; for (i = 0; i < N; ++i)
; A[i] = i;
; }
; void g(long A[], long N) {
; long i;
; if (true)
; for (i = 0; i < N; ++i)
; A[i] = i;
; }
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
define void @f(i64* %A, i64 %N) nounwind {
entry:
fence seq_cst
br label %next
next:
br i1 true, label %for.i, label %return
for.i:
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
%scevgep = getelementptr i64, i64* %A, i64 %indvar
store i64 %indvar, i64* %scevgep
%indvar.next = add nsw i64 %indvar, 1
%exitcond = icmp eq i64 %indvar.next, %N
br i1 %exitcond, label %return, label %for.i
return:
fence seq_cst
ret void
}
define void @g(i64* %A, i64 %N) nounwind {
entry:
fence seq_cst
br label %next
next:
br i1 true, label %for.i, label %return
for.i:
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
%scevgep = getelementptr i64, i64* %A, i64 %indvar
store i64 %indvar, i64* %scevgep
%indvar.next = add nsw i64 %indvar, 1
%exitcond = icmp eq i64 %indvar.next, %N
br i1 %exitcond, label %return, label %for.i
return:
fence seq_cst
ret void
}
; Declaration of globals
; CHECK: @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old" = weak thread_local(initialexec) constant i64 0
; CHECK: @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old" = weak thread_local(initialexec) constant i64 0
; Bumping up counter in f
; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting
; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start
; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
; CHECK-NEXT: %7 = sub i64 %6, %5
; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %9 = add i64 %8, %7
; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %10 = load volatile i64, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: %11 = add i64 %10, %7
; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: br label %return
; Bumping up counter in g
; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting
; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start
; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
; CHECK-NEXT: %7 = sub i64 %6, %5
; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %9 = add i64 %8, %7
; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %10 = load volatile i64, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: %11 = add i64 %10, %7
; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: br label %return
; Final reporting prints
; CHECK: %20 = load volatile i64, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: %21 = call i32 (...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @25, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @18, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([3 x i8], [3 x i8] addrspace(4)* @19, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @20, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([3 x i8], [3 x i8] addrspace(4)* @21, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([25 x i8], [25 x i8] addrspace(4)* @22, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([3 x i8], [3 x i8] addrspace(4)* @23, i32 0, i32 0), i64 %20, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @24, i32 0, i32 0))
; CHECK-NEXT: %22 = call i32 @fflush(i8* null)
; CHECK-NEXT: %23 = load volatile i64, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: %24 = call i32 (...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @33, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @26, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([3 x i8], [3 x i8] addrspace(4)* @27, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @28, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([3 x i8], [3 x i8] addrspace(4)* @29, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([25 x i8], [25 x i8] addrspace(4)* @30, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([3 x i8], [3 x i8] addrspace(4)* @31, i32 0, i32 0), i64 %23, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @32, i32 0, i32 0))

View File

@ -0,0 +1,75 @@
; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s
; void f(long A[], long N) {
; long i;
; if (true)
; for (i = 0; i < N; ++i)
; A[i] = i;
; }
; void g(long A[], long N) {
; long i;
; if (true)
; for (i = 0; i < N; ++i)
; A[i] = i;
; }
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
define void @f(i64* %A, i64 %N) nounwind {
entry:
fence seq_cst
br label %next
next:
br i1 true, label %for.i, label %return
for.i:
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
%scevgep = getelementptr i64, i64* %A, i64 %indvar
store i64 %indvar, i64* %scevgep
%indvar.next = add nsw i64 %indvar, 1
%exitcond = icmp eq i64 %indvar.next, %N
br i1 %exitcond, label %return, label %for.i
return:
fence seq_cst
ret void
}
define void @g(i64* %A, i64 %N) nounwind {
entry:
fence seq_cst
br label %next
next:
br i1 true, label %for.i, label %return
for.i:
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
%scevgep = getelementptr i64, i64* %A, i64 %indvar
store i64 %indvar, i64* %scevgep
%indvar.next = add nsw i64 %indvar, 1
%exitcond = icmp eq i64 %indvar.next, %N
br i1 %exitcond, label %return, label %for.i
return:
fence seq_cst
ret void
}
; Declaration of globals - Check for trip count declaration.
; CHECK: @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_trip_count" = weak thread_local(initialexec) constant i64 0
; CHECK: @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_trip_count" = weak thread_local(initialexec) constant i64 0
; Bumping up the trip count in f
; CHECK: %12 = load volatile i64, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_trip_count"
; CHECK-NEXT: %13 = add i64 %12, 1
; CHECK-NEXT: store volatile i64 %13, i64* @"__polly_perf_in_f_from__%next__to__%polly.merge_new_and_old_trip_count"
; Bumping up the trip count in g
; CHECK: %12 = load volatile i64, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_trip_count"
; CHECK-NEXT: %13 = add i64 %12, 1
; CHECK-NEXT: store volatile i64 %13, i64* @"__polly_perf_in_g_from__%next__to__%polly.merge_new_and_old_trip_count"