Explicitly annotate loops we want to run thread-parallel

We introduce a new flag -polly-parallel and use it to annotate the for-nodes in
the isl ast that we want to execute thread parallel (e.g., using OpenMP). We
previously already emitted OpenMP annotations, but we did this for various
kinds of parallel loops, including some which we cannot run in parallel.

With this patch we now have three annotations:

  1) #pragma known-parallel [reduction]
  2) #pragma omp for
  3) #pragma simd

meaning:

  1) loop has no loop carried dependences
  2) loop will be executed thread-parallel
  3) loop can possibly be vectorized

This patch introduces 1) and reduces the use of 2) to only the cases where we
will actually generate thread parallel code.

It is in preparation of openmp code generation in our isl backend.

Legacy:

- We also have a command line option -enable-polly-openmp. This option controls
  the OpenMP code generation in CLooG. It will become an alias of
  -polly-parallel after the CLooG code generation has been dropped.

http://reviews.llvm.org/D6142

llvm-svn: 221479
This commit is contained in:
Tobias Grosser 2014-11-06 19:35:21 +00:00
parent babee83257
commit 8b5344fda2
26 changed files with 51 additions and 35 deletions

View File

@ -126,6 +126,9 @@ public:
/// @brief Is this loop a reduction parallel loop?
static bool isReductionParallel(__isl_keep isl_ast_node *Node);
/// @brief Will the loop be run as thread parallel?
static bool isExecutedInParallel(__isl_keep isl_ast_node *Node);
/// @brief Get the nodes schedule or a nullptr if not available.
static __isl_give isl_union_map *getSchedule(__isl_keep isl_ast_node *Node);

View File

@ -42,6 +42,11 @@ using namespace polly;
using IslAstUserPayload = IslAstInfo::IslAstUserPayload;
static cl::opt<bool>
PollyParallel("polly-parallel",
cl::desc("Generate thread parallel code (isl codegen only)"),
cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
static cl::opt<bool> UseContext("polly-ast-use-context",
cl::desc("Use context"), cl::Hidden,
cl::init(false), cl::ZeroOrMore,
@ -148,6 +153,7 @@ static isl_printer *cbPrintFor(__isl_take isl_printer *Printer,
isl_pw_aff *DD = IslAstInfo::getMinimalDependenceDistance(Node);
const std::string BrokenReductionsStr = getBrokenReductionsStr(Node);
const std::string KnownParallelStr = "#pragma known-parallel";
const std::string DepDisPragmaStr = "#pragma minimal dependence distance: ";
const std::string SimdPragmaStr = "#pragma simd";
const std::string OmpPragmaStr = "#pragma omp parallel for";
@ -158,8 +164,10 @@ static isl_printer *cbPrintFor(__isl_take isl_printer *Printer,
if (IslAstInfo::isInnermostParallel(Node))
Printer = printLine(Printer, SimdPragmaStr + BrokenReductionsStr);
if (IslAstInfo::isOutermostParallel(Node))
Printer = printLine(Printer, OmpPragmaStr + BrokenReductionsStr);
if (IslAstInfo::isExecutedInParallel(Node))
Printer = printLine(Printer, OmpPragmaStr);
else if (IslAstInfo::isOutermostParallel(Node))
Printer = printLine(Printer, KnownParallelStr + BrokenReductionsStr);
isl_pw_aff_free(DD);
return isl_ast_node_for_print(Node, Printer, Options);
@ -357,7 +365,8 @@ IslAst::IslAst(Scop *Scop, Dependences &D) : S(Scop) {
isl_union_map *Schedule =
isl_union_map_intersect_domain(S->getSchedule(), S->getDomains());
if (DetectParallel || PollyVectorizerChoice != VECTORIZER_NONE) {
if (PollyParallel || DetectParallel ||
PollyVectorizerChoice != VECTORIZER_NONE) {
BuildInfo.Deps = &D;
BuildInfo.InParallelFor = 0;
@ -444,6 +453,10 @@ bool IslAstInfo::isReductionParallel(__isl_keep isl_ast_node *Node) {
return Payload && Payload->IsReductionParallel;
}
/// A loop is executed thread-parallel only when the user asked for parallel
/// code generation (-polly-parallel), the loop is an outermost parallel loop,
/// and it is not merely reduction-parallel (reduction loops get the weaker
/// "#pragma known-parallel reduction" annotation instead).
bool IslAstInfo::isExecutedInParallel(__isl_keep isl_ast_node *Node) {
return PollyParallel && isOutermostParallel(Node) && !isReductionParallel(Node);
}
isl_union_map *IslAstInfo::getSchedule(__isl_keep isl_ast_node *Node) {
IslAstUserPayload *Payload = getNodePayload(Node);
return Payload ? isl_ast_build_get_schedule(Payload->Build) : nullptr;

View File

@ -1,4 +1,4 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
; RUN: opt %loadPolly -polly-ast -polly-parallel -analyze < %s | FileCheck %s
;
; void jd(int *A) {
; CHECK: #pragma omp parallel for

View File

@ -1,4 +1,4 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
; RUN: opt %loadPolly -polly-ast -polly-parallel -analyze < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

View File

@ -1,4 +1,4 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze -polly-delinearize < %s | FileCheck %s
; RUN: opt %loadPolly -polly-ast -polly-parallel -analyze -polly-delinearize < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"
; int A[1024][1024];

View File

@ -1,4 +1,4 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
; RUN: opt %loadPolly -polly-ast -polly-parallel -analyze < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

View File

@ -1,4 +1,4 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
; RUN: opt %loadPolly -polly-ast -polly-parallel -analyze < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

View File

@ -1,4 +1,4 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
; RUN: opt %loadPolly -polly-ast -polly-parallel -analyze < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

View File

@ -1,4 +1,4 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
; RUN: opt %loadPolly -polly-ast -polly-parallel -analyze < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

View File

@ -1,4 +1,4 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -polly-dependences-computeout=1 -analyze < %s | FileCheck %s
; RUN: opt %loadPolly -polly-ast -polly-parallel -polly-dependences-computeout=1 -analyze < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"

View File

@ -1,6 +1,6 @@
; RUN: opt %loadPolly -polly-delinearize -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
;
; CHECK: #pragma omp parallel for reduction (^ : sum)
; CHECK: #pragma known-parallel reduction (^ : sum)
; void f(int N, int M, int P, int sum[P][M]) {
; for (int i = 0; i < N; i++)
; for (int j = 0; j < P; j++)

View File

@ -1,6 +1,6 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
;
; CHECK: #pragma omp parallel for reduction (^ : sum)
; CHECK: #pragma known-parallel reduction (^ : sum)
; void f(int N, int M, int *sum) {
; for (int i = 0; i < N; i++)
; CHECK: #pragma simd

View File

@ -1,7 +1,7 @@
; RUN: opt %loadPolly -basicaa -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
;
; CHECK: #pragma simd reduction (+ : sum{{[1,2]}}, sum{{[1,2]}}) reduction (* : prod) reduction (| : or) reduction (& : and)
; CHECK: #pragma omp parallel for reduction (+ : sum{{[1,2]}}, sum{{[1,2]}}) reduction (* : prod) reduction (| : or) reduction (& : and)
; CHECK: #pragma known-parallel reduction (+ : sum{{[1,2]}}, sum{{[1,2]}}) reduction (* : prod) reduction (| : or) reduction (& : and)
; CHECK: for (int c1 = 0; c1 < N; c1 += 1)
; CHECK: Stmt_for_body(c1);
;

View File

@ -2,7 +2,7 @@
;
; Verify that we won't privatize anything in the outer dimension
;
; CHECK: #pragma omp parallel for
; CHECK: #pragma known-parallel
; CHECK: for (int c1 = 0; c1 < 2 * n; c1 += 1)
; CHECK: #pragma simd reduction
; CHECK: for (int c3 = 0; c3 <= 1023; c3 += 1)

View File

@ -1,6 +1,6 @@
; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
;
; CHECK: #pragma omp parallel for reduction
; CHECK: #pragma known-parallel reduction
; CHECK: for (int c0 = 0; c0 <= 2; c0 += 1) {
; CHECK: if (c0 == 2) {
; CHECK: #pragma simd reduction

View File

@ -1,6 +1,6 @@
; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
;
; CHECK: #pragma omp parallel for
; CHECK: #pragma known-parallel
; CHECK: for (int c0 = 0; c0 <= 1; c0 += 1) {
; CHECK: if (c0 == 1) {
; CHECK: for (int c1 = 1; c1 < 2 * n; c1 += 2)

View File

@ -2,8 +2,8 @@
;
; Verify that the outer dimension doesn't carry reduction dependences
;
; CHECK-NOT:#pragma omp parallel for reduction
; CHECK: #pragma omp parallel for
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
; CHECK: for (int c1 = 0; c1 < 2 * n; c1 += 1) {
; CHECK: if (c1 % 2 == 0) {
; CHECK: #pragma simd reduction

View File

@ -2,8 +2,8 @@
;
; Verify that the outer dimension doesn't carry reduction dependences
;
; CHECK-NOT:#pragma omp parallel for reduction
; CHECK: #pragma omp parallel for
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
; CHECK: for (int c1 = 0; c1 < 2 * n; c1 += 1)
; CHECK: #pragma simd reduction
; CHECK: for (int c3 = 0; c3 <= 1023; c3 += 1) {

View File

@ -2,8 +2,8 @@
;
; Verify that the outer dimension doesn't carry reduction dependences
;
; CHECK-NOT:#pragma omp parallel for reduction
; CHECK: #pragma omp parallel for
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
; CHECK: for (int c1 = 0; c1 < 2 * n; c1 += 1)
; CHECK: #pragma simd reduction
; CHECK: for (int c3 = -1023; c3 <= 1023; c3 += 1) {

View File

@ -2,7 +2,7 @@
;
; Verify that only the outer dimension needs privatization
;
; CHECK: #pragma omp parallel for reduction
; CHECK: #pragma known-parallel reduction
; CHECK: for (int c1 = 0; c1 <= 1023; c1 += 1) {
; CHECK: if (c1 % 2 == 0) {
; CHECK-NOT: #pragma simd reduction

View File

@ -1,7 +1,7 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
;
; CHECK-NOT:#pragma omp parallel for reduction
; CHECK: #pragma omp parallel for
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
; CHECK: for (int c1 = 0; c1 <= 2047; c1 += 1)
; CHECK: for (int c3 = 0; c3 <= 1023; c3 += 1)
; CHECK: #pragma simd reduction

View File

@ -1,7 +1,7 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
;
; CHECK-NOT:#pragma omp parallel for reduction
; CHECK: #pragma omp parallel for
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
; CHECK: for (int c1 = 0; c1 <= 2047; c1 += 1)
; CHECK: for (int c3 = 0; c3 <= 1023; c3 += 1)
; CHECK: #pragma simd reduction

View File

@ -1,7 +1,7 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
;
; CHECK-NOT:#pragma omp parallel for reduction
; CHECK: #pragma omp parallel for
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
; CHECK: for (int c1 = 0; c1 <= 2047; c1 += 1)
; CHECK: for (int c3 = 0; c3 <= 1023; c3 += 1)
; CHECK: #pragma simd reduction

View File

@ -1,7 +1,7 @@
; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
;
; CHECK-NOT:#pragma omp parallel for reduction
; CHECK: #pragma omp parallel for
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
; CHECK: for (int c1 = 0; c1 <= 2047; c1 += 1)
; CHECK: for (int c3 = 0; c3 <= 1023; c3 += 1)
; CHECK: #pragma simd reduction

View File

@ -32,7 +32,7 @@ for.end: ; preds = %for.cond
; CHECK: for (int c1 = 0; c1 <= 1023; c1 += 1)
; CHECK: Stmt_for_body(c1);
; CHECK-VECTOR: #pragma omp parallel for
; CHECK-VECTOR: #pragma known-parallel
; CHECK-VECTOR: for (int c0 = 0; c0 <= 1023; c0 += 4)
; CHECK-VECTOR: #pragma simd
; CHECK-VECTOR: for (int c1 = c0; c1 <= c0 + 3; c1 += 1)

View File

@ -54,7 +54,7 @@ for.end30: ; preds = %for.inc28
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
; CHECK: #pragma omp parallel for
; CHECK: #pragma known-parallel
; CHECK: for (int c1 = 0; c1 <= 1535; c1 += 32)
; CHECK: for (int c2 = 0; c2 <= 1535; c2 += 32)
; CHECK: for (int c3 = c1; c3 <= c1 + 31; c3 += 1)
@ -62,7 +62,7 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
; CHECK: #pragma simd
; CHECK: for (int c5 = c4; c5 <= c4 + 3; c5 += 1)
; CHECK: Stmt_for_body3(c3, c5);
; CHECK: #pragma omp parallel for
; CHECK: #pragma known-parallel
; CHECK: for (int c1 = 0; c1 <= 1535; c1 += 32)
; CHECK: for (int c2 = 0; c2 <= 1535; c2 += 32)
; CHECK: for (int c3 = 0; c3 <= 1535; c3 += 32)