llvm-capstone/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll

; RUN: opt -S %loadPolly -polly-pattern-matching-based-opts=false \
; RUN: -polly-vectorizer=stripmine -polly-opt-isl -polly-ast -analyze \
; RUN: < %s | FileCheck %s
; CHECK:          // 1st level tiling - Tiles
; CHECK-NEXT:    #pragma known-parallel
; CHECK-NEXT:    for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
; CHECK-NEXT:      for (int c1 = 0; c1 <= floord(nj - 1, 32); c1 += 1)
; CHECK-NEXT:        #pragma minimal dependence distance: 1
; CHECK-NEXT:        for (int c2 = 0; c2 <= floord(nk - 1, 32); c2 += 1) {
; CHECK-NEXT:          // 1st level tiling - Points
; CHECK-NEXT:          for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) {
; CHECK-NEXT:            for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1)
; CHECK-NEXT:              #pragma minimal dependence distance: 1
; CHECK-NEXT:              for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) {
; CHECK-NEXT:                // SIMD
; CHECK-NEXT:                #pragma simd
; CHECK-NEXT:                for (int c6 = 0; c6 <= 3; c6 += 1)
; CHECK-NEXT:                  Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
; CHECK-NEXT:              }
; CHECK-NEXT:            if (32 * c1 + 31 >= nj)
; CHECK-NEXT:              #pragma minimal dependence distance: 1
; CHECK-NEXT:              for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) {
; CHECK-NEXT:                // SIMD
; CHECK-NEXT:                #pragma simd
; CHECK-NEXT:                for (int c6 = 0; c6 < nj % 4; c6 += 1)
; CHECK-NEXT:                  Stmt_for_body_6(32 * c0 + c3, -(nj % 4) + nj + c6, 32 * c2 + c5);
; CHECK-NEXT:              }
; CHECK-NEXT:          }
; CHECK-NEXT:        }

; Function Attrs: nounwind uwtable
define void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) #0 {
entry:
  %cmp.27 = icmp sgt i32 %ni, 0
  br i1 %cmp.27, label %for.cond.1.preheader.lr.ph, label %for.end.22

for.cond.1.preheader.lr.ph:                       ; preds = %entry
  br label %for.cond.1.preheader

for.cond.1.preheader:                             ; preds = %for.cond.1.preheader.lr.ph, %for.inc.20
  %indvars.iv33 = phi i64 [ 0, %for.cond.1.preheader.lr.ph ], [ %indvars.iv.next34, %for.inc.20 ]
  %cmp2.25 = icmp sgt i32 %nj, 0
  br i1 %cmp2.25, label %for.cond.4.preheader.lr.ph, label %for.inc.20

for.cond.4.preheader.lr.ph:                       ; preds = %for.cond.1.preheader
  br label %for.cond.4.preheader

for.cond.4.preheader:                             ; preds = %for.cond.4.preheader.lr.ph, %for.inc.17
  %indvars.iv29 = phi i64 [ 0, %for.cond.4.preheader.lr.ph ], [ %indvars.iv.next30, %for.inc.17 ]
  %cmp5.23 = icmp sgt i32 %nk, 0
  br i1 %cmp5.23, label %for.body.6.lr.ph, label %for.inc.17

for.body.6.lr.ph:                                 ; preds = %for.cond.4.preheader
  br label %for.body.6

for.body.6:                                       ; preds = %for.body.6.lr.ph, %for.body.6
  %indvars.iv = phi i64 [ 0, %for.body.6.lr.ph ], [ %indvars.iv.next, %for.body.6 ]
  %arrayidx8 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i64 %indvars.iv33, i64 %indvars.iv
  %0 = load double, double* %arrayidx8, align 8
  %arrayidx12 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv29
  %1 = load double, double* %arrayidx12, align 8
  %mul = fmul double %0, %1
  %arrayidx16 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv33, i64 %indvars.iv29
  %2 = load double, double* %arrayidx16, align 8
  %add = fadd double %2, %mul
  store double %add, double* %arrayidx16, align 8
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, %nk
  br i1 %exitcond, label %for.body.6, label %for.cond.4.for.inc.17_crit_edge

for.cond.4.for.inc.17_crit_edge:                  ; preds = %for.body.6
  br label %for.inc.17

for.inc.17:                                       ; preds = %for.cond.4.for.inc.17_crit_edge, %for.cond.4.preheader
  %indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1
  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
  %exitcond32 = icmp ne i32 %lftr.wideiv31, %nj
  br i1 %exitcond32, label %for.cond.4.preheader, label %for.cond.1.for.inc.20_crit_edge

for.cond.1.for.inc.20_crit_edge:                  ; preds = %for.inc.17
  br label %for.inc.20

for.inc.20:                                       ; preds = %for.cond.1.for.inc.20_crit_edge, %for.cond.1.preheader
  %indvars.iv.next34 = add nuw nsw i64 %indvars.iv33, 1
  %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
  %exitcond36 = icmp ne i32 %lftr.wideiv35, %ni
  br i1 %exitcond36, label %for.cond.1.preheader, label %for.cond.for.end.22_crit_edge

for.cond.for.end.22_crit_edge:                    ; preds = %for.inc.20
  br label %for.end.22

for.end.22:                                       ; preds = %for.cond.for.end.22_crit_edge, %entry
  ret void
}
Make optimizations based on pattern matching be enabled by default Currently, pattern based optimizations of Polly can identify matrix multiplication and optimize it according to BLIS matmul optimization pattern (see ScheduleTreeOptimizer for details). This patch makes optimizations based on pattern matching be enabled by default. Reviewed-by: Tobias Grosser <tobias@grosser.es> Differential Revision: https://reviews.llvm.org/D30293 llvm-svn: 295958 2017-02-23 11:44:12 +00:00			`; RUN: opt -S %loadPolly -polly-pattern-matching-based-opts=false \`
			`; RUN: -polly-vectorizer=stripmine -polly-opt-isl -polly-ast -analyze \`
			`; RUN: < %s \| FileCheck %s`
Prepare unit tests for update to ISL 0.16 ISL 0.16 will change how sets are printed which breaks 117 unit tests that text-compare printed sets. This patch re-formats most of these unit tests using a script and small manual editing on top of that. When actually updating ISL, most work is done by just re-running the script to adapt to the changed output. Some tests that compare IR and tests with single CHECK-lines that can be easily updated manually are not included here. The re-format script will also be committed afterwards. The per-test formatter invocation command lines options will not be added in the near future because it is ad hoc and would overwrite the manual edits. Ideally it also shouldn't be required anymore because ISL's set printing has become more stable in 0.16. Differential Revision: http://reviews.llvm.org/D16095 llvm-svn: 257851 2016-01-15 00:48:42 +00:00			`; CHECK: // 1st level tiling - Tiles`
Annotation of SIMD loops Use 'mark' nodes annotate a SIMD loop during ScheduleTransformation and skip parallelism checks. The buildbot shows the following compile/execution time changes: Compile time: Improvements Δ Previous Current σ …/gesummv -6.06% 0.2640 0.2480 0.0055 …/gemver -4.46% 0.4480 0.4280 0.0044 …/covariance -4.31% 0.8360 0.8000 0.0065 …/adi -3.23% 0.9920 0.9600 0.0065 …/doitgen -2.53% 0.9480 0.9240 0.0090 …/3mm -2.33% 1.0320 1.0080 0.0087 Execution time: Regressions Δ Previous Current σ …/viterbi 1.70% 5.1840 5.2720 0.0074 …/smallpt 1.06% 12.4920 12.6240 0.0040 Reviewed-by: Tobias Grosser <tobias@grosser.es> Differential Revision: http://reviews.llvm.org/D14491 llvm-svn: 261620 2016-02-23 09:00:13 +00:00			`; CHECK-NEXT: #pragma known-parallel`
			`; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)`
			`; CHECK-NEXT: for (int c1 = 0; c1 <= floord(nj - 1, 32); c1 += 1)`
[IslAst] Fix InParallelFor nesting. IslAst could mark two nested outer loops as "OutermostParallel". It caused that the code generator tried to OpenMP-parallelize both loops, which it is not prepared loop. It was because the recursive AST build algorithm managed a flag "InParallelFor" to ensure that no nested loop is also marked as "OutermostParallel". Unfortunatetly the same flag was used by nodes marked as SIMD, and reset to false after the SIMD node. Since loops can be marked as SIMD inside "OutermostParallel" loops, the recursive algorithm again tried to mark loops as "OutermostParellel" although still nested inside another "OutermostParallel" loop. The fix exposed another bug: The function "astScheduleDimIsParallel" was only called when a loop was potentially "OutermostParallel" or "InnermostParallel", but as a side-effect also determines the minimum dependence distance. Hence, changing when we need to know whether a loop is "OutermostParallel" also changed which loop was annotated with "#pragma minimal dependence distance". Moreover, some complex condition linked with "InParallelFor" determined whether a loop should be an "InnermostParallel" loop. It missed some situations where it would not use mark as such although being inside an SIMD mark node, and therefore not be annotated using "#pragma simd". The changes in particular: 1. Split the "InParallelFor" flag into an "InParallelFor" and an "InSIMD" flag. 2. Unconditionally call "astScheduleDimIsParallel" for its side-effects and store the result in "InParallel" for later use. 3. Simplify the condition when a loop is "InnermostParallel". Fixes llvm.org/PR33153 and llvm.org/PR38073. llvm-svn: 343212 2018-09-27 13:39:37 +00:00			`; CHECK-NEXT: #pragma minimal dependence distance: 1`
Annotation of SIMD loops Use 'mark' nodes annotate a SIMD loop during ScheduleTransformation and skip parallelism checks. The buildbot shows the following compile/execution time changes: Compile time: Improvements Δ Previous Current σ …/gesummv -6.06% 0.2640 0.2480 0.0055 …/gemver -4.46% 0.4480 0.4280 0.0044 …/covariance -4.31% 0.8360 0.8000 0.0065 …/adi -3.23% 0.9920 0.9600 0.0065 …/doitgen -2.53% 0.9480 0.9240 0.0090 …/3mm -2.33% 1.0320 1.0080 0.0087 Execution time: Regressions Δ Previous Current σ …/viterbi 1.70% 5.1840 5.2720 0.0074 …/smallpt 1.06% 12.4920 12.6240 0.0040 Reviewed-by: Tobias Grosser <tobias@grosser.es> Differential Revision: http://reviews.llvm.org/D14491 llvm-svn: 261620 2016-02-23 09:00:13 +00:00			`; CHECK-NEXT: for (int c2 = 0; c2 <= floord(nk - 1, 32); c2 += 1) {`
			`; CHECK-NEXT: // 1st level tiling - Points`
			`; CHECK-NEXT: for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) {`
			`; CHECK-NEXT: for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1)`
[IslAst] Fix InParallelFor nesting. IslAst could mark two nested outer loops as "OutermostParallel". It caused that the code generator tried to OpenMP-parallelize both loops, which it is not prepared loop. It was because the recursive AST build algorithm managed a flag "InParallelFor" to ensure that no nested loop is also marked as "OutermostParallel". Unfortunatetly the same flag was used by nodes marked as SIMD, and reset to false after the SIMD node. Since loops can be marked as SIMD inside "OutermostParallel" loops, the recursive algorithm again tried to mark loops as "OutermostParellel" although still nested inside another "OutermostParallel" loop. The fix exposed another bug: The function "astScheduleDimIsParallel" was only called when a loop was potentially "OutermostParallel" or "InnermostParallel", but as a side-effect also determines the minimum dependence distance. Hence, changing when we need to know whether a loop is "OutermostParallel" also changed which loop was annotated with "#pragma minimal dependence distance". Moreover, some complex condition linked with "InParallelFor" determined whether a loop should be an "InnermostParallel" loop. It missed some situations where it would not use mark as such although being inside an SIMD mark node, and therefore not be annotated using "#pragma simd". The changes in particular: 1. Split the "InParallelFor" flag into an "InParallelFor" and an "InSIMD" flag. 2. Unconditionally call "astScheduleDimIsParallel" for its side-effects and store the result in "InParallel" for later use. 3. Simplify the condition when a loop is "InnermostParallel". Fixes llvm.org/PR33153 and llvm.org/PR38073. llvm-svn: 343212 2018-09-27 13:39:37 +00:00			`; CHECK-NEXT: #pragma minimal dependence distance: 1`
Annotation of SIMD loops Use 'mark' nodes annotate a SIMD loop during ScheduleTransformation and skip parallelism checks. The buildbot shows the following compile/execution time changes: Compile time: Improvements Δ Previous Current σ …/gesummv -6.06% 0.2640 0.2480 0.0055 …/gemver -4.46% 0.4480 0.4280 0.0044 …/covariance -4.31% 0.8360 0.8000 0.0065 …/adi -3.23% 0.9920 0.9600 0.0065 …/doitgen -2.53% 0.9480 0.9240 0.0090 …/3mm -2.33% 1.0320 1.0080 0.0087 Execution time: Regressions Δ Previous Current σ …/viterbi 1.70% 5.1840 5.2720 0.0074 …/smallpt 1.06% 12.4920 12.6240 0.0040 Reviewed-by: Tobias Grosser <tobias@grosser.es> Differential Revision: http://reviews.llvm.org/D14491 llvm-svn: 261620 2016-02-23 09:00:13 +00:00			`; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) {`
			`; CHECK-NEXT: // SIMD`
[IslAst] Fix InParallelFor nesting. IslAst could mark two nested outer loops as "OutermostParallel". It caused that the code generator tried to OpenMP-parallelize both loops, which it is not prepared loop. It was because the recursive AST build algorithm managed a flag "InParallelFor" to ensure that no nested loop is also marked as "OutermostParallel". Unfortunatetly the same flag was used by nodes marked as SIMD, and reset to false after the SIMD node. Since loops can be marked as SIMD inside "OutermostParallel" loops, the recursive algorithm again tried to mark loops as "OutermostParellel" although still nested inside another "OutermostParallel" loop. The fix exposed another bug: The function "astScheduleDimIsParallel" was only called when a loop was potentially "OutermostParallel" or "InnermostParallel", but as a side-effect also determines the minimum dependence distance. Hence, changing when we need to know whether a loop is "OutermostParallel" also changed which loop was annotated with "#pragma minimal dependence distance". Moreover, some complex condition linked with "InParallelFor" determined whether a loop should be an "InnermostParallel" loop. It missed some situations where it would not use mark as such although being inside an SIMD mark node, and therefore not be annotated using "#pragma simd". The changes in particular: 1. Split the "InParallelFor" flag into an "InParallelFor" and an "InSIMD" flag. 2. Unconditionally call "astScheduleDimIsParallel" for its side-effects and store the result in "InParallel" for later use. 3. Simplify the condition when a loop is "InnermostParallel". Fixes llvm.org/PR33153 and llvm.org/PR38073. llvm-svn: 343212 2018-09-27 13:39:37 +00:00			`; CHECK-NEXT: #pragma simd`
Annotation of SIMD loops Use 'mark' nodes annotate a SIMD loop during ScheduleTransformation and skip parallelism checks. The buildbot shows the following compile/execution time changes: Compile time: Improvements Δ Previous Current σ …/gesummv -6.06% 0.2640 0.2480 0.0055 …/gemver -4.46% 0.4480 0.4280 0.0044 …/covariance -4.31% 0.8360 0.8000 0.0065 …/adi -3.23% 0.9920 0.9600 0.0065 …/doitgen -2.53% 0.9480 0.9240 0.0090 …/3mm -2.33% 1.0320 1.0080 0.0087 Execution time: Regressions Δ Previous Current σ …/viterbi 1.70% 5.1840 5.2720 0.0074 …/smallpt 1.06% 12.4920 12.6240 0.0040 Reviewed-by: Tobias Grosser <tobias@grosser.es> Differential Revision: http://reviews.llvm.org/D14491 llvm-svn: 261620 2016-02-23 09:00:13 +00:00			`; CHECK-NEXT: for (int c6 = 0; c6 <= 3; c6 += 1)`
			`; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);`
			`; CHECK-NEXT: }`
			`; CHECK-NEXT: if (32 * c1 + 31 >= nj)`
			`; CHECK-NEXT: #pragma minimal dependence distance: 1`
			`; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) {`
			`; CHECK-NEXT: // SIMD`
[IslAst] Fix InParallelFor nesting. IslAst could mark two nested outer loops as "OutermostParallel". It caused that the code generator tried to OpenMP-parallelize both loops, which it is not prepared loop. It was because the recursive AST build algorithm managed a flag "InParallelFor" to ensure that no nested loop is also marked as "OutermostParallel". Unfortunatetly the same flag was used by nodes marked as SIMD, and reset to false after the SIMD node. Since loops can be marked as SIMD inside "OutermostParallel" loops, the recursive algorithm again tried to mark loops as "OutermostParellel" although still nested inside another "OutermostParallel" loop. The fix exposed another bug: The function "astScheduleDimIsParallel" was only called when a loop was potentially "OutermostParallel" or "InnermostParallel", but as a side-effect also determines the minimum dependence distance. Hence, changing when we need to know whether a loop is "OutermostParallel" also changed which loop was annotated with "#pragma minimal dependence distance". Moreover, some complex condition linked with "InParallelFor" determined whether a loop should be an "InnermostParallel" loop. It missed some situations where it would not use mark as such although being inside an SIMD mark node, and therefore not be annotated using "#pragma simd". The changes in particular: 1. Split the "InParallelFor" flag into an "InParallelFor" and an "InSIMD" flag. 2. Unconditionally call "astScheduleDimIsParallel" for its side-effects and store the result in "InParallel" for later use. 3. Simplify the condition when a loop is "InnermostParallel". Fixes llvm.org/PR33153 and llvm.org/PR38073. llvm-svn: 343212 2018-09-27 13:39:37 +00:00			`; CHECK-NEXT: #pragma simd`
Annotation of SIMD loops Use 'mark' nodes annotate a SIMD loop during ScheduleTransformation and skip parallelism checks. The buildbot shows the following compile/execution time changes: Compile time: Improvements Δ Previous Current σ …/gesummv -6.06% 0.2640 0.2480 0.0055 …/gemver -4.46% 0.4480 0.4280 0.0044 …/covariance -4.31% 0.8360 0.8000 0.0065 …/adi -3.23% 0.9920 0.9600 0.0065 …/doitgen -2.53% 0.9480 0.9240 0.0090 …/3mm -2.33% 1.0320 1.0080 0.0087 Execution time: Regressions Δ Previous Current σ …/viterbi 1.70% 5.1840 5.2720 0.0074 …/smallpt 1.06% 12.4920 12.6240 0.0040 Reviewed-by: Tobias Grosser <tobias@grosser.es> Differential Revision: http://reviews.llvm.org/D14491 llvm-svn: 261620 2016-02-23 09:00:13 +00:00			`; CHECK-NEXT: for (int c6 = 0; c6 < nj % 4; c6 += 1)`
IslAst: always use the context during ast generation Providing the context to the ast generator allows for additional simplifcations and -- more importantly -- allows to generate loops with only partially bounded domains, assuming the domains are bounded for all parameter configurations that are valid as defined by the context. This change fixes the crash reported in http://llvm.org/PR30956 The original reason why we did not include the context when generating an AST was that CLooG and later isl used to sometimes transfer some of the constraints that bound the size of parameters from the context into the generated AST. This resulted in operations with very large constants, which sometimes introduced problematic integer overflows. The latest versions of the isl AST generator are careful to not introduce such constants. Reported-by: Eli Friedman <efriedma@codeaurora.org> llvm-svn: 286442 2016-11-10 09:39:58 +00:00			`; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, -(nj % 4) + nj + c6, 32 * c2 + c5);`
Annotation of SIMD loops Use 'mark' nodes annotate a SIMD loop during ScheduleTransformation and skip parallelism checks. The buildbot shows the following compile/execution time changes: Compile time: Improvements Δ Previous Current σ …/gesummv -6.06% 0.2640 0.2480 0.0055 …/gemver -4.46% 0.4480 0.4280 0.0044 …/covariance -4.31% 0.8360 0.8000 0.0065 …/adi -3.23% 0.9920 0.9600 0.0065 …/doitgen -2.53% 0.9480 0.9240 0.0090 …/3mm -2.33% 1.0320 1.0080 0.0087 Execution time: Regressions Δ Previous Current σ …/viterbi 1.70% 5.1840 5.2720 0.0074 …/smallpt 1.06% 12.4920 12.6240 0.0040 Reviewed-by: Tobias Grosser <tobias@grosser.es> Differential Revision: http://reviews.llvm.org/D14491 llvm-svn: 261620 2016-02-23 09:00:13 +00:00			`; CHECK-NEXT: }`
			`; CHECK-NEXT: }`
			`; CHECK-NEXT: }`
Full/partial tile separation for vectorization We isolate full tiles from partial tiles to be able to, for example, vectorize loops with parametric lower and/or upper bounds. If we use -polly-vectorizer=stripmine, we can see execution-time improvements: correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to 0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from 8m5565s to 7m4285s (-13.18 %). The current full/partial tile separation increases compile-time more than necessary. As a result, we see in compile time regressions, for example, for 3mm from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected as we generate more IR and consequently more time is spent in the LLVM backends. However, a first investiagation has shown that a larger portion of compile time is unnecessarily spent inside Polly's parallelism detection and could be eliminated by propagating existing knowledge about vector loop parallelism. Before enabling -polly-vectorizer=stripmine by default, it is necessary to address this compile-time issue. Contributed-by: Roman Gareev <gareevroman@gmail.com> Reviewers: jdoerfert, grosser Subscribers: grosser, #polly Differential Revision: http://reviews.llvm.org/D13779 llvm-svn: 250809 2015-10-20 09:12:21 +00:00
			`; Function Attrs: nounwind uwtable`
			`define void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) #0 {`
			`entry:`
			`%cmp.27 = icmp sgt i32 %ni, 0`
			`br i1 %cmp.27, label %for.cond.1.preheader.lr.ph, label %for.end.22`

			`for.cond.1.preheader.lr.ph: ; preds = %entry`
			`br label %for.cond.1.preheader`

			`for.cond.1.preheader: ; preds = %for.cond.1.preheader.lr.ph, %for.inc.20`
			`%indvars.iv33 = phi i64 [ 0, %for.cond.1.preheader.lr.ph ], [ %indvars.iv.next34, %for.inc.20 ]`
			`%cmp2.25 = icmp sgt i32 %nj, 0`
			`br i1 %cmp2.25, label %for.cond.4.preheader.lr.ph, label %for.inc.20`

			`for.cond.4.preheader.lr.ph: ; preds = %for.cond.1.preheader`
			`br label %for.cond.4.preheader`

			`for.cond.4.preheader: ; preds = %for.cond.4.preheader.lr.ph, %for.inc.17`
			`%indvars.iv29 = phi i64 [ 0, %for.cond.4.preheader.lr.ph ], [ %indvars.iv.next30, %for.inc.17 ]`
			`%cmp5.23 = icmp sgt i32 %nk, 0`
			`br i1 %cmp5.23, label %for.body.6.lr.ph, label %for.inc.17`

			`for.body.6.lr.ph: ; preds = %for.cond.4.preheader`
			`br label %for.body.6`

			`for.body.6: ; preds = %for.body.6.lr.ph, %for.body.6`
			`%indvars.iv = phi i64 [ 0, %for.body.6.lr.ph ], [ %indvars.iv.next, %for.body.6 ]`
			`%arrayidx8 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i64 %indvars.iv33, i64 %indvars.iv`
			`%0 = load double, double* %arrayidx8, align 8`
			`%arrayidx12 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv29`
			`%1 = load double, double* %arrayidx12, align 8`
			`%mul = fmul double %0, %1`
			`%arrayidx16 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv33, i64 %indvars.iv29`
			`%2 = load double, double* %arrayidx16, align 8`
			`%add = fadd double %2, %mul`
			`store double %add, double* %arrayidx16, align 8`
			`%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1`
			`%lftr.wideiv = trunc i64 %indvars.iv.next to i32`
			`%exitcond = icmp ne i32 %lftr.wideiv, %nk`
			`br i1 %exitcond, label %for.body.6, label %for.cond.4.for.inc.17_crit_edge`

			`for.cond.4.for.inc.17_crit_edge: ; preds = %for.body.6`
			`br label %for.inc.17`

			`for.inc.17: ; preds = %for.cond.4.for.inc.17_crit_edge, %for.cond.4.preheader`
			`%indvars.iv.next30 = add nuw nsw i64 %indvars.iv29, 1`
			`%lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32`
			`%exitcond32 = icmp ne i32 %lftr.wideiv31, %nj`
			`br i1 %exitcond32, label %for.cond.4.preheader, label %for.cond.1.for.inc.20_crit_edge`

			`for.cond.1.for.inc.20_crit_edge: ; preds = %for.inc.17`
			`br label %for.inc.20`

			`for.inc.20: ; preds = %for.cond.1.for.inc.20_crit_edge, %for.cond.1.preheader`
			`%indvars.iv.next34 = add nuw nsw i64 %indvars.iv33, 1`
			`%lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32`
			`%exitcond36 = icmp ne i32 %lftr.wideiv35, %ni`
			`br i1 %exitcond36, label %for.cond.1.preheader, label %for.cond.for.end.22_crit_edge`

			`for.cond.for.end.22_crit_edge: ; preds = %for.inc.20`
			`br label %for.end.22`

			`for.end.22: ; preds = %for.cond.for.end.22_crit_edge, %entry`
			`ret void`
			`}`