llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
Ayal Zaks bfae62c2cb [LV] Optimize for size when vectorizing loops with tiny trip count
It may be detrimental to vectorize loops with very small trip count, as various
costs of the vectorized loop body as well as enclosing overheads including
runtime tests and scalar iterations may outweigh the gains of vectorizing. The
current cost model measures the cost of the vectorized loop body only, expecting
it will amortize other costs, and loops with known or expected very small trip
counts are not vectorized at all. This patch allows loops with very small trip
counts to be vectorized, but under OptForSize constraints, which ensure the cost
of the loop body is dominant, having no runtime guards nor scalar iterations.

Patch inspired by D32451.

Differential Revision: https://reviews.llvm.org/D34373


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306803 91177308-0d34-0410-b5e6-96231b3b80d8
2017-06-30 08:02:35 +00:00

101 lines
3.6 KiB
LLVM

; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S -vectorizer-min-trip-count=21 2>&1 | FileCheck %s
; REQUIRES: asserts
; CHECK: LV: Loop hints: force=enabled
; CHECK: LV: Loop hints: force=?
; CHECK: LV: Loop hints: force=?
; No more loops in the module
; CHECK-NOT: LV: Loop hints: force=
; CHECK: 3 loop-vectorize - Number of loops analyzed for vectorization
; CHECK: 2 loop-vectorize - Number of loops vectorized
; Module-level target description. The triple below is the same string that is
; passed via -mtriple on the opt command line at the top of this file.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
;
; The source code for the test:
;
; void foo(float* restrict A, float* restrict B)
; {
; for (int i = 0; i < 20; ++i) A[i] += B[i];
; }
;
;
; This loop will be vectorized even though the trip count is below the threshold, because vectorization is explicitly forced in metadata.
;
; Computes A[i] = B[i] + A[i] (fast-math fadd) for i = 0 .. 19.
; Every memory access and the back-edge branch reference loop id !1.
define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
preheader:
  br label %loop

loop:
  ; %i is the induction variable: starts at 0 and is bumped by 1 per iteration.
  %i = phi i64 [ 0, %preheader ], [ %i.next, %loop ]
  %b.addr = getelementptr inbounds float, float* %B, i64 %i
  %a.addr = getelementptr inbounds float, float* %A, i64 %i
  %b.val = load float, float* %b.addr, align 4, !llvm.mem.parallel_loop_access !1
  %a.val = load float, float* %a.addr, align 4, !llvm.mem.parallel_loop_access !1
  %sum = fadd fast float %b.val, %a.val
  store float %sum, float* %a.addr, align 4, !llvm.mem.parallel_loop_access !1
  %i.next = add nuw nsw i64 %i, 1
  ; Leave the loop once the incremented counter reaches 20 (20 iterations total).
  %done = icmp eq i64 %i.next, 20
  br i1 %done, label %exit, label %loop, !llvm.loop !1

exit:
  ret void
}
; Loop id referenced by the !llvm.loop and !llvm.mem.parallel_loop_access
; attachments in @vectorized. The first operand is a self-reference; the second
; operand (!2) is the vectorize-enable hint.
!1 = !{!1, !2}
; Hint node: "llvm.loop.vectorize.enable" set to true.
!2 = !{!"llvm.loop.vectorize.enable", i1 true}
;
; This loop will not be vectorized as the trip count is below the threshold.
;
; Computes A[i] = B[i] + A[i] (fast-math fadd) for i = 0 .. 19.
; The loop id used here (!3) carries no vectorization hint.
define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
start:
  br label %body

body:
  ; %idx counts from 0 upward in steps of 1.
  %idx = phi i64 [ 0, %start ], [ %idx.next, %body ]
  %src.ptr = getelementptr inbounds float, float* %B, i64 %idx
  %dst.ptr = getelementptr inbounds float, float* %A, i64 %idx
  %src = load float, float* %src.ptr, align 4, !llvm.mem.parallel_loop_access !3
  %dst = load float, float* %dst.ptr, align 4, !llvm.mem.parallel_loop_access !3
  %res = fadd fast float %src, %dst
  store float %res, float* %dst.ptr, align 4, !llvm.mem.parallel_loop_access !3
  %idx.next = add nuw nsw i64 %idx, 1
  ; Exit after the 20th iteration.
  %finished = icmp eq i64 %idx.next, 20
  br i1 %finished, label %done, label %body, !llvm.loop !3

done:
  ret void
}
; Loop id for the loop in @not_vectorized: a self-referential node with no hint
; operands. It is also the node named by the !llvm.mem.parallel_loop_access
; attachments in @vectorized2.
!3 = !{!3}
;
; This loop will be vectorized even though the trip count is below the
; threshold, because no scalar iterations are needed.
;
; Computes A[i] = B[i] + A[i] (fast-math fadd) for i = 0 .. 15.
; NOTE(review): the loads and the store are tagged with loop id !3, while the
; back-edge branch of this loop uses !4 -- confirm whether that is intentional.
define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
head:
  br label %iter

iter:
  ; %n is the induction variable: 0 on entry, +1 on every trip around the loop.
  %n = phi i64 [ 0, %head ], [ %n.next, %iter ]
  %rhs.ptr = getelementptr inbounds float, float* %B, i64 %n
  %lhs.ptr = getelementptr inbounds float, float* %A, i64 %n
  %rhs = load float, float* %rhs.ptr, align 4, !llvm.mem.parallel_loop_access !3
  %lhs = load float, float* %lhs.ptr, align 4, !llvm.mem.parallel_loop_access !3
  %total = fadd fast float %rhs, %lhs
  store float %total, float* %lhs.ptr, align 4, !llvm.mem.parallel_loop_access !3
  %n.next = add nuw nsw i64 %n, 1
  ; Exit once the incremented counter equals 16 (16 iterations total).
  %last = icmp eq i64 %n.next, 16
  br i1 %last, label %tail, label %iter, !llvm.loop !4

tail:
  ret void
}
; Loop id attached to the back-edge branch in @vectorized2: a self-referential
; node with no hint operands.
!4 = !{!4}