Files
archived-llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
George Burgess IV 9276050d30 [LoopVectorize] Don't preserve nsw/nuw flags on shrunken ops.
If we're shrinking a binary operation, it may be the case that the new
operation wraps where the old one didn't. If this happens, the behavior
should be well-defined. So, we can't always carry wrapping flags with us
when we shrink operations.

If we do, we get incorrect optimizations in cases like:

void foo(const unsigned char *from, unsigned char *to, int n) {
  for (int i = 0; i < n; i++)
    to[i] = from[i] - 128;
}

which gets optimized to:

void foo(const unsigned char *from, unsigned char *to, int n) {
  for (int i = 0; i < n; i++)
    to[i] = from[i] | 128;
}

Because:
- InstCombine turned `sub i32 %from.i, -128` into
  `add nuw nsw i32 %from.i, 128`.
- LoopVectorize vectorized the add to be `add nuw nsw <16 x i8>` with a
  vector full of `i8 128`s
- InstCombine took advantage of the fact that the newly-shrunken add
  "couldn't wrap", and changed the `add` to an `or`.

InstCombine seems happy to figure out whether we can add nuw/nsw on its
own, so I just decided to drop the flags. There are already a number of
places in LoopVectorize where we rely on InstCombine to clean up.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305053 91177308-0d34-0410-b5e6-96231b3b80d8
2017-06-09 03:56:15 +00:00

311 lines
12 KiB
LLVM

; Test driver: runs opt with the loop vectorizer, forcing the interleave count
; to 1, merges stderr into stdout and pipes the textual IR to FileCheck, which
; matches it against the expectations written next to each function below.
; RUN: opt -S < %s -basicaa -loop-vectorize -force-vector-interleave=1 2>&1 | FileCheck %s
; The module targets AArch64; the data layout lists 32 and 64 bits as the
; native integer widths (n32:64).
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"
; add_a: every iteration loads one i8 from %p, zero-extends it to i32, adds 2,
; truncates the sum back to i8 and stores it to %q. The expectations look for a
; <16 x i8> load, add and store; the add pattern has no nuw/nsw between the
; opcode and the type, although the scalar i32 add below carries both flags.
; CHECK-LABEL: @add_a(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add <16 x i8>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
; Skip the loop entirely unless %len is greater than zero (signed compare).
%cmp8 = icmp sgt i32 %len, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
; Widen to i32, add, then narrow again: only the low 8 bits reach the store.
%conv = zext i8 %0 to i32
%add = add nuw nsw i32 %conv, 2
%conv1 = trunc i32 %add to i8
%arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv1, i8* %arrayidx3
; The i64 induction variable is incremented, truncated to i32 and compared
; with %len to decide when to leave the loop.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
; working with.
; add_a1: here the scalar add is already performed on i8 (no zext/trunc pair
; around it), and the expectations require the <16 x i8> add to keep both
; nuw and nsw.
; CHECK-LABEL: @add_a1(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add nuw nsw <16 x i8>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
; Skip the loop entirely unless %len is greater than zero (signed compare).
%cmp8 = icmp sgt i32 %len, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
; The add operates directly on the loaded i8 value.
%add = add nuw nsw i8 %0, 2
%arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %add, i8* %arrayidx3
; The i64 induction variable is incremented, truncated to i32 and compared
; with %len to decide when to leave the loop.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; add_b: the i16 counterpart of the zext/add/trunc loop. Each iteration loads an
; i16 from %p, zero-extends it to i32, adds 2, truncates back to i16 and stores
; to %q. The expectations look for a <8 x i16> load, a flag-free <8 x i16> add
; and a <8 x i16> store.
; CHECK-LABEL: @add_b(
; CHECK: load <8 x i16>, <8 x i16>*
; CHECK: add <8 x i16>
; CHECK: store <8 x i16>
; Function Attrs: nounwind
define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
; Skip the loop entirely unless %len is greater than zero (signed compare).
%cmp9 = icmp sgt i32 %len, 0
br i1 %cmp9, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
%0 = load i16, i16* %arrayidx
; Widen to i32, add, then narrow again: only the low 16 bits reach the store.
%conv8 = zext i16 %0 to i32
%add = add nuw nsw i32 %conv8, 2
%conv1 = trunc i32 %add to i16
%arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
store i16 %conv1, i16* %arrayidx3
; The i64 induction variable is incremented, truncated to i32 and compared
; with %len to decide when to leave the loop.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; add_c: mixed element widths. Each iteration loads an i8 from %p, zero-extends
; it to i32, adds 2, truncates the sum to i16 and stores it to the i16 array %q.
; The expectations look for a <8 x i8> load followed by a flag-free <8 x i16>
; add and a <8 x i16> store.
; CHECK-LABEL: @add_c(
; CHECK: load <8 x i8>, <8 x i8>*
; CHECK: add <8 x i16>
; CHECK: store <8 x i16>
; Function Attrs: nounwind
define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
; Skip the loop entirely unless %len is greater than zero (signed compare).
%cmp8 = icmp sgt i32 %len, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
; The value enters as i8 but leaves as i16, so the result is wider than the
; loaded element yet narrower than the i32 the scalar add is performed in.
%conv = zext i8 %0 to i32
%add = add nuw nsw i32 %conv, 2
%conv1 = trunc i32 %add to i16
%arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
store i16 %conv1, i16* %arrayidx3
; The i64 induction variable is incremented, truncated to i32 and compared
; with %len to decide when to leave the loop.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; add_d: each iteration loads an i16 from %p, sign-extends it to i32, adds 2
; (nsw) and stores the full i32 result to %q; nothing is truncated. The
; expectations look for a <4 x i16> load, an add on <4 x i32> that still carries
; nsw, and a <4 x i32> store.
; CHECK-LABEL: @add_d(
; CHECK: load <4 x i16>
; CHECK: add nsw <4 x i32>
; CHECK: store <4 x i32>
define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
entry:
; Skip the loop entirely unless %len is greater than zero (signed compare).
%cmp7 = icmp sgt i32 %len, 0
br i1 %cmp7, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
%0 = load i16, i16* %arrayidx
; The sum is stored at its full i32 width.
%conv = sext i16 %0 to i32
%add = add nsw i32 %conv, 2
%arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
store i32 %add, i32* %arrayidx2
; The i64 induction variable is incremented, truncated to i32 and compared
; with %len to decide when to leave the loop.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; add_e: a longer chain of i32 operations (shl, add, or, mul, and, xor, mul)
; computed from a zero-extended i8 load and the zero-extended i8 arguments
; %arg1 and %arg2, with the final product truncated to i8 and stored to %q.
; The expectations look for the load, the listed operations (in this order,
; with no wrap flags in the add/mul patterns) and the store all on <16 x i8>.
; CHECK-LABEL: @add_e(
; CHECK: load <16 x i8>
; CHECK: shl <16 x i8>
; CHECK: add <16 x i8>
; CHECK: or <16 x i8>
; CHECK: mul <16 x i8>
; CHECK: and <16 x i8>
; CHECK: xor <16 x i8>
; CHECK: mul <16 x i8>
; CHECK: store <16 x i8>
define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
; Skip the loop entirely unless %len is greater than zero (signed compare).
%cmp.32 = icmp sgt i32 %len, 0
br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
for.body.lr.ph: ; preds = %entry
; Loop-invariant widenings of the two i8 arguments, done once before the loop.
%conv11 = zext i8 %arg2 to i32
%conv13 = zext i8 %arg1 to i32
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body, %for.body.lr.ph
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
; First operand of the final multiply: ((x << 4) + 32) & zext(%arg1).
; Second operand: (((x | 51) * 60) & 252) ^ zext(%arg2).
%add = shl i32 %conv, 4
%conv2 = add nuw nsw i32 %add, 32
%or = or i32 %conv, 51
%mul = mul nuw nsw i32 %or, 60
%and = and i32 %conv2, %conv13
%mul.masked = and i32 %mul, 252
%conv17 = xor i32 %mul.masked, %conv11
%mul18 = mul nuw nsw i32 %conv17, %and
; Only the low 8 bits of the product are kept and stored.
%conv19 = trunc i32 %mul18 to i8
%arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv19, i8* %arrayidx21
; The i64 induction variable is incremented, truncated to i32 and compared
; with %len to decide when to leave the loop.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; add_f: like the i8 operation chain in @add_e, but the input is an i16 array
; that is sign-extended to i32, and an extra `and 204` precedes the `or 51`.
; The final product is truncated to i8 and stored to %q. The expectations look
; for a <8 x i16> load, a trunc of that <8 x i16> value, and then the operation
; chain and the store on <8 x i8>, with no wrap flags in the add/mul patterns.
; The label pattern ends in "(" so that it anchors on the function definition,
; matching the form used by the other labels in this file.
; CHECK-LABEL: @add_f(
; CHECK: load <8 x i16>
; CHECK: trunc <8 x i16>
; CHECK: shl <8 x i8>
; CHECK: add <8 x i8>
; CHECK: or <8 x i8>
; CHECK: mul <8 x i8>
; CHECK: and <8 x i8>
; CHECK: xor <8 x i8>
; CHECK: mul <8 x i8>
; CHECK: store <8 x i8>
define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
; Skip the loop entirely unless %len is greater than zero (signed compare).
%cmp.32 = icmp sgt i32 %len, 0
br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
for.body.lr.ph: ; preds = %entry
; Loop-invariant widenings of the two i8 arguments, done once before the loop.
%conv11 = zext i8 %arg2 to i32
%conv13 = zext i8 %arg1 to i32
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body, %for.body.lr.ph
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
%0 = load i16, i16* %arrayidx
%conv = sext i16 %0 to i32
; First operand of the final multiply: ((x << 4) + 32) & zext(%arg1).
; Second operand: ((((x & 204) | 51) * 60) & 252) ^ zext(%arg2).
%add = shl i32 %conv, 4
%conv2 = add nsw i32 %add, 32
%or = and i32 %conv, 204
%conv8 = or i32 %or, 51
%mul = mul nuw nsw i32 %conv8, 60
%and = and i32 %conv2, %conv13
%mul.masked = and i32 %mul, 252
%conv17 = xor i32 %mul.masked, %conv11
%mul18 = mul nuw nsw i32 %conv17, %and
; Only the low 8 bits of the product are kept and stored.
%conv19 = trunc i32 %mul18 to i8
%arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv19, i8* %arrayidx21
; The i64 induction variable is incremented, truncated to i32 and compared
; with %len to decide when to leave the loop.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; add_phifail: the same zext/add/trunc loop as @add_a, plus one extra phi
; (%a_phi) that carries the previous iteration's i32 %conv and has no users in
; this function. Unlike @add_a, the expectations require the add to be emitted
; on <16 x i32> with nuw/nsw kept, while the load and store remain <16 x i8>.
; NOTE(review): presumably the i32 phi of %conv is what keeps the add from
; being narrowed; confirm against the vectorizer's minimum-bitwidth logic.
; CHECK-LABEL: @add_phifail(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add nuw nsw <16 x i32>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
define void @add_phifail(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
; Skip the loop entirely unless %len is greater than zero (signed compare).
%cmp8 = icmp sgt i32 %len, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
; Holds the %conv value of the previous iteration (0 on the first one).
%a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
%add = add nuw nsw i32 %conv, 2
%conv1 = trunc i32 %add to i8
%arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv1, i8* %arrayidx3
; The i64 induction variable is incremented, truncated to i32 and compared
; with %len to decide when to leave the loop.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; Function Attrs: nounwind
; add_phifail2: same loop body as @add_phifail, but the loop is entered
; unconditionally and the recurrence phi %a_phi is used after the loop: its
; value is truncated to i8 and returned.
; When we vectorize this loop, we generate correct code even when VF exactly
; divides %len, i.e. when the exit block is reached straight from the middle
; block: the value passed to the for.cond.cleanup block is extracted from the
; second-to-last lane (index 14 of 16) of the widened %conv vector. The
; vectorized loop therefore returns the correct value a_phi = p[len - 2].
define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
; CHECK-LABEL: @add_phifail2(
; CHECK: vector.body:
; CHECK: %wide.load = load <16 x i8>, <16 x i8>*
; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
; CHECK: add nuw nsw <16 x i32>
; CHECK: store <16 x i8>
; CHECK: add i64 %index, 16
; CHECK: icmp eq i64 %index.next, %n.vec
; CHECK: middle.block:
; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
; CHECK: for.cond.cleanup:
; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
; CHECK: %ret = trunc i32 %a_phi.lcssa to i8
; CHECK: ret i8 %ret
entry:
; No guard on %len here: the loop body always runs at least once.
br label %for.body
for.cond.cleanup: ; preds = %for.body
; %a_phi is the phi's value in the final iteration, i.e. the %conv computed
; one iteration earlier.
%ret = trunc i32 %a_phi to i8
ret i8 %ret
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
; Holds the %conv value of the previous iteration (0 on the first one).
%a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
%add = add nuw nsw i32 %conv, 2
%conv1 = trunc i32 %add to i8
%arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv1, i8* %arrayidx3
; The i64 induction variable is incremented, truncated to i32 and compared
; with %len to decide when to leave the loop.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; Attribute group #0, referenced by every function definition in this file.
attributes #0 = { nounwind }