Reapply "[LV] Extend trunc optimization to all IVs with constant integer steps"

This reapplies commit r294967 with a fix for the execution time regressions
caught by the clang-cmake-aarch64-quick bot. We now extend the truncate
optimization to non-primary induction variables only if the truncate isn't
already free.

Differential Revision: https://reviews.llvm.org/D29847

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295063 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matthew Simpson 2017-02-14 16:28:32 +00:00
parent 2fce16a04e
commit 06f1a4b52b
4 changed files with 111 additions and 12 deletions

View File

@ -2014,6 +2014,42 @@ public:
return WideningDecisions[InstOnVF].second;
}
/// Return True if instruction \p I is an optimizable truncate whose operand
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
// If the instruction is not a truncate, return false.
auto *Trunc = dyn_cast<TruncInst>(I);
if (!Trunc)
return false;
// Get the source and destination types of the truncate.
Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
// If the truncate is free for the given types, return false. Replacing a
// free truncate with an induction variable would add an induction variable
// update instruction to each iteration of the loop. We exclude from this
// check the primary induction variable since it will need an update
// instruction regardless.
Value *Op = Trunc->getOperand(0);
if (Op != Legal->getInduction() && TTI.isTruncateFree(SrcTy, DestTy))
return false;
// If the truncated value is not an induction variable, return false.
if (!Legal->isInductionVariable(Op))
return false;
// Lastly, we only consider an induction variable truncate to be
// optimizable if it has a constant step.
//
// TODO: Expand optimizable truncates to include truncations of induction
// variables having loop-invariant steps.
auto ID = Legal->getInductionVars()->lookup(cast<PHINode>(Op));
return ID.getConstIntStepValue();
}
private:
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
@ -4879,10 +4915,9 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
// (c) other casts depend on pointer size.
auto ID = Legal->getInductionVars()->lookup(OldInduction);
if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
ID.getConstIntStepValue()) {
widenIntInduction(OldInduction, cast<TruncInst>(CI));
if (Cost->isOptimizableIVTruncate(CI, VF)) {
widenIntInduction(cast<PHINode>(CI->getOperand(0)),
cast<TruncInst>(CI));
break;
}
@ -7224,12 +7259,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
// We optimize the truncation of induction variable.
// The cost of these is the same as the scalar operation.
if (I->getOpcode() == Instruction::Trunc &&
Legal->isInductionVariable(I->getOperand(0)))
return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
I->getOperand(0)->getType());
// We optimize the truncation of induction variables having constant
// integer steps. The cost of these truncations is the same as the scalar
// operation.
if (isOptimizableIVTruncate(I, VF)) {
auto *Trunc = cast<TruncInst>(I);
return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
Trunc->getSrcTy());
}
Type *SrcScalarTy = I->getOperand(0)->getType();
Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);

View File

@ -0,0 +1,30 @@
; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
; CHECK-LABEL: @non_primary_iv_trunc_free(
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
;
define void @non_primary_iv_trunc_free(i64 %n) {
entry:
br label %for.body
for.body:
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
%tmp0 = trunc i64 %i to i32
%i.next = add nuw nsw i64 %i, 5
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}

View File

@ -773,3 +773,34 @@ for.body:
exit:
ret void
}
; CHECK-LABEL: @non_primary_iv_trunc(
; CHECK: vector.body:
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
; CHECK: [[TMP3:%.*]] = add i64 %index, 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* %a, i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
; CHECK-NEXT: %index.next = add i64 %index, 2
; CHECK: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
define void @non_primary_iv_trunc(i32* %a, i64 %n) {
entry:
br label %for.body
for.body:
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
%j = phi i64 [ %j.next, %for.body ], [ 0, %entry ]
%tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
%tmp1 = trunc i64 %j to i32
store i32 %tmp1, i32* %tmp0, align 4
%i.next = add nuw nsw i64 %i, 1
%j.next = add nuw nsw i64 %j, 2
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}

View File

@ -2,7 +2,8 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Make sure that the reverse iterators are calculated using 64bit arithmetic, not 32.
; PR15882: This test ensures that we do not produce wrapping arithmetic when
; creating constant reverse step vectors.
;
; int foo(int n, int *A) {
; int sum;
@ -13,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
;
;CHECK-LABEL: @foo(
;CHECK: <i64 0, i64 -1, i64 -2, i64 -3>
;CHECK: <i32 0, i32 -1, i32 -2, i32 -3>
;CHECK: ret
define i32 @foo(i32 %n, i32* nocapture %A) {
%1 = icmp sgt i32 %n, 0