[LV] Don't emit unused scalars for uniform instructions

If we identify an instruction as uniform after vectorization, we know that we
should only use the value corresponding to the first vector lane of each unroll
iteration. However, when scalarizing such instructions, we still produce values
for the other vector lanes. This patch prevents us from generating the unused
scalars.

Differential Revision: https://reviews.llvm.org/D24275

llvm-svn: 282087
This commit is contained in:
Matthew Simpson 2016-09-21 16:50:24 +00:00
parent 1aa913cd20
commit 90550420f2
3 changed files with 58 additions and 52 deletions

View File

@ -2281,11 +2281,28 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
"Val and Step should have the same integer type");
// Predicate over a user U of the induction variable. Yields true when U lies
// outside the original loop, when U is not scalar after vectorization, or
// when U is uniform after vectorization. The checks are evaluated in that
// order and stop at the first one that holds.
auto scalarUserIsUniform = [&](User *U) -> bool {
  auto *UserInst = cast<Instruction>(U);
  // Users outside the loop being vectorized satisfy the predicate.
  if (!OrigLoop->contains(UserInst))
    return true;
  // So do users that do not remain scalar after vectorization.
  if (!Legal->isScalarAfterVectorization(UserInst))
    return true;
  // A user that remains scalar satisfies the predicate only if it is uniform.
  return Legal->isUniformAfterVectorization(UserInst);
};
// Determine the number of scalars we need to generate for each unroll
// iteration. If EntryVal is uniform or all its scalar users are uniform, we
// only need to generate the first lane. Otherwise, we generate all VF
// values. We are essentially determining if the induction variable has no
// "multi-scalar" (non-uniform scalar) users.
unsigned Lanes =
Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ||
all_of(EntryVal->users(), scalarUserIsUniform)
? 1
: VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
for (unsigned Lane = 0; Lane < VF; ++Lane) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
auto *Mul = Builder.CreateMul(StartIdx, Step);
auto *Add = Builder.CreateAdd(ScalarIV, Mul);
@ -2332,6 +2349,9 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// Initialize a new vector map entry.
VectorParts Entry(UF);
// If we've scalarized a value, that value should be an instruction.
auto *I = cast<Instruction>(V);
// If we aren't vectorizing, we can just copy the scalar map values over to
// the vector map.
if (VF == 1) {
@ -2340,9 +2360,12 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
return VectorLoopValueMap.initVector(V, Entry);
}
// Get the last scalarized instruction. This corresponds to the instruction
// we created for the last vector lane on the last unroll iteration.
auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, VF - 1));
// Get the last scalar instruction we generated for V. If the value is
// known to be uniform after vectorization, this corresponds to lane zero
// of the last unroll iteration. Otherwise, the last instruction is the one
// we created for the last vector lane of the last unroll iteration.
unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
// Set the insert point after the last scalarized instruction. This ensures
// the insertelement sequence will directly follow the scalar definitions.
@ -2350,15 +2373,24 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
auto NewIP = std::next(BasicBlock::iterator(LastInst));
Builder.SetInsertPoint(&*NewIP);
// However, if we are vectorizing, we need to construct the vector values
// using insertelement instructions. Since the resulting vectors are stored
// in VectorLoopValueMap, we will only generate the insertelements once.
// However, if we are vectorizing, we need to construct the vector values.
// If the value is known to be uniform after vectorization, we can just
// broadcast the scalar value corresponding to lane zero for each unroll
// iteration. Otherwise, we construct the vector values using insertelement
// instructions. Since the resulting vectors are stored in
// VectorLoopValueMap, we will only generate the insertelements once.
for (unsigned Part = 0; Part < UF; ++Part) {
Value *Insert = UndefValue::get(VectorType::get(V->getType(), VF));
for (unsigned Lane = 0; Lane < VF; ++Lane)
Insert = Builder.CreateInsertElement(
Insert, getScalarValue(V, Part, Lane), Builder.getInt32(Lane));
Entry[Part] = Insert;
Value *VectorValue = nullptr;
if (Legal->isUniformAfterVectorization(I)) {
VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
} else {
VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
for (unsigned Lane = 0; Lane < VF; ++Lane)
VectorValue = Builder.CreateInsertElement(
VectorValue, getScalarValue(V, Part, Lane),
Builder.getInt32(Lane));
}
Entry[Part] = VectorValue;
}
Builder.restoreIP(OldIP);
return VectorLoopValueMap.initVector(V, Entry);
@ -2378,6 +2410,9 @@ Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
if (OrigLoop->isLoopInvariant(V))
return V;
assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
: true && "Uniform values only have lane zero");
// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
// scalar value.
@ -2884,11 +2919,16 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
if (IfPredicateInstr)
Cond = createBlockInMask(Instr->getParent());
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
// For each scalar that we create:
for (unsigned Lane = 0; Lane < VF; ++Lane) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
// Start if-block.
Value *Cmp = nullptr;
@ -4398,12 +4438,16 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// This is the normalized GEP that starts counting at zero.
Value *PtrInd = Induction;
PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
// These are the scalar results. Notice that we don't generate vector GEPs
// because scalar GEPs result in better code.
ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
for (unsigned Lane = 0; Lane < VF; ++Lane) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);

View File

@ -78,21 +78,15 @@ loopexit:
; CHECK: vector.body:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %[[i0:.+]] = add i64 %index, 0
; CHECK: %[[i1:.+]] = add i64 %index, 1
; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i0]]
; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i1]]
;
; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01(
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; UNROLL-NO-IC: %[[i0:.+]] = add i64 %index, 0
; UNROLL-NO-IC: %[[i1:.+]] = add i64 %index, 1
; UNROLL-NO-IC: %[[i2:.+]] = add i64 %index, 2
; UNROLL-NO-IC: %[[i3:.+]] = add i64 %index, 3
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i0]]
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i1]]
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i2]]
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i3]]
;
; IND-LABEL: @scalarize_induction_variable_01(
; IND: vector.body:
@ -611,9 +605,7 @@ exit:
; CHECK: %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
; CHECK: %offset.idx = add i32 %i, %index
; CHECK: %[[A1:.*]] = add i32 %offset.idx, 0
; CHECK: %[[A2:.*]] = add i32 %offset.idx, 1
; CHECK: %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A1]]
; CHECK: %[[G2:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A2]]
; CHECK: %[[G3:.*]] = getelementptr i32, i32* %[[G1]], i32 0
; CHECK: %[[B1:.*]] = bitcast i32* %[[G3]] to <2 x i32>*
; CHECK: store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]

View File

@ -8,13 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i64 %startval, %index
; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) {
entry:
@ -40,13 +34,7 @@ loopend:
; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i128 %startval, %index
; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0
; CHECK: %[[a1:.+]] = add i128 %offset.idx, -1
; CHECK: %[[a2:.+]] = add i128 %offset.idx, -2
; CHECK: %[[a3:.+]] = add i128 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4
; CHECK: %[[a5:.+]] = add i128 %offset.idx, -5
; CHECK: %[[a6:.+]] = add i128 %offset.idx, -6
; CHECK: %[[a7:.+]] = add i128 %offset.idx, -7
define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
entry:
@ -72,13 +60,7 @@ loopend:
; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i16 %startval, {{.*}}
; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0
; CHECK: %[[a1:.+]] = add i16 %offset.idx, -1
; CHECK: %[[a2:.+]] = add i16 %offset.idx, -2
; CHECK: %[[a3:.+]] = add i16 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4
; CHECK: %[[a5:.+]] = add i16 %offset.idx, -5
; CHECK: %[[a6:.+]] = add i16 %offset.idx, -6
; CHECK: %[[a7:.+]] = add i16 %offset.idx, -7
define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) {
entry:
@ -121,13 +103,7 @@ loopend:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i64 1023, %index
; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
define void @reverse_forward_induction_i64_i8() {
entry:
@ -153,13 +129,7 @@ while.end:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i64 1023, %index
; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
define void @reverse_forward_induction_i64_i8_signed() {
entry: