diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 7a28473c25c..ae5ec8cb88a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -126,11 +126,6 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), "trip count that is smaller than this " "value.")); -static cl::opt MaximizeBandwidth( - "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, - cl::desc("Maximize bandwidth when selecting vectorization factor which " - "will be determined by the smallest type in loop.")); - /// This enables versioning on the strides of symbolically striding memory /// accesses in code like the following. /// for (i = 0; i < N; ++i) @@ -1382,10 +1377,10 @@ public: /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); - /// \return The size (in bits) of the smallest and widest types in the code - /// that needs to be vectorized. We ignore values that remain scalar such as + /// \return The size (in bits) of the widest type in the code that + /// needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. - std::pair getSmallestAndWidestTypes(); + unsigned getWidestType(); /// \return The desired interleave count. /// If interleave count has been specified by metadata it will be returned. @@ -1412,10 +1407,8 @@ public: unsigned NumInstructions; }; - /// \return Returns information about the register usages of the loop for the - /// given vectorization factors. - SmallVector - calculateRegisterUsage(const SmallVector &VFs); + /// \return information about the register usage of the loop. + RegisterUsage calculateRegisterUsage(); private: /// Returns the expected execution cost. The unit of the cost does @@ -4714,8 +4707,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); - unsigned SmallestType, WidestType; - std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); + unsigned WidestType = getWidestType(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4723,9 +4715,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { WidestRegister = ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; - - DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " - << WidestType << " bits.\n"); + DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"); @@ -4738,26 +4728,6 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { " into one vector!"); unsigned VF = MaxVectorSize; - if (MaximizeBandwidth && !OptForSize) { - // Collect all viable vectorization factors. - SmallVector VFs; - unsigned NewMaxVectorSize = WidestRegister / SmallestType; - for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) - VFs.push_back(VS); - - // For each VF calculate its register usage. - auto RUs = calculateRegisterUsage(VFs); - - // Select the largest VF which doesn't require more registers than existing - // ones. - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); - for (int i = RUs.size() - 1; i >= 0; --i) { - if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { - VF = VFs[i]; - break; - } - } - } // If we optimize the program for size, avoid creating the tail loop. if (OptForSize) { @@ -4833,9 +4803,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { return Factor; } -std::pair -LoopVectorizationCostModel::getSmallestAndWidestTypes() { - unsigned MinWidth = -1U; +unsigned LoopVectorizationCostModel::getWidestType() { unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); @@ -4875,14 +4843,12 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it)) continue; - MinWidth = std::min(MinWidth, - (unsigned)DL.getTypeSizeInBits(T->getScalarType())); MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(T->getScalarType())); } } - return {MinWidth, MaxWidth}; + return MaxWidth; } unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, @@ -4928,7 +4894,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, TargetNumRegisters = ForceTargetNumVectorRegs; } - RegisterUsage R = calculateRegisterUsage({VF})[0]; + LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); @@ -5038,9 +5004,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; } -SmallVector -LoopVectorizationCostModel::calculateRegisterUsage( - const SmallVector &VFs) { +LoopVectorizationCostModel::RegisterUsage +LoopVectorizationCostModel::calculateRegisterUsage() { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -5061,8 +5026,8 @@ LoopVectorizationCostModel::calculateRegisterUsage( LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); - RegisterUsage RU; - RU.NumInstructions = 0; + RegisterUsage R; + R.NumInstructions = 0; // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the @@ -5081,7 +5046,7 @@ LoopVectorizationCostModel::calculateRegisterUsage( unsigned Index = 0; for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) { - RU.NumInstructions += (*bb)->size(); + R.NumInstructions += (*bb)->size(); for (Instruction &I : **bb) { IdxToInstr[Index++] = &I; @@ -5116,20 +5081,10 @@ LoopVectorizationCostModel::calculateRegisterUsage( TransposeEnds[it->second].push_back(it->first); SmallSet OpenIntervals; + unsigned MaxUsage = 0; - // Get the size of the widest register. - unsigned MaxSafeDepDist = -1U; - if (Legal->getMaxSafeDepDistBytes() != -1U) - MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; - unsigned WidestRegister = - std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); - const DataLayout &DL = TheFunction->getParent()->getDataLayout(); - - SmallVector RUs(VFs.size()); - SmallVector MaxUsages(VFs.size(), 0); DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); - for (unsigned int i = 0; i < Index; ++i) { Instruction *I = IdxToInstr[i]; // Ignore instructions that are never used within the loop. @@ -5141,47 +5096,27 @@ LoopVectorizationCostModel::calculateRegisterUsage( // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; - for (unsigned int j = 0, e = List.size(); j < e; ++j) + for (unsigned int j=0, e = List.size(); j < e; ++j) OpenIntervals.erase(List[j]); - // For each VF find the maximum usage of registers. - for (unsigned j = 0, e = VFs.size(); j < e; ++j) { - // Count the number of live interals. - unsigned RegUsage = 0; - for (auto Inst : OpenIntervals) { - unsigned TypeSize = - DL.getTypeSizeInBits(Inst->getType()->getScalarType()); - RegUsage += std::max(1, VFs[j] * TypeSize / WidestRegister); - } - MaxUsages[j] = std::max(MaxUsages[j], RegUsage); - } + // Count the number of live interals. + MaxUsage = std::max(MaxUsage, OpenIntervals.size()); - DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " - << OpenIntervals.size() << '\n'); + DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << + OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); } - for (unsigned i = 0, e = VFs.size(); i < e; ++i) { - unsigned Invariant = 0; - for (auto Inst : LoopInvariants) { - unsigned TypeSize = - DL.getTypeSizeInBits(Inst->getType()->getScalarType()); - Invariant += std::max(1, VFs[i] * TypeSize / WidestRegister); - } + unsigned Invariant = LoopInvariants.size(); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); + DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); - DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); - DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); - - RU.LoopInvariantRegs = Invariant; - RU.MaxLocalUsers = MaxUsages[i]; - RUs[i] = RU; - } - - return RUs; + R.LoopInvariantRegs = Invariant; + R.MaxLocalUsers = MaxUsage; + return R; } unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { diff --git a/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll b/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll deleted file mode 100644 index e6dc39c2afa..00000000000 --- a/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s -; REQUIRES: asserts - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@a = global [1000 x i8] zeroinitializer, align 16 -@b = global [1000 x i8] zeroinitializer, align 16 -@c = global [1000 x i8] zeroinitializer, align 16 -@u = global [1000 x i32] zeroinitializer, align 16 -@v = global [1000 x i32] zeroinitializer, align 16 -@w = global [1000 x i32] zeroinitializer, align 16 - -; Tests that the vectorization factor is determined by the smallest instead of -; widest type in the loop for maximum bandwidth when -; -vectorizer-maximize-bandwidth is indicated. -; -; CHECK-label: foo -; CHECK: LV: Selecting VF: 16. -define void @foo() { -entry: - br label %for.body - -for.cond.cleanup: - ret void - -for.body: - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 %indvars.iv - %0 = load i8, i8* %arrayidx, align 1 - %arrayidx2 = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 %indvars.iv - %1 = load i8, i8* %arrayidx2, align 1 - %add = add i8 %1, %0 - %arrayidx6 = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 %indvars.iv - store i8 %add, i8* %arrayidx6, align 1 - %arrayidx8 = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 %indvars.iv - %2 = load i32, i32* %arrayidx8, align 4 - %arrayidx10 = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 %indvars.iv - %3 = load i32, i32* %arrayidx10, align 4 - %add11 = add nsw i32 %3, %2 - %arrayidx13 = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 %indvars.iv - store i32 %add11, i32* %arrayidx13, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 1000 - br i1 %exitcond, label %for.cond.cleanup, label %for.body -} diff --git a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll index cca829b9457..6cd3c9c3bc0 100644 --- a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll +++ b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll @@ -17,7 +17,7 @@ target triple = "x86_64-apple-macosx10.8.0" ; widest vector count. ; ; CHECK: test_consecutive_store -; CHECK: The Smallest and Widest types: 64 / 64 bits. +; CHECK: The Widest type: 64 bits define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 { %4 = load %0*, %0** %2, align 8 %5 = icmp eq %0** %0, %1 @@ -51,7 +51,7 @@ define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwt ; p[i][y] = (int*) (1 + q[i]); ; } ; CHECK: test_nonconsecutive_store -; CHECK: The Smallest and Widest types: 16 / 16 bits. +; CHECK: The Widest type: 16 bits define void @test_nonconsecutive_store() nounwind ssp uwtable { br label %1 @@ -93,7 +93,7 @@ define void @test_nonconsecutive_store() nounwind ssp uwtable { ;; Now we check the same rules for loads. We should take consecutive loads of ;; pointer types into account. ; CHECK: test_consecutive_ptr_load -; CHECK: The Smallest and Widest types: 8 / 64 bits. +; CHECK: The Widest type: 64 bits define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable { br label %1 @@ -117,7 +117,7 @@ define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable { ;; However, we should not take unconsecutive loads of pointers into account. ; CHECK: test_nonconsecutive_ptr_load -; CHECK: LV: The Smallest and Widest types: 16 / 16 bits. +; CHECK: The Widest type: 16 bits define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable { br label %1