mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-03 10:19:25 +00:00
Revert the revision 251592 as it fails a test on some platforms.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251617 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
7b38ebfa47
commit
e29e7e235a
@ -126,11 +126,6 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
|
||||
"trip count that is smaller than this "
|
||||
"value."));
|
||||
|
||||
static cl::opt<bool> MaximizeBandwidth(
|
||||
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
|
||||
cl::desc("Maximize bandwidth when selecting vectorization factor which "
|
||||
"will be determined by the smallest type in loop."));
|
||||
|
||||
/// This enables versioning on the strides of symbolically striding memory
|
||||
/// accesses in code like the following.
|
||||
/// for (i = 0; i < N; ++i)
|
||||
@ -1382,10 +1377,10 @@ public:
|
||||
/// possible.
|
||||
VectorizationFactor selectVectorizationFactor(bool OptForSize);
|
||||
|
||||
/// \return The size (in bits) of the smallest and widest types in the code
|
||||
/// that needs to be vectorized. We ignore values that remain scalar such as
|
||||
/// \return The size (in bits) of the widest type in the code that
|
||||
/// needs to be vectorized. We ignore values that remain scalar such as
|
||||
/// 64 bit loop indices.
|
||||
std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
|
||||
unsigned getWidestType();
|
||||
|
||||
/// \return The desired interleave count.
|
||||
/// If interleave count has been specified by metadata it will be returned.
|
||||
@ -1412,10 +1407,8 @@ public:
|
||||
unsigned NumInstructions;
|
||||
};
|
||||
|
||||
/// \return Returns information about the register usages of the loop for the
|
||||
/// given vectorization factors.
|
||||
SmallVector<RegisterUsage, 8>
|
||||
calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
|
||||
/// \return information about the register usage of the loop.
|
||||
RegisterUsage calculateRegisterUsage();
|
||||
|
||||
private:
|
||||
/// Returns the expected execution cost. The unit of the cost does
|
||||
@ -4714,8 +4707,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
|
||||
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
|
||||
|
||||
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
|
||||
unsigned SmallestType, WidestType;
|
||||
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
|
||||
unsigned WidestType = getWidestType();
|
||||
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
|
||||
unsigned MaxSafeDepDist = -1U;
|
||||
if (Legal->getMaxSafeDepDistBytes() != -1U)
|
||||
@ -4723,9 +4715,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
|
||||
WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
|
||||
WidestRegister : MaxSafeDepDist);
|
||||
unsigned MaxVectorSize = WidestRegister / WidestType;
|
||||
|
||||
DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
|
||||
<< WidestType << " bits.\n");
|
||||
DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
|
||||
DEBUG(dbgs() << "LV: The Widest register is: "
|
||||
<< WidestRegister << " bits.\n");
|
||||
|
||||
@ -4738,26 +4728,6 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
|
||||
" into one vector!");
|
||||
|
||||
unsigned VF = MaxVectorSize;
|
||||
if (MaximizeBandwidth && !OptForSize) {
|
||||
// Collect all viable vectorization factors.
|
||||
SmallVector<unsigned, 8> VFs;
|
||||
unsigned NewMaxVectorSize = WidestRegister / SmallestType;
|
||||
for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
|
||||
VFs.push_back(VS);
|
||||
|
||||
// For each VF calculate its register usage.
|
||||
auto RUs = calculateRegisterUsage(VFs);
|
||||
|
||||
// Select the largest VF which doesn't require more registers than existing
|
||||
// ones.
|
||||
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
|
||||
for (int i = RUs.size() - 1; i >= 0; --i) {
|
||||
if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
|
||||
VF = VFs[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we optimize the program for size, avoid creating the tail loop.
|
||||
if (OptForSize) {
|
||||
@ -4833,9 +4803,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
|
||||
return Factor;
|
||||
}
|
||||
|
||||
std::pair<unsigned, unsigned>
|
||||
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
|
||||
unsigned MinWidth = -1U;
|
||||
unsigned LoopVectorizationCostModel::getWidestType() {
|
||||
unsigned MaxWidth = 8;
|
||||
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
|
||||
|
||||
@ -4875,14 +4843,12 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
|
||||
if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it))
|
||||
continue;
|
||||
|
||||
MinWidth = std::min(MinWidth,
|
||||
(unsigned)DL.getTypeSizeInBits(T->getScalarType()));
|
||||
MaxWidth = std::max(MaxWidth,
|
||||
(unsigned)DL.getTypeSizeInBits(T->getScalarType()));
|
||||
}
|
||||
}
|
||||
|
||||
return {MinWidth, MaxWidth};
|
||||
return MaxWidth;
|
||||
}
|
||||
|
||||
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
|
||||
@ -4928,7 +4894,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
|
||||
TargetNumRegisters = ForceTargetNumVectorRegs;
|
||||
}
|
||||
|
||||
RegisterUsage R = calculateRegisterUsage({VF})[0];
|
||||
LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
|
||||
// We divide by these constants so assume that we have at least one
|
||||
// instruction that uses at least one register.
|
||||
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
|
||||
@ -5038,9 +5004,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
|
||||
return 1;
|
||||
}
|
||||
|
||||
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
|
||||
LoopVectorizationCostModel::calculateRegisterUsage(
|
||||
const SmallVector<unsigned, 8> &VFs) {
|
||||
LoopVectorizationCostModel::RegisterUsage
|
||||
LoopVectorizationCostModel::calculateRegisterUsage() {
|
||||
// This function calculates the register usage by measuring the highest number
|
||||
// of values that are alive at a single location. Obviously, this is a very
|
||||
// rough estimation. We scan the loop in topological order and
|
||||
@ -5061,8 +5026,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(
|
||||
LoopBlocksDFS DFS(TheLoop);
|
||||
DFS.perform(LI);
|
||||
|
||||
RegisterUsage RU;
|
||||
RU.NumInstructions = 0;
|
||||
RegisterUsage R;
|
||||
R.NumInstructions = 0;
|
||||
|
||||
// Each 'key' in the map opens a new interval. The values
|
||||
// of the map are the index of the 'last seen' usage of the
|
||||
@ -5081,7 +5046,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(
|
||||
unsigned Index = 0;
|
||||
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
|
||||
be = DFS.endRPO(); bb != be; ++bb) {
|
||||
RU.NumInstructions += (*bb)->size();
|
||||
R.NumInstructions += (*bb)->size();
|
||||
for (Instruction &I : **bb) {
|
||||
IdxToInstr[Index++] = &I;
|
||||
|
||||
@ -5116,20 +5081,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(
|
||||
TransposeEnds[it->second].push_back(it->first);
|
||||
|
||||
SmallSet<Instruction*, 8> OpenIntervals;
|
||||
unsigned MaxUsage = 0;
|
||||
|
||||
// Get the size of the widest register.
|
||||
unsigned MaxSafeDepDist = -1U;
|
||||
if (Legal->getMaxSafeDepDistBytes() != -1U)
|
||||
MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
|
||||
unsigned WidestRegister =
|
||||
std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
|
||||
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
|
||||
|
||||
SmallVector<RegisterUsage, 8> RUs(VFs.size());
|
||||
SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
|
||||
|
||||
DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
|
||||
|
||||
for (unsigned int i = 0; i < Index; ++i) {
|
||||
Instruction *I = IdxToInstr[i];
|
||||
// Ignore instructions that are never used within the loop.
|
||||
@ -5141,47 +5096,27 @@ LoopVectorizationCostModel::calculateRegisterUsage(
|
||||
|
||||
// Remove all of the instructions that end at this location.
|
||||
InstrList &List = TransposeEnds[i];
|
||||
for (unsigned int j = 0, e = List.size(); j < e; ++j)
|
||||
for (unsigned int j=0, e = List.size(); j < e; ++j)
|
||||
OpenIntervals.erase(List[j]);
|
||||
|
||||
// For each VF find the maximum usage of registers.
|
||||
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
|
||||
// Count the number of live intervals.
|
||||
unsigned RegUsage = 0;
|
||||
for (auto Inst : OpenIntervals) {
|
||||
unsigned TypeSize =
|
||||
DL.getTypeSizeInBits(Inst->getType()->getScalarType());
|
||||
RegUsage += std::max<unsigned>(1, VFs[j] * TypeSize / WidestRegister);
|
||||
}
|
||||
MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
|
||||
}
|
||||
// Count the number of live intervals.
|
||||
MaxUsage = std::max(MaxUsage, OpenIntervals.size());
|
||||
|
||||
DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
|
||||
<< OpenIntervals.size() << '\n');
|
||||
DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
|
||||
OpenIntervals.size() << '\n');
|
||||
|
||||
// Add the current instruction to the list of open intervals.
|
||||
OpenIntervals.insert(I);
|
||||
}
|
||||
|
||||
for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
|
||||
unsigned Invariant = 0;
|
||||
for (auto Inst : LoopInvariants) {
|
||||
unsigned TypeSize =
|
||||
DL.getTypeSizeInBits(Inst->getType()->getScalarType());
|
||||
Invariant += std::max<unsigned>(1, VFs[i] * TypeSize / WidestRegister);
|
||||
}
|
||||
unsigned Invariant = LoopInvariants.size();
|
||||
DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
|
||||
DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
|
||||
DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
|
||||
|
||||
DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
|
||||
DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
|
||||
DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
|
||||
DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
|
||||
|
||||
RU.LoopInvariantRegs = Invariant;
|
||||
RU.MaxLocalUsers = MaxUsages[i];
|
||||
RUs[i] = RU;
|
||||
}
|
||||
|
||||
return RUs;
|
||||
R.LoopInvariantRegs = Invariant;
|
||||
R.MaxLocalUsers = MaxUsage;
|
||||
return R;
|
||||
}
|
||||
|
||||
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
|
||||
|
@ -1,46 +0,0 @@
|
||||
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
@a = global [1000 x i8] zeroinitializer, align 16
|
||||
@b = global [1000 x i8] zeroinitializer, align 16
|
||||
@c = global [1000 x i8] zeroinitializer, align 16
|
||||
@u = global [1000 x i32] zeroinitializer, align 16
|
||||
@v = global [1000 x i32] zeroinitializer, align 16
|
||||
@w = global [1000 x i32] zeroinitializer, align 16
|
||||
|
||||
; Tests that the vectorization factor is determined by the smallest instead of
|
||||
; widest type in the loop for maximum bandwidth when
|
||||
; -vectorizer-maximize-bandwidth is indicated.
|
||||
;
|
||||
; CHECK-label: foo
|
||||
; CHECK: LV: Selecting VF: 16.
|
||||
define void @foo() {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup:
|
||||
ret void
|
||||
|
||||
for.body:
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 %indvars.iv
|
||||
%0 = load i8, i8* %arrayidx, align 1
|
||||
%arrayidx2 = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 %indvars.iv
|
||||
%1 = load i8, i8* %arrayidx2, align 1
|
||||
%add = add i8 %1, %0
|
||||
%arrayidx6 = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 %indvars.iv
|
||||
store i8 %add, i8* %arrayidx6, align 1
|
||||
%arrayidx8 = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 %indvars.iv
|
||||
%2 = load i32, i32* %arrayidx8, align 4
|
||||
%arrayidx10 = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 %indvars.iv
|
||||
%3 = load i32, i32* %arrayidx10, align 4
|
||||
%add11 = add nsw i32 %3, %2
|
||||
%arrayidx13 = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 %indvars.iv
|
||||
store i32 %add11, i32* %arrayidx13, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 1000
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
||||
}
|
@ -17,7 +17,7 @@ target triple = "x86_64-apple-macosx10.8.0"
|
||||
; widest vector count.
|
||||
;
|
||||
; CHECK: test_consecutive_store
|
||||
; CHECK: The Smallest and Widest types: 64 / 64 bits.
|
||||
; CHECK: The Widest type: 64 bits
|
||||
define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 {
|
||||
%4 = load %0*, %0** %2, align 8
|
||||
%5 = icmp eq %0** %0, %1
|
||||
@ -51,7 +51,7 @@ define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwt
|
||||
; p[i][y] = (int*) (1 + q[i]);
|
||||
; }
|
||||
; CHECK: test_nonconsecutive_store
|
||||
; CHECK: The Smallest and Widest types: 16 / 16 bits.
|
||||
; CHECK: The Widest type: 16 bits
|
||||
define void @test_nonconsecutive_store() nounwind ssp uwtable {
|
||||
br label %1
|
||||
|
||||
@ -93,7 +93,7 @@ define void @test_nonconsecutive_store() nounwind ssp uwtable {
|
||||
;; Now we check the same rules for loads. We should take consecutive loads of
|
||||
;; pointer types into account.
|
||||
; CHECK: test_consecutive_ptr_load
|
||||
; CHECK: The Smallest and Widest types: 8 / 64 bits.
|
||||
; CHECK: The Widest type: 64 bits
|
||||
define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable {
|
||||
br label %1
|
||||
|
||||
@ -117,7 +117,7 @@ define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable {
|
||||
|
||||
;; However, we should not take unconsecutive loads of pointers into account.
|
||||
; CHECK: test_nonconsecutive_ptr_load
|
||||
; CHECK: LV: The Smallest and Widest types: 16 / 16 bits.
|
||||
; CHECK: The Widest type: 16 bits
|
||||
define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable {
|
||||
br label %1
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user