[llvm-mca] Move the logic that computes the block throughput into Support.h. NFC

This will allow us to share the logic that computes the block throughput with
other views.

llvm-svn: 333755
This commit is contained in:
Andrea Di Biagio 2018-06-01 14:35:21 +00:00
parent a75eb0fe77
commit 0110f33d56
4 changed files with 63 additions and 48 deletions

View File

@ -24,6 +24,14 @@ namespace mca {
using namespace llvm;
// Construct a summary view from the scheduling model `Model`, the source
// manager `S` and the dispatch width `Width`.
// The cycle and micro opcode counters start at zero. ProcResourceUsage and
// ProcResourceMasks are both created with Model.getNumProcResourceKinds()
// zero-initialized elements.
SummaryView::SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
unsigned Width)
: SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
ProcResourceMasks(Model.getNumProcResourceKinds(), 0) {
// Let computeProcResourceMasks populate the masks for this scheduling model.
// onInstructionEvent searches this vector for a resource mask and uses the
// position of the match as the index into ProcResourceUsage.
computeProcResourceMasks(SM, ProcResourceMasks);
}
void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) {
// We are only interested in the "instruction dispatched" events generated by
// the dispatch stage for instructions that are part of iteration #0.
@ -41,48 +49,14 @@ void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) {
const InstrDesc &Desc = Inst.getDesc();
NumMicroOps += Desc.NumMicroOps;
for (const std::pair<uint64_t, const ResourceUsage> &RU : Desc.Resources) {
if (!RU.second.size())
continue;
assert(RU.second.NumUnits && "Expected more than one unit used!");
if (ProcResourceUsage.find(RU.first) == ProcResourceUsage.end()) {
ProcResourceUsage[RU.first] = RU.second.size();
continue;
}
ProcResourceUsage[RU.first] += RU.second.size();
}
}
double SummaryView::getBlockRThroughput() const {
assert(NumMicroOps && "Expected at least one micro opcode!");
SmallVector<uint64_t, 8> Masks(SM.getNumProcResourceKinds());
computeProcResourceMasks(SM, Masks);
// The block throughput is bounded from above by the hardware dispatch
// throughput. That is because the DispatchWidth is an upper bound on the
// number of opcodes that can be part of a single dispatch group.
double Max = static_cast<double>(NumMicroOps) / DispatchWidth;
// The block throughput is also limited by the amount of hardware parallelism.
// The number of available resource units affects the resource pressure
// distribution, as well as how many blocks can be executed every cycle.
for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
uint64_t Mask = Masks[I];
const auto It = ProcResourceUsage.find_as(Mask);
if (It != ProcResourceUsage.end()) {
const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
unsigned NumUnits = MCDesc.NumUnits;
double Throughput = static_cast<double>(It->second) / NumUnits;
Max = std::max(Max, Throughput);
if (RU.second.size()) {
const auto It = find(ProcResourceMasks, RU.first);
assert(It != ProcResourceMasks.end() &&
"Invalid processor resource mask!");
ProcResourceUsage[std::distance(ProcResourceMasks.begin(), It)] +=
RU.second.size();
}
}
// The block reciprocal throughput is computed as the MAX of:
// - (#uOps / DispatchWidth)
// - (resource cycles / #units) for every consumed processor resource.
return Max;
}
void SummaryView::printView(raw_ostream &OS) const {
@ -90,7 +64,8 @@ void SummaryView::printView(raw_ostream &OS) const {
unsigned Instructions = Source.size();
unsigned TotalInstructions = Instructions * Iterations;
double IPC = (double)TotalInstructions / TotalCycles;
double BlockRThroughput = getBlockRThroughput();
double BlockRThroughput = computeBlockRThroughput(
SM, DispatchWidth, NumMicroOps, ProcResourceUsage);
std::string Buffer;
raw_string_ostream TempStream(Buffer);

View File

@ -45,10 +45,15 @@ class SummaryView : public View {
unsigned TotalCycles;
// The total number of micro opcodes contributed by a block of instructions.
unsigned NumMicroOps;
// For each processor resource, this map stores the cumulative number of
// resource cycles consumed by a block of instructions. The resource mask ID
// is used as the key value to access elements of this map.
llvm::DenseMap<uint64_t, unsigned> ProcResourceUsage;
// For each processor resource, this vector stores the cumulative number of
// resource cycles consumed by the analyzed code block.
llvm::SmallVector<unsigned, 8> ProcResourceUsage;
// Each processor resource is associated with a so-called processor resource
// mask. This vector allows correlating processor resource IDs with processor
// resource masks. There is exactly one element for each processor resource
// declared by the scheduling model.
llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
// Compute the reciprocal throughput for the analyzed code block.
// The reciprocal block throughput is computed as the MAX between:
@ -58,9 +63,7 @@ class SummaryView : public View {
public:
SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
unsigned Width)
: SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
NumMicroOps(0) {}
unsigned Width);
void onCycleEnd() override { ++TotalCycles; }

View File

@ -48,4 +48,32 @@ void computeProcResourceMasks(const MCSchedModel &SM,
ProcResourceID++;
}
}
double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
unsigned NumMicroOps,
ArrayRef<unsigned> ProcResourceUsage) {
// The block throughput is bounded from above by the hardware dispatch
// throughput. That is because the DispatchWidth is an upper bound on the
// number of opcodes that can be part of a single dispatch group.
double Max = static_cast<double>(NumMicroOps) / DispatchWidth;
// The block throughput is also limited by the amount of hardware parallelism.
// The number of available resource units affects the resource pressure
// distribution, as well as how many blocks can be executed every cycle.
for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
unsigned ResourceCycles = ProcResourceUsage[I];
if (!ResourceCycles)
continue;
const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
double Throughput = static_cast<double>(ResourceCycles) / MCDesc.NumUnits;
Max = std::max(Max, Throughput);
}
// The block reciprocal throughput is computed as the MAX of:
// - (NumMicroOps / DispatchWidth)
// - (NumUnits / ResourceCycles) for every consumed processor resource.
return Max;
}
} // namespace mca

View File

@ -15,6 +15,7 @@
#ifndef LLVM_TOOLS_LLVM_MCA_SUPPORT_H
#define LLVM_TOOLS_LLVM_MCA_SUPPORT_H
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
@ -44,6 +45,14 @@ namespace mca {
/// problems with simple bit manipulation operations.
void computeProcResourceMasks(const llvm::MCSchedModel &SM,
llvm::SmallVectorImpl<uint64_t> &Masks);
/// Compute the reciprocal block throughput from a set of processor resource
/// cycles. The reciprocal block throughput is computed as the MAX between:
/// - NumMicroOps / DispatchWidth
/// - ProcResourceCycles / #ProcResourceUnits (for every consumed resource).
///
/// \p ProcResourceUsage is indexed by processor resource index (the same index
/// that is passed to SM.getProcResource) and is read for every index in
/// [0, SM.getNumProcResourceKinds()). Entries equal to zero are skipped, i.e.
/// the corresponding resource is treated as not consumed.
double computeBlockRThroughput(const llvm::MCSchedModel &SM,
unsigned DispatchWidth, unsigned NumMicroOps,
llvm::ArrayRef<unsigned> ProcResourceUsage);
} // namespace mca
#endif