Mirror of https://github.com/RPCS3/llvm-mirror.git (synced 2025-03-03 01:48:15 +00:00)
[llvm-mca] Move the logic that computes the block throughput into Support.h. NFC
This will allow us to share the logic that computes the block throughput with other views.

llvm-svn: 333755
This commit is contained in:
  parent a75eb0fe77
  commit 0110f33d56
tools/llvm-mca/SummaryView.cpp:

@@ -24,6 +24,14 @@ namespace mca {

 using namespace llvm;

+SummaryView::SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
+                         unsigned Width)
+    : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
+      NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
+      ProcResourceMasks(Model.getNumProcResourceKinds(), 0) {
+  computeProcResourceMasks(SM, ProcResourceMasks);
+}
+
 void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) {
   // We are only interested in the "instruction dispatched" events generated by
   // the dispatch stage for instructions that are part of iteration #0.
@@ -41,48 +49,14 @@ void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) {
   const InstrDesc &Desc = Inst.getDesc();
   NumMicroOps += Desc.NumMicroOps;
   for (const std::pair<uint64_t, const ResourceUsage> &RU : Desc.Resources) {
-    if (!RU.second.size())
-      continue;
-
-    assert(RU.second.NumUnits && "Expected more than one unit used!");
-    if (ProcResourceUsage.find(RU.first) == ProcResourceUsage.end()) {
-      ProcResourceUsage[RU.first] = RU.second.size();
-      continue;
-    }
-
-    ProcResourceUsage[RU.first] += RU.second.size();
+    if (RU.second.size()) {
+      const auto It = find(ProcResourceMasks, RU.first);
+      assert(It != ProcResourceMasks.end() &&
+             "Invalid processor resource mask!");
+      ProcResourceUsage[std::distance(ProcResourceMasks.begin(), It)] +=
+          RU.second.size();
+    }
   }
 }

-double SummaryView::getBlockRThroughput() const {
-  assert(NumMicroOps && "Expected at least one micro opcode!");
-
-  SmallVector<uint64_t, 8> Masks(SM.getNumProcResourceKinds());
-  computeProcResourceMasks(SM, Masks);
-
-  // The block throughput is bounded from above by the hardware dispatch
-  // throughput. That is because the DispatchWidth is an upper bound on the
-  // number of opcodes that can be part of a single dispatch group.
-  double Max = static_cast<double>(NumMicroOps) / DispatchWidth;
-
-  // The block throughput is also limited by the amount of hardware parallelism.
-  // The number of available resource units affects the resource pressure
-  // distributed, as well as how many blocks can be executed every cycle.
-  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
-    uint64_t Mask = Masks[I];
-    const auto It = ProcResourceUsage.find_as(Mask);
-    if (It != ProcResourceUsage.end()) {
-      const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
-      unsigned NumUnits = MCDesc.NumUnits;
-      double Throughput = static_cast<double>(It->second) / NumUnits;
-      Max = std::max(Max, Throughput);
-    }
-  }
-
-  // The block reciprocal throughput is computed as the MAX of:
-  // - (#uOps / DispatchWidth)
-  // - (#units / resource cycles) for every consumed processor resource.
-  return Max;
-}
-
 void SummaryView::printView(raw_ostream &OS) const {
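The rewritten loop above swaps the DenseMap keyed by resource mask for a plain vector indexed by processor resource ID, recovering the index by searching ProcResourceMasks for the mask. A minimal standalone sketch of that mask-to-index step, with simplified types and a hypothetical helper name (only the find/std::distance pattern comes from the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <vector>

// Hypothetical helper illustrating the lookup used above: translate a
// processor resource mask into the index of its entry in the masks table.
static unsigned resourceMaskToIndex(const std::vector<uint64_t> &Masks,
                                    uint64_t Mask) {
  auto It = std::find(Masks.begin(), Masks.end(), Mask);
  assert(It != Masks.end() && "Invalid processor resource mask!");
  return static_cast<unsigned>(std::distance(Masks.begin(), It));
}

// Example: with masks {0x1, 0x2, 0x4}, mask 0x4 maps to index 2, so the
// resource cycles for that resource accumulate into ProcResourceUsage[2].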
@@ -90,7 +64,8 @@ void SummaryView::printView(raw_ostream &OS) const {
   unsigned Instructions = Source.size();
   unsigned TotalInstructions = Instructions * Iterations;
   double IPC = (double)TotalInstructions / TotalCycles;
-  double BlockRThroughput = getBlockRThroughput();
+  double BlockRThroughput = computeBlockRThroughput(
+      SM, DispatchWidth, NumMicroOps, ProcResourceUsage);

   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
tools/llvm-mca/SummaryView.h:

@@ -45,10 +45,15 @@ class SummaryView : public View {
   unsigned TotalCycles;
   // The total number of micro opcodes contributed by a block of instructions.
   unsigned NumMicroOps;
-  // For each processor resource, this map stores the cumulative number of
-  // resource cycles consumed by a block of instructions. The resource mask ID
-  // is used as the key value to access elements of this map.
-  llvm::DenseMap<uint64_t, unsigned> ProcResourceUsage;
+  // For each processor resource, this vector stores the cumulative number of
+  // resource cycles consumed by the analyzed code block.
+  llvm::SmallVector<unsigned, 8> ProcResourceUsage;
+
+  // Each processor resource is associated with a so-called processor resource
+  // mask. This vector allows to correlate processor resource IDs with processor
+  // resource masks. There is exactly one element per each processor resource
+  // declared by the scheduling model.
+  llvm::SmallVector<uint64_t, 8> ProcResourceMasks;

   // Compute the reciprocal throughput for the analyzed code block.
   // The reciprocal block throughput is computed as the MAX between:
@@ -58,9 +63,7 @@ class SummaryView : public View {

 public:
   SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
-              unsigned Width)
-      : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
-        NumMicroOps(0) {}
+              unsigned Width);

   void onCycleEnd() override { ++TotalCycles; }

tools/llvm-mca/Support.cpp:

@@ -48,4 +48,32 @@ void computeProcResourceMasks(const MCSchedModel &SM,
     ProcResourceID++;
   }
 }
+
+double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
+                               unsigned NumMicroOps,
+                               ArrayRef<unsigned> ProcResourceUsage) {
+  // The block throughput is bounded from above by the hardware dispatch
+  // throughput. That is because the DispatchWidth is an upper bound on the
+  // number of opcodes that can be part of a single dispatch group.
+  double Max = static_cast<double>(NumMicroOps) / DispatchWidth;
+
+  // The block throughput is also limited by the amount of hardware parallelism.
+  // The number of available resource units affects the resource pressure
+  // distribution, as well as how many blocks can be executed every cycle.
+  for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+    unsigned ResourceCycles = ProcResourceUsage[I];
+    if (!ResourceCycles)
+      continue;
+
+    const MCProcResourceDesc &MCDesc = *SM.getProcResource(I);
+    double Throughput = static_cast<double>(ResourceCycles) / MCDesc.NumUnits;
+    Max = std::max(Max, Throughput);
+  }
+
+  // The block reciprocal throughput is computed as the MAX of:
+  // - (NumMicroOps / DispatchWidth)
+  // - (NumUnits / ResourceCycles) for every consumed processor resource.
+  return Max;
+}
+
 } // namespace mca
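With computeBlockRThroughput exported from Support.h, other views can reuse it on whatever per-resource cycle counts they accumulate, which is the stated motivation for this change. A hedged sketch of such a consumer follows; the class and member names are hypothetical, and only computeBlockRThroughput and its signature come from this patch:

#include "Support.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"

namespace mca {

// Hypothetical view that reuses the shared helper instead of duplicating the
// MAX-based reciprocal throughput computation.
class HypotheticalThroughputView {
  const llvm::MCSchedModel &SM;
  unsigned DispatchWidth;
  unsigned NumMicroOps = 0;
  llvm::SmallVector<unsigned, 8> ProcResourceUsage;

public:
  HypotheticalThroughputView(const llvm::MCSchedModel &Model, unsigned Width)
      : SM(Model), DispatchWidth(Width),
        ProcResourceUsage(Model.getNumProcResourceKinds(), 0) {}

  // NumMicroOps and ProcResourceUsage would be accumulated while observing
  // dispatch events, as SummaryView does above.
  double getBlockRThroughput() const {
    return computeBlockRThroughput(SM, DispatchWidth, NumMicroOps,
                                   ProcResourceUsage);
  }
};

} // namespace mca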
tools/llvm-mca/Support.h:

@@ -15,6 +15,7 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_SUPPORT_H
 #define LLVM_TOOLS_LLVM_MCA_SUPPORT_H

+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"

@@ -44,6 +45,14 @@ namespace mca {
 /// problems with simple bit manipulation operations.
 void computeProcResourceMasks(const llvm::MCSchedModel &SM,
                               llvm::SmallVectorImpl<uint64_t> &Masks);
+
+/// Compute the reciprocal block throughput from a set of processor resource
+/// cycles. The reciprocal block throughput is computed as the MAX between:
+///   - NumMicroOps / DispatchWidth
+///   - ProcResourceCycles / #ProcResourceUnits (for every consumed resource).
+double computeBlockRThroughput(const llvm::MCSchedModel &SM,
+                               unsigned DispatchWidth, unsigned NumMicroOps,
+                               llvm::ArrayRef<unsigned> ProcResourceUsage);
 } // namespace mca

 #endif
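As a hypothetical worked example of the documented formula (the numbers are illustrative, not taken from any scheduling model): for a block of 6 micro opcodes on a machine with DispatchWidth = 4, where one resource with 2 units accumulates 5 resource cycles and another with a single unit accumulates 1 cycle, the reciprocal throughput is max(6/4, 5/2, 1/1) = 2.5 cycles per block iteration; here the 2-unit resource, not the dispatch width, is the bottleneck.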