diff --git a/tools/llvm-mca/SummaryView.cpp b/tools/llvm-mca/SummaryView.cpp index c4746c62c0d..5cb5c23c973 100644 --- a/tools/llvm-mca/SummaryView.cpp +++ b/tools/llvm-mca/SummaryView.cpp @@ -24,6 +24,14 @@ namespace mca { using namespace llvm; +SummaryView::SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S, + unsigned Width) + : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0), + NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0), + ProcResourceMasks(Model.getNumProcResourceKinds(), 0) { + computeProcResourceMasks(SM, ProcResourceMasks); +} + void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) { // We are only interested in the "instruction dispatched" events generated by // the dispatch stage for instructions that are part of iteration #0. @@ -41,48 +49,14 @@ void SummaryView::onInstructionEvent(const HWInstructionEvent &Event) { const InstrDesc &Desc = Inst.getDesc(); NumMicroOps += Desc.NumMicroOps; for (const std::pair &RU : Desc.Resources) { - if (!RU.second.size()) - continue; - - assert(RU.second.NumUnits && "Expected more than one unit used!"); - if (ProcResourceUsage.find(RU.first) == ProcResourceUsage.end()) { - ProcResourceUsage[RU.first] = RU.second.size(); - continue; - } - - ProcResourceUsage[RU.first] += RU.second.size(); - } -} - -double SummaryView::getBlockRThroughput() const { - assert(NumMicroOps && "Expected at least one micro opcode!"); - - SmallVector Masks(SM.getNumProcResourceKinds()); - computeProcResourceMasks(SM, Masks); - - // The block throughput is bounded from above by the hardware dispatch - // throughput. That is because the DispatchWidth is an upper bound on the - // number of opcodes that can be part of a single dispatch group. - double Max = static_cast(NumMicroOps) / DispatchWidth; - - // The block throughput is also limited by the amount of hardware parallelism. 
- // The number of available resource units affects the resource pressure - // distributed, as well as how many blocks can be executed every cycle. - for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { - uint64_t Mask = Masks[I]; - const auto It = ProcResourceUsage.find_as(Mask); - if (It != ProcResourceUsage.end()) { - const MCProcResourceDesc &MCDesc = *SM.getProcResource(I); - unsigned NumUnits = MCDesc.NumUnits; - double Throughput = static_cast(It->second) / NumUnits; - Max = std::max(Max, Throughput); + if (RU.second.size()) { + const auto It = find(ProcResourceMasks, RU.first); + assert(It != ProcResourceMasks.end() && + "Invalid processor resource mask!"); + ProcResourceUsage[std::distance(ProcResourceMasks.begin(), It)] += + RU.second.size(); } } - - // The block reciprocal throughput is computed as the MAX of: - // - (#uOps / DispatchWidth) - // - (#units / resource cycles) for every consumed processor resource. - return Max; } void SummaryView::printView(raw_ostream &OS) const { @@ -90,7 +64,8 @@ void SummaryView::printView(raw_ostream &OS) const { unsigned Instructions = Source.size(); unsigned TotalInstructions = Instructions * Iterations; double IPC = (double)TotalInstructions / TotalCycles; - double BlockRThroughput = getBlockRThroughput(); + double BlockRThroughput = computeBlockRThroughput( + SM, DispatchWidth, NumMicroOps, ProcResourceUsage); std::string Buffer; raw_string_ostream TempStream(Buffer); diff --git a/tools/llvm-mca/SummaryView.h b/tools/llvm-mca/SummaryView.h index fe8a5e20f9d..04f4a871247 100644 --- a/tools/llvm-mca/SummaryView.h +++ b/tools/llvm-mca/SummaryView.h @@ -45,10 +45,15 @@ class SummaryView : public View { unsigned TotalCycles; // The total number of micro opcodes contributed by a block of instructions. unsigned NumMicroOps; - // For each processor resource, this map stores the cumulative number of - // resource cycles consumed by a block of instructions. 
The resource mask ID - // is used as the key value to access elements of this map. - llvm::DenseMap ProcResourceUsage; + // For each processor resource, this vector stores the cumulative number of + // resource cycles consumed by the analyzed code block. + llvm::SmallVector ProcResourceUsage; + + // Each processor resource is associated with a so-called processor resource + // mask. This vector allows correlating processor resource IDs with processor + // resource masks. There is exactly one element per processor resource + // declared by the scheduling model. + llvm::SmallVector ProcResourceMasks; // Compute the reciprocal throughput for the analyzed code block. // The reciprocal block throughput is computed as the MAX between: @@ -58,9 +63,7 @@ class SummaryView : public View { public: SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S, - unsigned Width) - : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0), - NumMicroOps(0) {} + unsigned Width); void onCycleEnd() override { ++TotalCycles; } diff --git a/tools/llvm-mca/Support.cpp b/tools/llvm-mca/Support.cpp index e97cc51430d..fac237f740d 100644 --- a/tools/llvm-mca/Support.cpp +++ b/tools/llvm-mca/Support.cpp @@ -48,4 +48,32 @@ void computeProcResourceMasks(const MCSchedModel &SM, ProcResourceID++; } } + +double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth, + unsigned NumMicroOps, + ArrayRef ProcResourceUsage) { + // The block throughput is bounded from above by the hardware dispatch + // throughput. That is because the DispatchWidth is an upper bound on the + // number of opcodes that can be part of a single dispatch group. + double Max = static_cast(NumMicroOps) / DispatchWidth; + + // The block throughput is also limited by the amount of hardware parallelism. + // The number of available resource units affects the resource pressure + // distribution, as well as how many blocks can be executed every cycle. 
+ for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { + unsigned ResourceCycles = ProcResourceUsage[I]; + if (!ResourceCycles) + continue; + + const MCProcResourceDesc &MCDesc = *SM.getProcResource(I); + double Throughput = static_cast(ResourceCycles) / MCDesc.NumUnits; + Max = std::max(Max, Throughput); + } + + // The block reciprocal throughput is computed as the MAX of: + // - (NumMicroOps / DispatchWidth) + // - (ResourceCycles / NumUnits) for every consumed processor resource. + return Max; +} + } // namespace mca diff --git a/tools/llvm-mca/Support.h b/tools/llvm-mca/Support.h index a29ef8b58e2..98302e7e15c 100644 --- a/tools/llvm-mca/Support.h +++ b/tools/llvm-mca/Support.h @@ -15,6 +15,7 @@ #ifndef LLVM_TOOLS_LLVM_MCA_SUPPORT_H #define LLVM_TOOLS_LLVM_MCA_SUPPORT_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCSchedule.h" @@ -44,6 +45,14 @@ namespace mca { /// problems with simple bit manipulation operations. void computeProcResourceMasks(const llvm::MCSchedModel &SM, llvm::SmallVectorImpl &Masks); + +/// Compute the reciprocal block throughput from a set of processor resource +/// cycles. The reciprocal block throughput is computed as the MAX between: +/// - NumMicroOps / DispatchWidth +/// - ProcResourceCycles / #ProcResourceUnits (for every consumed resource). +double computeBlockRThroughput(const llvm::MCSchedModel &SM, + unsigned DispatchWidth, unsigned NumMicroOps, + llvm::ArrayRef ProcResourceUsage); } // namespace mca #endif