[BOLT] Dump dynamic execution per instruction opcode

Summary:
We extended DynoStats to dump the histogram per instruction opcode. By
default the dump is turned off. Use '-print-dyno-opcode-stats' to enable
the dump.

BOLT also dumps for each instruction opcode the maximum execution count and
corresponding function name and basic block offsets where the instruction
occurs. Below is a sample of the dump:

                   Opcode,    Execution Count,      Max Exec Count, Function Name:Offset
                  SHR8rCL,                232,                 232, _ZNK5folly14AsyncSSLSocket4goodEv:53
                VPADDDYrr,              13956,                 388, chacha20_encrypt_bytes.part.0/3:736
               PMOVSXBWrr,                  4,                   2, ares_expand_name/1:264
                VMOVAPSmr,               1082,                  43, chacha20_encrypt_bytes.part.0/3:2864
                VPSHUFBrr,               9540,                1667, chacha20_encrypt_bytes.part.0/3:4416
            VPUNPCKLDQYrr,               1102,                 188, jsimd_ycc_rgb_convert_avx2/1:125
          VPBROADCASTQYrm,                 39,                  39, chacha20_encrypt_bytes.part.0/3:400
               PMOVSXWDrr,                  8,                   2, ares_expand_name/1:264
                   VPORrr,                817,                 129, jsimd_idct_islow_avx2/1:41
                  PSLLDri,            8690752,               65644, blockmix_salsa8_xor/1:1424

(cherry picked from FBD28859624)
This commit is contained in:
Zino Benaissa 2021-05-24 21:33:43 -07:00 committed by Maksim Panchenko
parent c9f5f47b51
commit 60b15062e1
3 changed files with 111 additions and 5 deletions

View File

@ -37,6 +37,14 @@ DynoStatsScale("dyno-stats-scale",
cl::Hidden,
cl::cat(BoltCategory));
static cl::opt<uint32_t>
PrintDynoOpcodeStat("print-dyno-opcode-stats",
cl::desc("print per instruction opcode dyno stats and the function"
"names:BB offsets of the nth highest execution counts"),
cl::init(0),
cl::Hidden,
cl::cat(BoltCategory));
} // namespace opts
namespace llvm {
@ -69,7 +77,8 @@ bool DynoStats::lessThan(const DynoStats &Other,
);
}
void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const {
void DynoStats::print(raw_ostream &OS, const DynoStats *Other,
MCInstPrinter *Printer) const {
auto printStatWithDelta = [&](const std::string &Name, uint64_t Stat,
uint64_t OtherStat) {
OS << format("%'20lld : ", Stat * opts::DynoStatsScale) << Name;
@ -95,6 +104,35 @@ void DynoStats::print(raw_ostream &OS, const DynoStats *Other) const {
printStatWithDelta(Desc[Stat], Stats[Stat], Other ? (*Other)[Stat] : 0);
}
if (opts::PrintDynoOpcodeStat && Printer) {
outs() << "\nProgram-wide opcode histogram:\n";
OS << " Opcode, Execution Count, Max Exec Count, "
"Function Name:Offset ...\n";
std::vector<std::pair<uint64_t, unsigned>> SortedHistogram;
for (const OpcodeStatTy &Stat : OpcodeHistogram)
SortedHistogram.emplace_back(Stat.second.first, Stat.first);
// Sort using lexicographic ordering
std::sort(SortedHistogram.begin(), SortedHistogram.end());
// Dump in ascending order: Start with Opcode with Highest execution
// count.
for (auto Stat = SortedHistogram.rbegin(); Stat != SortedHistogram.rend();
++Stat) {
OS << format("%20s,%'18lld",
Printer->getOpcodeName(Stat->second).data(),
Stat->first * opts::DynoStatsScale);
MaxOpcodeHistogramTy MaxMultiMap =
OpcodeHistogram.at(Stat->second).second;
// Start with function name:BB offset with highest execution count.
for (auto Max = MaxMultiMap.rbegin(); Max != MaxMultiMap.rend(); ++Max) {
OS << format(", %'18lld, ", Max->first * opts::DynoStatsScale)
<< Max->second.first.str() << ':' << Max->second.second;
}
OS << '\n';
}
}
}
void DynoStats::operator+=(const DynoStats &Other) {
@ -103,6 +141,29 @@ void DynoStats::operator+=(const DynoStats &Other) {
++Stat) {
Stats[Stat] += Other[Stat];
}
for (const OpcodeStatTy &Stat : Other.OpcodeHistogram) {
auto I = OpcodeHistogram.find(Stat.first);
if (I == OpcodeHistogram.end()) {
OpcodeHistogram.emplace(Stat);
} else {
// Merge Other Historgrams, log only the opts::PrintDynoOpcodeStat'th
// maximum counts.
I->second.first += Stat.second.first;
auto &MMap = I->second.second;
auto &OtherMMap = Stat.second.second;
auto Size = MMap.size();
assert(Size <= opts::PrintDynoOpcodeStat);
for (auto Iter = OtherMMap.rbegin(); Iter != OtherMMap.rend(); ++Iter) {
if (Size++ >= opts::PrintDynoOpcodeStat) {
auto First = MMap.begin();
if (Iter->first <= First->first)
break;
MMap.erase(First);
}
MMap.emplace(*Iter);
}
}
}
}
DynoStats getDynoStats(const BinaryFunction &BF) {
@ -133,15 +194,43 @@ DynoStats getDynoStats(const BinaryFunction &BF) {
if(BF.isAArch64Veneer())
Stats[DynoStats::VENEER_CALLS_AARCH64] += BF.getKnownExecutionCount();
// Count the number of calls by iterating through all instructions.
// Count various instruction types by iterating through all instructions.
// When -print-dyno-opcode-stats is on, count per each opcode and record
// maximum execution counts.
for (const MCInst &Instr : *BB) {
if (opts::PrintDynoOpcodeStat) {
unsigned Opcode = Instr.getOpcode();
auto I = Stats.OpcodeHistogram.find(Opcode);
if (I == Stats.OpcodeHistogram.end()) {
DynoStats::MaxOpcodeHistogramTy MMap;
MMap.emplace(BBExecutionCount,
std::make_pair(BF.getOneName(), BB->getOffset()));
Stats.OpcodeHistogram.emplace(Opcode,
std::make_pair(BBExecutionCount, MMap));
} else {
I->second.first += BBExecutionCount;
bool Insert = true;
if (I->second.second.size() == opts::PrintDynoOpcodeStat) {
auto First = I->second.second.begin();
if (First->first < BBExecutionCount)
I->second.second.erase(First);
else
Insert = false;
}
if (Insert) {
I->second.second.emplace(
BBExecutionCount,
std::make_pair(BF.getOneName(), BB->getOffset()));
}
}
}
if (BC.MIB->isStore(Instr)) {
Stats[DynoStats::STORES] += BBExecutionCount;
}
if (BC.MIB->isLoad(Instr)) {
Stats[DynoStats::LOADS] += BBExecutionCount;
}
if (!BC.MIB->isCall(Instr))
continue;

View File

@ -12,7 +12,10 @@
#define LLVM_TOOLS_LLVM_BOLT_DYNO_STATS_H
#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <unordered_map>
namespace llvm {
@ -102,7 +105,8 @@ public:
return 0;
}
void print(raw_ostream &OS, const DynoStats *Other = nullptr) const;
void print(raw_ostream &OS, const DynoStats *Other = nullptr,
MCInstPrinter *Printer = nullptr) const;
void operator+=(const DynoStats &Other);
bool operator<(const DynoStats &Other) const;
@ -113,6 +117,19 @@ public:
static const char* Description(const Category C) {
return Desc[C];
}
/// Maps instruction opcodes to:
/// 1. Accumulated executed instruction counts.
/// 2. a multimap that records highest execution counts, function names,
/// and BB offsets where intructions of these opcodes occur.
using MaxOpcodeHistogramTy =
std::multimap<uint64_t, std::pair<StringRef, uint32_t>>;
using OpcodeStatTy =
std::pair<unsigned, std::pair<uint64_t, MaxOpcodeHistogramTy>>;
using OpcodeHistogramTy =
std::unordered_map<unsigned, std::pair<uint64_t, MaxOpcodeHistogramTy>>;
OpcodeHistogramTy OpcodeHistogram;
};
inline raw_ostream &operator<<(raw_ostream &OS, const DynoStats &Stats) {

View File

@ -83,7 +83,7 @@ public:
<< PrevDynoStats;
if (Changed) {
outs() << '\n';
NewDynoStats.print(outs(), &PrevDynoStats);
NewDynoStats.print(outs(), &PrevDynoStats, BC.InstPrinter.get());
}
outs() << '\n';
}