Reland^2 "[BOLT] Parallelize legacy profile merging"

Resovled the issue that when number of tasks is fewer than cores, we end
up creating as many threads as the number of cores, making the
performance worse than the single thread version.
This commit is contained in:
Yi Kong 2023-05-19 14:56:46 -07:00
parent 5734a81a55
commit 67cf01bd37

View File

@ -20,6 +20,9 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/ThreadPool.h"
#include <algorithm>
#include <mutex>
#include <unordered_map>
using namespace llvm;
@ -258,31 +261,40 @@ bool isYAML(const StringRef Filename) {
void mergeLegacyProfiles(const SmallVectorImpl<std::string> &Filenames) {
errs() << "Using legacy profile format.\n";
std::optional<bool> BoltedCollection;
StringMap<uint64_t> Entries;
for (const std::string &Filename : Filenames) {
std::mutex BoltedCollectionMutex;
typedef StringMap<uint64_t> ProfileTy;
auto ParseProfile = [&](const std::string &Filename, auto &Profiles) {
const llvm::thread::id tid = llvm::this_thread::get_id();
if (isYAML(Filename))
report_error(Filename, "cannot mix YAML and legacy formats");
ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
MemoryBuffer::getFileOrSTDIN(Filename);
if (std::error_code EC = MB.getError())
report_error(Filename, EC);
errs() << "Merging data from " << Filename << "...\n";
StringRef Buf = MB.get()->getBuffer();
// Check if the string "boltedcollection" is in the first line
if (Buf.startswith("boltedcollection\n")) {
if (!BoltedCollection.value_or(true))
report_error(
Filename,
"cannot mix profile collected in BOLT and non-BOLT deployments");
BoltedCollection = true;
Buf = Buf.drop_front(17);
} else {
if (BoltedCollection.value_or(false))
report_error(
Filename,
"cannot mix profile collected in BOLT and non-BOLT deployments");
BoltedCollection = false;
ProfileTy *Profile;
{
std::lock_guard<std::mutex> Lock(BoltedCollectionMutex);
// Check if the string "boltedcollection" is in the first line
if (Buf.startswith("boltedcollection\n")) {
if (!BoltedCollection.value_or(true))
report_error(
Filename,
"cannot mix profile collected in BOLT and non-BOLT deployments");
BoltedCollection = true;
Buf = Buf.drop_front(17);
} else {
if (BoltedCollection.value_or(false))
report_error(
Filename,
"cannot mix profile collected in BOLT and non-BOLT deployments");
BoltedCollection = false;
}
Profile = &Profiles[tid];
}
SmallVector<StringRef> Lines;
@ -295,15 +307,32 @@ void mergeLegacyProfiles(const SmallVectorImpl<std::string> &Filenames) {
uint64_t Count;
if (Line.substr(Pos + 1, Line.size() - Pos).getAsInteger(10, Count))
report_error(Filename, "Malformed / corrupted profile counter");
Count += Entries.lookup(Signature);
Entries.insert_or_assign(Signature, Count);
Count += Profile->lookup(Signature);
Profile->insert_or_assign(Signature, Count);
}
};
// The final reduction has non-trivial cost, make sure each thread has at
// least 4 tasks.
ThreadPoolStrategy S = optimal_concurrency(
std::max(Filenames.size() / 4, static_cast<size_t>(1)));
ThreadPool Pool(S);
DenseMap<llvm::thread::id, ProfileTy> ParsedProfiles(Pool.getThreadCount());
for (const auto &Filename : Filenames)
Pool.async(ParseProfile, std::cref(Filename), std::ref(ParsedProfiles));
Pool.wait();
ProfileTy MergedProfile;
for (const auto &[Thread, Profile] : ParsedProfiles)
for (const auto &[Key, Value] : Profile) {
uint64_t Count = MergedProfile.lookup(Key) + Value;
MergedProfile.insert_or_assign(Key, Count);
}
}
if (BoltedCollection)
output() << "boltedcollection\n";
for (const auto &Entry : Entries)
output() << Entry.getKey() << " " << Entry.getValue() << "\n";
for (const auto &[Key, Value] : MergedProfile)
output() << Key << " " << Value << "\n";
errs() << "Profile from " << Filenames.size() << " files merged.\n";
}