From 20f66720da2db9b29b1ef38f951ed4cc26c18500 Mon Sep 17 00:00:00 2001 From: Xinliang David Li Date: Thu, 31 Dec 2015 07:57:16 +0000 Subject: [PATCH] [PGO]: Implement Func PGO name string compression This is part of the effort/preparation to reduce the size of instr-pgo (object, binary, memory footprint, and raw data). The functionality is currently off by default and not yet used by any clients. llvm-svn: 256667 --- include/llvm/ProfileData/InstrProf.h | 38 ++++++++- lib/ProfileData/InstrProf.cpp | 101 +++++++++++++++++++++++- unittests/ProfileData/InstrProfTest.cpp | 61 ++++++++++++++ 3 files changed, 196 insertions(+), 4 deletions(-) diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index 4688759a3bd..d8e9174196d 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -160,6 +160,29 @@ GlobalVariable *createPGOFuncNameVar(Module &M, /// the original (static) function name. StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName, StringRef FileName); +/// Given a vector of strings (function PGO names) \c NameStrs, the +/// method generates a combined string \c Result that is ready to be +/// serialized. The \c Result string is comprised of three fields: +/// The first field is the length of the uncompressed strings, and the +/// second field is the length of the zlib-compressed string. +/// Both fields are encoded in ULEB128. If \c doCompression is false, the +/// third field is the uncompressed strings; otherwise it is the +/// compressed string. When the string compression is off, the +/// second field will have value zero. +int collectPGOFuncNameStrings(const std::vector &NameStrs, + bool doCompression, std::string &Result); +/// Produce \c Result string with the same format described above. The input +/// is a vector of PGO function name variables that are referenced. 
+int collectPGOFuncNameStrings(const std::vector &NameVars, + std::string &Result); +class InstrProfSymtab; +/// \c NameStrings is a string composed of one or more sub-strings encoded in +/// the +/// format described above. The substrings are separated by 0 or more zero +/// bytes. +/// This method decodes the string and populates the \c Symtab. +int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab); + const std::error_category &instrprof_category(); enum class instrprof_error { @@ -235,6 +258,11 @@ public: /// This interface is used by reader of CoverageMapping test /// format. inline std::error_code create(StringRef D, uint64_t BaseAddr); + /// \c NameStrings is a string composed of one or more sub-strings + /// encoded in the format described above. The substrings are + /// separated by 0 or more zero bytes. This method decodes the + /// string and populates the \c Symtab. + inline std::error_code create(StringRef NameStrings); /// Create InstrProfSymtab from a set of names iteratable from /// \p IterRange. This interface is used by IndexedProfReader. template void create(const NameIterRange &IterRange); @@ -255,8 +283,8 @@ public: AddrToMD5Map.push_back(std::make_pair(Addr, MD5Val)); } AddrHashMap &getAddrHashMap() { return AddrToMD5Map; } - /// Return function's PGO name from the function name's symabol - /// address in the object file. If an error occurs, Return + /// Return function's PGO name from the function name's symbol + /// address in the object file. If an error occurs, return /// an empty string. StringRef getFuncName(uint64_t FuncNameAddress, size_t NameSize); /// Return function's PGO name from the name's md5 hash value. 
@@ -270,6 +298,12 @@ std::error_code InstrProfSymtab::create(StringRef D, uint64_t BaseAddr) { return std::error_code(); } +std::error_code InstrProfSymtab::create(StringRef NameStrings) { + if (readPGOFuncNameStrings(NameStrings, *this)) + return make_error_code(instrprof_error::malformed); + return std::error_code(); +} + template void InstrProfSymtab::create(const NameIterRange &IterRange) { for (auto Name : IterRange) diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index f5acd23129d..df3f8fade3b 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -12,12 +12,14 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ProfileData/InstrProf.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/ProfileData/InstrProf.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/ManagedStatic.h" using namespace llvm; @@ -162,6 +164,101 @@ GlobalVariable *createPGOFuncNameVar(Function &F, StringRef FuncName) { return createPGOFuncNameVar(*F.getParent(), F.getLinkage(), FuncName); } +int collectPGOFuncNameStrings(const std::vector &NameStrs, + bool doCompression, std::string &Result) { + uint8_t Header[16], *P = Header; + std::string UncompressedNameStrings; + + for (auto NameStr : NameStrs) { + UncompressedNameStrings += NameStr; + UncompressedNameStrings.append(" "); + } + unsigned EncLen = encodeULEB128(UncompressedNameStrings.length(), P); + P += EncLen; + if (!doCompression) { + EncLen = encodeULEB128(0, P); + P += EncLen; + Result.append(reinterpret_cast(&Header[0]), P - &Header[0]); + Result += UncompressedNameStrings; + return 0; + } + SmallVector CompressedNameStrings; + zlib::Status Success = + zlib::compress(StringRef(UncompressedNameStrings), 
CompressedNameStrings, + zlib::BestSizeCompression); + assert(Success == zlib::StatusOK); + if (Success != zlib::StatusOK) + return 1; + EncLen = encodeULEB128(CompressedNameStrings.size(), P); + P += EncLen; + Result.append(reinterpret_cast(&Header[0]), P - &Header[0]); + Result += + std::string(CompressedNameStrings.data(), CompressedNameStrings.size()); + return 0; +} + +int collectPGOFuncNameStrings(const std::vector &NameVars, + std::string &Result) { + std::vector NameStrs; + for (auto *NameVar : NameVars) { + auto *Arr = cast(NameVar->getInitializer()); + StringRef NameStr = + Arr->isCString() ? Arr->getAsCString() : Arr->getAsString(); + NameStrs.push_back(NameStr.str()); + } + return collectPGOFuncNameStrings(NameStrs, zlib::isAvailable(), Result); +} + +int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) { + const uint8_t *P = reinterpret_cast(NameStrings.data()); + const uint8_t *EndP = reinterpret_cast(NameStrings.data() + + NameStrings.size()); + while (P < EndP) { + uint32_t N; + uint64_t UncompressedSize = decodeULEB128(P, &N); + P += N; + uint64_t CompressedSize = decodeULEB128(P, &N); + P += N; + bool isCompressed = (CompressedSize != 0); + SmallString<128> UncompressedNameStrings; + StringRef NameStrings; + if (isCompressed) { + StringRef CompressedNameStrings(reinterpret_cast(P), + CompressedSize); + if (zlib::uncompress(CompressedNameStrings, UncompressedNameStrings, + UncompressedSize) != zlib::StatusOK) + return 1; + P += CompressedSize; + NameStrings = StringRef(UncompressedNameStrings.data(), + UncompressedNameStrings.size()); + } else { + NameStrings = + StringRef(reinterpret_cast(P), UncompressedSize); + P += UncompressedSize; + } + // Now parse the name strings. 
+ size_t NameStart = 0; + bool isLast = false; + do { + size_t NameStop = NameStrings.find(' ', NameStart); + if (NameStop == StringRef::npos) + return 1; + if (NameStop == NameStrings.size() - 1) + isLast = true; + StringRef Name = NameStrings.substr(NameStart, NameStop - NameStart); + Symtab.addFuncName(Name); + if (isLast) + break; + NameStart = NameStop + 1; + } while (true); + + while (P < EndP && *P == 0) + P++; + } + Symtab.finalizeSymtab(); + return 0; +} + instrprof_error InstrProfValueSiteRecord::mergeValueData(InstrProfValueSiteRecord &Input, uint64_t Weight) { diff --git a/unittests/ProfileData/InstrProfTest.cpp b/unittests/ProfileData/InstrProfTest.cpp index 8f4db871a92..453ad0f22b3 100644 --- a/unittests/ProfileData/InstrProfTest.cpp +++ b/unittests/ProfileData/InstrProfTest.cpp @@ -9,6 +9,7 @@ #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/InstrProfWriter.h" +#include "llvm/Support/Compression.h" #include "gtest/gtest.h" #include @@ -583,4 +584,64 @@ TEST_F(InstrProfTest, instr_prof_symtab_test) { ASSERT_EQ(StringRef("bar3"), R); } +TEST_F(InstrProfTest, instr_prof_symtab_compression_test) { + std::vector FuncNames1; + std::vector FuncNames2; + for (int I = 0; I < 10 * 1024; I++) { + std::string str; + raw_string_ostream OS(str); + OS << "func_" << I; + FuncNames1.push_back(OS.str()); + str.clear(); + OS << "fooooooooooooooo_" << I; + FuncNames1.push_back(OS.str()); + str.clear(); + OS << "BAR_" << I; + FuncNames2.push_back(OS.str()); + str.clear(); + OS << "BlahblahBlahblahBar_" << I; + FuncNames2.push_back(OS.str()); + } + + for (int Padding = 0; Padding < 10; Padding++) { + for (int DoCompression = 0; DoCompression < 2; DoCompression++) { + // Compressing: + std::string FuncNameStrings1; + collectPGOFuncNameStrings(FuncNames1, + (DoCompression != 0 && zlib::isAvailable()), + FuncNameStrings1); + + // Compressing: + std::string FuncNameStrings2; + collectPGOFuncNameStrings(FuncNames2, + (DoCompression != 0 && 
zlib::isAvailable()), + FuncNameStrings2); + + // Join with paddings: + std::string FuncNameStrings = FuncNameStrings1; + for (int P = 0; P < Padding; P++) { + FuncNameStrings.push_back('\0'); + } + FuncNameStrings += FuncNameStrings2; + + // Now decompress + InstrProfSymtab Symtab; + Symtab.create(StringRef(FuncNameStrings)); + + // Now check + for (int I = 0; I < 10 * 1024; I++) { + std::string N[4]; + N[0] = FuncNames1[2 * I]; + N[1] = FuncNames1[2 * I + 1]; + N[2] = FuncNames2[2 * I]; + N[3] = FuncNames2[2 * I + 1]; + for (int J = 0; J < 4; J++) { + StringRef R = Symtab.getFuncName(IndexedInstrProf::ComputeHash(N[J])); + ASSERT_EQ(StringRef(N[J]), R); + } + } + } + } +} + } // end anonymous namespace