Bug 1656155 - pt 6. Add slop information to Replay.cpp r=glandium

Differential Revision: https://phabricator.services.mozilla.com/D88403
This commit is contained in:
Paul Bone 2020-10-08 22:44:38 +00:00
parent 6655819c99
commit de7f32735e

View File

@ -76,13 +76,17 @@ class MappedArray {
/* Type for records of allocations. */
struct MemSlot {
void* mPtr;
// mRequest is only valid if mPtr is non-null. It doesn't need to be cleared
// when memory is freed or realloc()ed.
size_t mRequest;
};
/* An almost infinite list of slots.
* In essence, this is a linked list of arrays of groups of slots.
* Each group is 1MB. On 64-bits, one group allows to store 128k allocations.
* Each group is 1MB. On 64-bits, one group allows to store 64k allocations.
* Each MemSlotList instance can store 1023 such groups, which means more
* than 130M allocations. In case more would be needed, we chain to another
* than 67M allocations. In case more would be needed, we chain to another
* MemSlotList, and so on.
* Using 1023 groups makes the MemSlotList itself page sized on 32-bits
* and 2 pages-sized on 64-bits.
@ -294,7 +298,12 @@ size_t parseNumber(Buffer aBuf) {
/* Class to handle dispatching the replay function calls to replace-malloc. */
class Replay {
public:
Replay() : mOps(0) {
Replay()
: mOps(0),
mNumUsedSlots(0),
mTotalRequestedSize(0),
mTotalAllocatedSize(0),
mCalculateSlop(false) {
#ifdef _WIN32
// See comment in FdPrintf.h as to why native win32 handles are used.
mStdErr = reinterpret_cast<intptr_t>(GetStdHandle(STD_ERROR_HANDLE));
@ -303,6 +312,8 @@ class Replay {
#endif
}
void enableSlopCalculation() { mCalculateSlop = true; }
MemSlot& operator[](size_t index) const { return mSlots[index]; }
void malloc(Buffer& aArgs, Buffer& aResult) {
@ -310,6 +321,13 @@ class Replay {
mOps++;
size_t size = parseNumber(aArgs);
aSlot.mPtr = ::malloc_impl(size);
if (aSlot.mPtr) {
aSlot.mRequest = size;
if (mCalculateSlop) {
mTotalRequestedSize += size;
mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr);
}
}
}
void posix_memalign(Buffer& aArgs, Buffer& aResult) {
@ -320,6 +338,11 @@ class Replay {
void* ptr;
if (::posix_memalign_impl(&ptr, alignment, size) == 0) {
aSlot.mPtr = ptr;
aSlot.mRequest = size;
if (mCalculateSlop) {
mTotalRequestedSize += size;
mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr);
}
} else {
aSlot.mPtr = nullptr;
}
@ -331,6 +354,13 @@ class Replay {
size_t alignment = parseNumber(aArgs.SplitChar(','));
size_t size = parseNumber(aArgs);
aSlot.mPtr = ::aligned_alloc_impl(alignment, size);
if (aSlot.mPtr) {
aSlot.mRequest = size;
if (mCalculateSlop) {
mTotalRequestedSize += size;
mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr);
}
}
}
void calloc(Buffer& aArgs, Buffer& aResult) {
@ -339,6 +369,13 @@ class Replay {
size_t num = parseNumber(aArgs.SplitChar(','));
size_t size = parseNumber(aArgs);
aSlot.mPtr = ::calloc_impl(num, size);
if (aSlot.mPtr) {
aSlot.mRequest = num * size;
if (mCalculateSlop) {
mTotalRequestedSize += num * size;
mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr);
}
}
}
void realloc(Buffer& aArgs, Buffer& aResult) {
@ -354,6 +391,13 @@ class Replay {
void* old_ptr = old_slot.mPtr;
old_slot.mPtr = nullptr;
aSlot.mPtr = ::realloc_impl(old_ptr, size);
if (aSlot.mPtr) {
aSlot.mRequest = size;
if (mCalculateSlop) {
mTotalRequestedSize += size;
mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr);
}
}
}
void free(Buffer& aArgs, Buffer& aResult) {
@ -377,6 +421,13 @@ class Replay {
size_t alignment = parseNumber(aArgs.SplitChar(','));
size_t size = parseNumber(aArgs);
aSlot.mPtr = ::memalign_impl(alignment, size);
if (aSlot.mPtr) {
aSlot.mRequest = size;
if (mCalculateSlop) {
mTotalRequestedSize += size;
mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr);
}
}
}
void valloc(Buffer& aArgs, Buffer& aResult) {
@ -384,6 +435,13 @@ class Replay {
mOps++;
size_t size = parseNumber(aArgs);
aSlot.mPtr = ::valloc_impl(size);
if (aSlot.mPtr) {
aSlot.mRequest = size;
if (mCalculateSlop) {
mTotalRequestedSize += size;
mTotalAllocatedSize += ::malloc_usable_size_impl(aSlot.mPtr);
}
}
}
void jemalloc_stats(Buffer& aArgs, Buffer& aResult) {
@ -394,7 +452,52 @@ class Replay {
jemalloc_stats_t stats;
jemalloc_bin_stats_t bin_stats[JEMALLOC_MAX_STATS_BINS];
::jemalloc_stats_internal(&stats, bin_stats);
size_t num_objects = 0;
size_t num_sloppy_objects = 0;
size_t total_allocated = 0;
size_t total_slop = 0;
size_t large_slop = 0;
size_t large_used = 0;
size_t huge_slop = 0;
size_t huge_used = 0;
size_t bin_slop[JEMALLOC_MAX_STATS_BINS] = {0};
for (size_t slot_id = 0; slot_id < mNumUsedSlots; slot_id++) {
MemSlot& slot = mSlots[slot_id];
if (slot.mPtr) {
size_t used = ::malloc_usable_size_impl(slot.mPtr);
size_t slop = used - slot.mRequest;
total_allocated += used;
total_slop += slop;
num_objects++;
if (slop) {
num_sloppy_objects++;
}
if (used <= stats.page_size / 2) {
// We know that this is an inefficient linear search, but there's a
// small number of bins and this is simple.
for (unsigned i = 0; i < JEMALLOC_MAX_STATS_BINS; i++) {
auto& bin = bin_stats[i];
if (used == bin.size) {
bin_slop[i] += slop;
break;
}
}
} else if (used <= stats.large_max) {
large_slop += slop;
large_used += used;
} else {
huge_slop += slop;
huge_used += used;
}
}
}
FdPrintf(mStdErr, "\n");
FdPrintf(mStdErr, "Objects: %9zu\n", num_objects);
FdPrintf(mStdErr, "Slots: %9zu\n", mNumUsedSlots);
FdPrintf(mStdErr, "Ops: %9zu\n", mOps);
FdPrintf(mStdErr, "mapped: %9zu\n", stats.mapped);
FdPrintf(mStdErr, "allocated: %9zu\n", stats.allocated);
@ -405,6 +508,19 @@ class Replay {
FdPrintf(mStdErr, "quantum-max: %9zu\n", stats.quantum_max);
FdPrintf(mStdErr, "subpage-max: %9zu\n", stats.page_size / 2);
FdPrintf(mStdErr, "large-max: %9zu\n", stats.large_max);
if (mCalculateSlop) {
size_t slop = mTotalAllocatedSize - mTotalRequestedSize;
FdPrintf(mStdErr,
"Total slop for all allocations: %zuKiB/%zuKiB (%zu%%)\n",
slop / 1024, mTotalAllocatedSize / 1024,
percent(slop, mTotalAllocatedSize));
}
FdPrintf(mStdErr, "Live sloppy objects: %zu/%zu (%zu%%)\n",
num_sloppy_objects, num_objects,
percent(num_sloppy_objects, num_objects));
FdPrintf(mStdErr, "Live sloppy bytes: %zuKiB/%zuKiB (%zu%%)\n",
total_slop / 1024, total_allocated / 1024,
percent(total_slop, total_allocated));
FdPrintf(mStdErr, "\n%8s %11s %10s %8s %9s %9s %8s\n", "bin-size",
"unused (c)", "total (c)", "used (c)", "non-full (r)", "total (r)",
@ -419,6 +535,20 @@ class Replay {
}
}
FdPrintf(mStdErr, "\n%5s %8s %9s %7s\n", "bin", "slop", "used", "percent");
for (unsigned i = 0; i < JEMALLOC_MAX_STATS_BINS; i++) {
auto& bin = bin_stats[i];
if (bin.size) {
size_t used = bin.bytes_total - bin.bytes_unused;
FdPrintf(mStdErr, "%5zu %8zu %9zu %6zu%%\n", bin.size, bin_slop[i],
used, percent(bin_slop[i], used));
}
}
FdPrintf(mStdErr, "%5s %8zu %9zu %6zu%%\n", "large", large_slop, large_used,
percent(large_slop, large_used));
FdPrintf(mStdErr, "%5s %8zu %9zu %6zu%%\n", "huge", huge_slop, huge_used,
percent(huge_slop, huge_used));
/* TODO: Add more data, like actual RSS as measured by OS, but compensated
* for the replay internal data. */
}
@ -440,19 +570,41 @@ class Replay {
}
size_t slot_id = parseNumber(aResult);
mNumUsedSlots = std::max(mNumUsedSlots, slot_id + 1);
return mSlots[slot_id];
}
intptr_t mStdErr;
size_t mOps;
// The number of slots that have been used. It is used to iterate over slots
// without accessing those we haven't initialised.
size_t mNumUsedSlots;
MemSlotList mSlots;
size_t mTotalRequestedSize;
size_t mTotalAllocatedSize;
// Whether to calculate slop for all allocations over the runtime of a
// process.
bool mCalculateSlop;
};
int main() {
int main(int argc, const char* argv[]) {
size_t first_pid = 0;
FdReader reader(0);
Replay replay;
for (int i = 1; i < argc; i++) {
const char *option = argv[i];
if (strcmp(option, "-s") == 0) {
replay.enableSlopCalculation();
} else {
fprintf(stderr, "Unknown command line option: %s\n", option);
return EXIT_FAILURE;
}
}
/* Read log from stdin and dispatch function calls to the Replay instance.
* The log format is essentially:
* <pid> <tid> <function>([<args>])[=<result>]