Bug 1382440 - Watch CPU usage in BHR r=froydnj

We would like to be able to see if a given hang in BHR occurred
under high CPU load, as this is an indication that the hang is
of less use to us, since it's likely that the external CPU use
is more responsible for it.

The way this works is fairly simple. We get the system CPU usage
on a scale from 0 to 1, and we get the current process's CPU
usage, also on a scale from 0 to 1, and we subtract the latter
from the former. We then compare this value to a threshold, which
is 1 - (1 / p), where p is the number of (virtual) cores on the
machine. This threshold might need to be tuned, so that we
require an entire physical core in order to not annotate the hang,
but for now it seemed the most reasonable line in the sand.

I should note that this considers CPU usage in child or parent
processes as external. While we are responsible for that CPU usage,
it still indicates that the stack we receive from BHR is of little
value to us, since the source of the actual hang is external to
that stack.

MozReview-Commit-ID: JkG53zq1MdY

--HG--
extra : rebase_source : 16553a9b5eac0a73cd1619c6ee01fa177ca60e58
This commit is contained in:
Doug Thayer 2017-07-24 13:46:09 -07:00
parent f66c96f2c8
commit 8aae071723
6 changed files with 359 additions and 2 deletions

View File

@ -6,6 +6,7 @@
#include "mozilla/ArrayUtils.h"
#include "mozilla/BackgroundHangMonitor.h"
#include "mozilla/CPUUsageWatcher.h"
#include "mozilla/LinkedList.h"
#include "mozilla/Monitor.h"
#include "mozilla/Move.h"
@ -14,6 +15,7 @@
#include "mozilla/Telemetry.h"
#include "mozilla/ThreadLocal.h"
#include "mozilla/SystemGroup.h"
#include "mozilla/Unused.h"
#include "prinrval.h"
#include "prthread.h"
@ -39,6 +41,10 @@
// the 99.9th percentile of the thread hangs stack depths reported by Telemetry.
static const size_t kMaxThreadHangStackDepth = 30;
// Interval at which we check the global and per-process CPU usage in order to determine
// if there is high external CPU usage.
static const int32_t kCheckCPUIntervalMilliseconds = 2000;
// An utility comparator function used by std::unique to collapse "(* script)" entries in
// a vector representing a call stack.
bool StackScriptEntriesCollapser(const char* aStackEntry, const char *aAnotherStackEntry)
@ -95,6 +101,9 @@ public:
// thread, and carried around, as nsStreamTransportService::Init is
// non-threadsafe.
nsCOMPtr<nsIEventTarget> mSTS;
// Allows us to watch CPU usage and annotate hangs when the system is
// under high external load.
CPUUsageWatcher mCPUUsageWatcher;
void Shutdown()
{
@ -283,6 +292,9 @@ BackgroundHangManager::RunMonitorThread()
// Default values for the first iteration of thread loop
PRIntervalTime waitTime = PR_INTERVAL_NO_WAIT;
PRIntervalTime recheckTimeout = PR_INTERVAL_NO_WAIT;
PRIntervalTime lastCheckedCPUUsage = systemTime;
PRIntervalTime checkCPUUsageInterval =
PR_MillisecondsToInterval(kCheckCPUIntervalMilliseconds);
while (!mShutdown) {
nsresult rv = autoLock.Wait(waitTime);
@ -291,6 +303,11 @@ BackgroundHangManager::RunMonitorThread()
PRIntervalTime systemInterval = newTime - systemTime;
systemTime = newTime;
if (systemTime - lastCheckedCPUUsage > checkCPUUsageInterval) {
Unused << NS_WARN_IF(mCPUUsageWatcher.CollectCPUUsage().isErr());
lastCheckedCPUUsage = systemTime;
}
/* waitTime is a quarter of the shortest timeout value; If our timing
latency is low enough (less than half the shortest timeout value),
we can update mIntervalNow. */
@ -347,6 +364,16 @@ BackgroundHangManager::RunMonitorThread()
currentThread->mHangStack,
currentThread->mRunnableName,
true);
// If we hang immediately on waking, then the most recently collected
// CPU usage is going to be an average across the whole time we were
// sleeping. Accordingly, we want to make sure that when we hang, we
// collect a fresh value.
if (systemTime != lastCheckedCPUUsage) {
Unused << NS_WARN_IF(mCPUUsageWatcher.CollectCPUUsage().isErr());
lastCheckedCPUUsage = systemTime;
}
currentThread->mHangStart = interval;
currentThread->mHanging = true;
currentThread->mAnnotations =
@ -572,6 +599,7 @@ BackgroundHangMonitor::Startup()
if (XRE_IsParentProcess()) { // cached ClientID hasn't been read yet
BackgroundHangThread::Startup();
BackgroundHangManager::sInstance = new BackgroundHangManager();
Unused << NS_WARN_IF(BackgroundHangManager::sInstance->mCPUUsageWatcher.Init().isErr());
nsCOMPtr<nsIObserverService> observerService = mozilla::services::GetObserverService();
MOZ_ASSERT(observerService);
@ -585,6 +613,7 @@ BackgroundHangMonitor::Startup()
BackgroundHangThread::Startup();
BackgroundHangManager::sInstance = new BackgroundHangManager();
Unused << NS_WARN_IF(BackgroundHangManager::sInstance->mCPUUsageWatcher.Init().isErr());
#endif
}
@ -598,6 +627,7 @@ BackgroundHangMonitor::Shutdown()
}
MOZ_ASSERT(BackgroundHangManager::sInstance, "Not initialized");
BackgroundHangManager::sInstance->mCPUUsageWatcher.Uninit();
/* Scope our lock inside Shutdown() because the sInstance object can
be destroyed as soon as we set sInstance to nullptr below, and
we don't want to hold the lock when it's being destroyed. */

View File

@ -7,6 +7,7 @@
#ifndef mozilla_BackgroundHangMonitor_h
#define mozilla_BackgroundHangMonitor_h
#include "mozilla/CPUUsageWatcher.h"
#include "mozilla/HangAnnotations.h"
#include "mozilla/Monitor.h"
#include "mozilla/RefPtr.h"

View File

@ -1064,12 +1064,11 @@ ShutdownXPCOM(nsIServiceManager* aServMgr)
Omnijar::CleanUp();
HangMonitor::Shutdown();
BackgroundHangMonitor::Shutdown();
delete sMainHangMonitor;
sMainHangMonitor = nullptr;
BackgroundHangMonitor::Shutdown();
NS_LogTerm();
return NS_OK;

View File

@ -0,0 +1,243 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "mozilla/CPUUsageWatcher.h"
#include "prsystem.h"
#ifdef XP_MACOSX
#include <mach/mach_host.h>
#endif
// We only support OSX and Windows, because on Linux we're forced to read
// from /proc/stat in order to get global CPU values. We would prefer to not
// eat that cost for this.
#if defined(NIGHTLY_BUILD) && (defined(XP_WIN) || defined(XP_MACOSX))
#define CPU_USAGE_WATCHER_ACTIVE
#endif
namespace mozilla {
// Even if the machine only has one processor, tolerate up to 50%
// external CPU usage.
static const float kTolerableExternalCPUUsageFloor = 0.5f;
struct CPUStats {
// The average CPU usage time, which can be summed across all cores in the
// system, or averaged between them. Whichever it is, it needs to be in the
// same units as updateTime.
uint64_t usageTime;
// A monotonically increasing value in the same units as usageTime, which can
// be used to determine the percentage of active vs idle time
uint64_t updateTime;
};
#ifdef XP_MACOSX
static const uint64_t kNanosecondsPerSecond = 1000000000LL;
static const uint64_t kCPUCheckInterval = kNanosecondsPerSecond / 2LL;
Result<uint64_t, CPUUsageWatcherError>
GetClockTime(clockid_t clockId) {
timespec clockResult;
bool success = !clock_gettime(clockId, &clockResult);
if (!success) {
return Err(ClockGetTimeError);
}
return ((uint64_t)clockResult.tv_sec) * kNanosecondsPerSecond +
(uint64_t)clockResult.tv_nsec;
}
Result<CPUStats, CPUUsageWatcherError>
GetProcessCPUStats(int32_t numCPUs) {
CPUStats result = {};
MOZ_TRY_VAR(result.usageTime, GetClockTime(CLOCK_PROCESS_CPUTIME_ID));
MOZ_TRY_VAR(result.updateTime, GetClockTime(CLOCK_MONOTONIC));
// CLOCK_PROCESS_CPUTIME_ID will give us the sum of the values across all
// of our cores. Divide by the number of CPUs to get an average.
result.usageTime /= numCPUs;
return result;
}
Result<CPUStats, CPUUsageWatcherError>
GetGlobalCPUStats() {
CPUStats result = {};
host_cpu_load_info_data_t loadInfo;
mach_msg_type_number_t loadInfoCount = HOST_CPU_LOAD_INFO_COUNT;
kern_return_t statsResult = host_statistics(mach_host_self(),
HOST_CPU_LOAD_INFO,
(host_info_t)&loadInfo,
&loadInfoCount);
if (statsResult != KERN_SUCCESS) {
return Err(HostStatisticsError);
}
result.usageTime = loadInfo.cpu_ticks[CPU_STATE_USER] +
loadInfo.cpu_ticks[CPU_STATE_NICE] +
loadInfo.cpu_ticks[CPU_STATE_SYSTEM];
result.updateTime = result.usageTime + loadInfo.cpu_ticks[CPU_STATE_IDLE];
return result;
}
#endif // XP_MACOSX
#ifdef XP_WIN
// A FILETIME represents the number of 100-nanosecond ticks since 1/1/1601 UTC
static const uint64_t kFILETIMETicksPerSecond = 10000000;
static const uint64_t kCPUCheckInterval = kFILETIMETicksPerSecond / 2;
uint64_t
FiletimeToInteger(FILETIME filetime) {
return ((uint64_t)filetime.dwLowDateTime) |
(uint64_t)filetime.dwHighDateTime << 32;
}
Result<CPUStats, CPUUsageWatcherError> GetProcessCPUStats(int32_t numCPUs) {
CPUStats result = {};
FILETIME creationFiletime;
FILETIME exitFiletime;
FILETIME kernelFiletime;
FILETIME userFiletime;
bool success = GetProcessTimes(GetCurrentProcess(),
&creationFiletime,
&exitFiletime,
&kernelFiletime,
&userFiletime);
if (!success) {
return Err(GetProcessTimesError);
}
result.usageTime = FiletimeToInteger(kernelFiletime) +
FiletimeToInteger(userFiletime);
FILETIME nowFiletime;
GetSystemTimeAsFileTime(&nowFiletime);
result.updateTime = FiletimeToInteger(nowFiletime);
result.usageTime /= numCPUs;
return result;
}
Result<CPUStats, CPUUsageWatcherError>
GetGlobalCPUStats() {
CPUStats result = {};
FILETIME idleFiletime;
FILETIME kernelFiletime;
FILETIME userFiletime;
bool success = GetSystemTimes(&idleFiletime,
&kernelFiletime,
&userFiletime);
if (!success) {
return Err(GetSystemTimesError);
}
result.usageTime = FiletimeToInteger(kernelFiletime) +
FiletimeToInteger(userFiletime);
result.updateTime = result.usageTime + FiletimeToInteger(idleFiletime);
return result;
}
#endif // XP_WIN
Result<Ok, CPUUsageWatcherError>
CPUUsageWatcher::Init()
{
mNumCPUs = PR_GetNumberOfProcessors();
if (mNumCPUs <= 0) {
mExternalUsageThreshold = 1.0f;
return Err(GetNumberOfProcessorsError);
}
mExternalUsageThreshold = std::max(1.0f - 1.0f / (float)mNumCPUs,
kTolerableExternalCPUUsageFloor);
#ifdef CPU_USAGE_WATCHER_ACTIVE
CPUStats processTimes;
MOZ_TRY_VAR(processTimes, GetProcessCPUStats(mNumCPUs));
mProcessUpdateTime = processTimes.updateTime;
mProcessUsageTime = processTimes.usageTime;
CPUStats globalTimes;
MOZ_TRY_VAR(globalTimes, GetGlobalCPUStats());
mGlobalUpdateTime = globalTimes.updateTime;
mGlobalUsageTime = globalTimes.usageTime;
mInitialized = true;
CPUUsageWatcher* self = this;
NS_DispatchToMainThread(
NS_NewRunnableFunction("CPUUsageWatcher::Init",
[=]() { HangMonitor::RegisterAnnotator(*self); }));
#endif // CPU_USAGE_WATCHER_ACTIVE
return Ok();
}
void
CPUUsageWatcher::Uninit()
{
mInitialized = false;
#ifdef CPU_USAGE_WATCHER_ACTIVE
HangMonitor::UnregisterAnnotator(*this);
#endif // CPU_USAGE_WATCHER_ACTIVE
}
Result<Ok, CPUUsageWatcherError>
CPUUsageWatcher::CollectCPUUsage()
{
if (!mInitialized) {
return Ok();
}
#ifdef CPU_USAGE_WATCHER_ACTIVE
mExternalUsageRatio = 0.0f;
CPUStats processTimes;
MOZ_TRY_VAR(processTimes, GetProcessCPUStats(mNumCPUs));
CPUStats globalTimes;
MOZ_TRY_VAR(globalTimes, GetGlobalCPUStats());
uint64_t processUsageDelta = processTimes.usageTime - mProcessUsageTime;
uint64_t processUpdateDelta = processTimes.updateTime - mProcessUpdateTime;
float processUsageNormalized = processUsageDelta > 0 ?
(float)processUsageDelta / (float)processUpdateDelta :
0.0f;
uint64_t globalUsageDelta = globalTimes.usageTime - mGlobalUsageTime;
uint64_t globalUpdateDelta = globalTimes.updateTime - mGlobalUpdateTime;
float globalUsageNormalized = globalUsageDelta > 0 ?
(float)globalUsageDelta / (float)globalUpdateDelta :
0.0f;
mProcessUsageTime = processTimes.usageTime;
mProcessUpdateTime = processTimes.updateTime;
mGlobalUsageTime = globalTimes.usageTime;
mGlobalUpdateTime = globalTimes.updateTime;
mExternalUsageRatio = std::max(0.0f,
globalUsageNormalized - processUsageNormalized);
#endif // CPU_USAGE_WATCHER_ACTIVE
return Ok();
}
void
CPUUsageWatcher::AnnotateHang(HangMonitor::HangAnnotations& aAnnotations) {
if (!mInitialized) {
return;
}
if (mExternalUsageRatio > mExternalUsageThreshold) {
aAnnotations.AddAnnotation(NS_LITERAL_STRING("ExternalCPUHigh"), true);
}
}
} // namespace mozilla

View File

@ -0,0 +1,82 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef mozilla_CPUUsageWatcher_h
#define mozilla_CPUUsageWatcher_h
#include <stdint.h>
#include "mozilla/HangAnnotations.h"
namespace mozilla {
enum CPUUsageWatcherError : uint8_t
{
ClockGetTimeError,
GetNumberOfProcessorsError,
GetProcessTimesError,
GetSystemTimesError,
HostStatisticsError,
ProcStatError,
};
class CPUUsageHangAnnotator
: public HangMonitor::Annotator
{
public:
};
class CPUUsageWatcher
: public HangMonitor::Annotator
{
public:
CPUUsageWatcher()
: mInitialized(false)
, mExternalUsageThreshold(0)
, mExternalUsageRatio(0)
, mProcessUsageTime(0)
, mProcessUpdateTime(0)
, mGlobalUsageTime(0)
, mGlobalUpdateTime(0)
{}
Result<Ok, CPUUsageWatcherError> Init();
void Uninit();
// Updates necessary values to allow AnnotateHang to function. This must be
// called on some semi-regular basis, as it will calculate the mean CPU
// usage values between now and the last time it was called.
Result<Ok, CPUUsageWatcherError> CollectCPUUsage();
void AnnotateHang(HangMonitor::HangAnnotations& aAnnotations) final;
private:
bool mInitialized;
// The threshold above which we will mark a hang as occurring under high
// external CPU usage conditions
float mExternalUsageThreshold;
// The CPU usage (0-1) external to our process, averaged between the two
// most recent monitor thread runs
float mExternalUsageRatio;
// The total cumulative CPU usage time by our process as of the last
// CollectCPUUsage or Startup
uint64_t mProcessUsageTime;
// A time value in the same units as mProcessUsageTime used to
// determine the ratio of CPU usage time to idle time
uint64_t mProcessUpdateTime;
// The total cumulative CPU usage time by all processes as of the last
// CollectCPUUsage or Startup
uint64_t mGlobalUsageTime;
// A time value in the same units as mGlobalUsageTime used to
// determine the ratio of CPU usage time to idle time
uint64_t mGlobalUpdateTime;
// The number of virtual cores on our machine
uint64_t mNumCPUs;
};
} // namespace mozilla
#endif // mozilla_CPUUsageWatcher_h

View File

@ -40,6 +40,7 @@ EXPORTS.mozilla += [
'BlockingResourceBase.h',
'CondVar.h',
'CooperativeThreadPool.h',
'CPUUsageWatcher.h',
'DeadlockDetector.h',
'EventQueue.h',
'HangAnnotations.h',
@ -77,6 +78,7 @@ UNIFIED_SOURCES += [
'AbstractThread.cpp',
'BlockingResourceBase.cpp',
'CooperativeThreadPool.cpp',
'CPUUsageWatcher.cpp',
'EventQueue.cpp',
'HangAnnotations.cpp',
'HangMonitor.cpp',