mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-28 23:31:56 +00:00
Bug 1382440 - Watch CPU usage in BHR r=froydnj
We would like to be able to see if a given hang in BHR occurred under high CPU load, as this is an indication that the hang is of less use to us, since it's likely that the external CPU use is more responsible for it. The way this works is fairly simple. We get the system CPU usage on a scale from 0 to 1, and we get the current process's CPU usage, also on a scale from 0 to 1, and we subtract the latter from the former. We then compare this value to a threshold, which is 1 - (1 / p), where p is the number of (virtual) cores on the machine. This threshold might need to be tuned, so that we require an entire physical core in order to not annotate the hang, but for now it seemed the most reasonable line in the sand. I should note that this considers CPU usage in child or parent processes as external. While we are responsible for that CPU usage, it still indicates that the stack we receive from BHR is of little value to us, since the source of the actual hang is external to that stack. MozReview-Commit-ID: JkG53zq1MdY --HG-- extra : rebase_source : 16553a9b5eac0a73cd1619c6ee01fa177ca60e58
This commit is contained in:
parent
f66c96f2c8
commit
8aae071723
@ -6,6 +6,7 @@
|
||||
|
||||
#include "mozilla/ArrayUtils.h"
|
||||
#include "mozilla/BackgroundHangMonitor.h"
|
||||
#include "mozilla/CPUUsageWatcher.h"
|
||||
#include "mozilla/LinkedList.h"
|
||||
#include "mozilla/Monitor.h"
|
||||
#include "mozilla/Move.h"
|
||||
@ -14,6 +15,7 @@
|
||||
#include "mozilla/Telemetry.h"
|
||||
#include "mozilla/ThreadLocal.h"
|
||||
#include "mozilla/SystemGroup.h"
|
||||
#include "mozilla/Unused.h"
|
||||
|
||||
#include "prinrval.h"
|
||||
#include "prthread.h"
|
||||
@ -39,6 +41,10 @@
|
||||
// the 99.9th percentile of the thread hangs stack depths reported by Telemetry.
|
||||
static const size_t kMaxThreadHangStackDepth = 30;
|
||||
|
||||
// Interval at which we check the global and per-process CPU usage in order to determine
|
||||
// if there is high external CPU usage.
|
||||
static const int32_t kCheckCPUIntervalMilliseconds = 2000;
|
||||
|
||||
// An utility comparator function used by std::unique to collapse "(* script)" entries in
|
||||
// a vector representing a call stack.
|
||||
bool StackScriptEntriesCollapser(const char* aStackEntry, const char *aAnotherStackEntry)
|
||||
@ -95,6 +101,9 @@ public:
|
||||
// thread, and carried around, as nsStreamTransportService::Init is
|
||||
// non-threadsafe.
|
||||
nsCOMPtr<nsIEventTarget> mSTS;
|
||||
// Allows us to watch CPU usage and annotate hangs when the system is
|
||||
// under high external load.
|
||||
CPUUsageWatcher mCPUUsageWatcher;
|
||||
|
||||
void Shutdown()
|
||||
{
|
||||
@ -283,6 +292,9 @@ BackgroundHangManager::RunMonitorThread()
|
||||
// Default values for the first iteration of thread loop
|
||||
PRIntervalTime waitTime = PR_INTERVAL_NO_WAIT;
|
||||
PRIntervalTime recheckTimeout = PR_INTERVAL_NO_WAIT;
|
||||
PRIntervalTime lastCheckedCPUUsage = systemTime;
|
||||
PRIntervalTime checkCPUUsageInterval =
|
||||
PR_MillisecondsToInterval(kCheckCPUIntervalMilliseconds);
|
||||
|
||||
while (!mShutdown) {
|
||||
nsresult rv = autoLock.Wait(waitTime);
|
||||
@ -291,6 +303,11 @@ BackgroundHangManager::RunMonitorThread()
|
||||
PRIntervalTime systemInterval = newTime - systemTime;
|
||||
systemTime = newTime;
|
||||
|
||||
if (systemTime - lastCheckedCPUUsage > checkCPUUsageInterval) {
|
||||
Unused << NS_WARN_IF(mCPUUsageWatcher.CollectCPUUsage().isErr());
|
||||
lastCheckedCPUUsage = systemTime;
|
||||
}
|
||||
|
||||
/* waitTime is a quarter of the shortest timeout value; If our timing
|
||||
latency is low enough (less than half the shortest timeout value),
|
||||
we can update mIntervalNow. */
|
||||
@ -347,6 +364,16 @@ BackgroundHangManager::RunMonitorThread()
|
||||
currentThread->mHangStack,
|
||||
currentThread->mRunnableName,
|
||||
true);
|
||||
|
||||
// If we hang immediately on waking, then the most recently collected
|
||||
// CPU usage is going to be an average across the whole time we were
|
||||
// sleeping. Accordingly, we want to make sure that when we hang, we
|
||||
// collect a fresh value.
|
||||
if (systemTime != lastCheckedCPUUsage) {
|
||||
Unused << NS_WARN_IF(mCPUUsageWatcher.CollectCPUUsage().isErr());
|
||||
lastCheckedCPUUsage = systemTime;
|
||||
}
|
||||
|
||||
currentThread->mHangStart = interval;
|
||||
currentThread->mHanging = true;
|
||||
currentThread->mAnnotations =
|
||||
@ -572,6 +599,7 @@ BackgroundHangMonitor::Startup()
|
||||
if (XRE_IsParentProcess()) { // cached ClientID hasn't been read yet
|
||||
BackgroundHangThread::Startup();
|
||||
BackgroundHangManager::sInstance = new BackgroundHangManager();
|
||||
Unused << NS_WARN_IF(BackgroundHangManager::sInstance->mCPUUsageWatcher.Init().isErr());
|
||||
|
||||
nsCOMPtr<nsIObserverService> observerService = mozilla::services::GetObserverService();
|
||||
MOZ_ASSERT(observerService);
|
||||
@ -585,6 +613,7 @@ BackgroundHangMonitor::Startup()
|
||||
|
||||
BackgroundHangThread::Startup();
|
||||
BackgroundHangManager::sInstance = new BackgroundHangManager();
|
||||
Unused << NS_WARN_IF(BackgroundHangManager::sInstance->mCPUUsageWatcher.Init().isErr());
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -598,6 +627,7 @@ BackgroundHangMonitor::Shutdown()
|
||||
}
|
||||
|
||||
MOZ_ASSERT(BackgroundHangManager::sInstance, "Not initialized");
|
||||
BackgroundHangManager::sInstance->mCPUUsageWatcher.Uninit();
|
||||
/* Scope our lock inside Shutdown() because the sInstance object can
|
||||
be destroyed as soon as we set sInstance to nullptr below, and
|
||||
we don't want to hold the lock when it's being destroyed. */
|
||||
|
@ -7,6 +7,7 @@
|
||||
#ifndef mozilla_BackgroundHangMonitor_h
|
||||
#define mozilla_BackgroundHangMonitor_h
|
||||
|
||||
#include "mozilla/CPUUsageWatcher.h"
|
||||
#include "mozilla/HangAnnotations.h"
|
||||
#include "mozilla/Monitor.h"
|
||||
#include "mozilla/RefPtr.h"
|
||||
|
@ -1064,12 +1064,11 @@ ShutdownXPCOM(nsIServiceManager* aServMgr)
|
||||
Omnijar::CleanUp();
|
||||
|
||||
HangMonitor::Shutdown();
|
||||
BackgroundHangMonitor::Shutdown();
|
||||
|
||||
delete sMainHangMonitor;
|
||||
sMainHangMonitor = nullptr;
|
||||
|
||||
BackgroundHangMonitor::Shutdown();
|
||||
|
||||
NS_LogTerm();
|
||||
|
||||
return NS_OK;
|
||||
|
243
xpcom/threads/CPUUsageWatcher.cpp
Normal file
243
xpcom/threads/CPUUsageWatcher.cpp
Normal file
@ -0,0 +1,243 @@
|
||||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "mozilla/CPUUsageWatcher.h"
|
||||
|
||||
#include "prsystem.h"
|
||||
|
||||
#ifdef XP_MACOSX
|
||||
#include <mach/mach_host.h>
|
||||
#endif
|
||||
|
||||
// We only support OSX and Windows, because on Linux we're forced to read
|
||||
// from /proc/stat in order to get global CPU values. We would prefer to not
|
||||
// eat that cost for this.
|
||||
#if defined(NIGHTLY_BUILD) && (defined(XP_WIN) || defined(XP_MACOSX))
|
||||
#define CPU_USAGE_WATCHER_ACTIVE
|
||||
#endif
|
||||
|
||||
namespace mozilla {
|
||||
|
||||
// Even if the machine only has one processor, tolerate up to 50%
|
||||
// external CPU usage.
|
||||
static const float kTolerableExternalCPUUsageFloor = 0.5f;
|
||||
|
||||
struct CPUStats {
|
||||
// The average CPU usage time, which can be summed across all cores in the
|
||||
// system, or averaged between them. Whichever it is, it needs to be in the
|
||||
// same units as updateTime.
|
||||
uint64_t usageTime;
|
||||
// A monotonically increasing value in the same units as usageTime, which can
|
||||
// be used to determine the percentage of active vs idle time
|
||||
uint64_t updateTime;
|
||||
};
|
||||
|
||||
#ifdef XP_MACOSX
|
||||
|
||||
static const uint64_t kNanosecondsPerSecond = 1000000000LL;
|
||||
static const uint64_t kCPUCheckInterval = kNanosecondsPerSecond / 2LL;
|
||||
|
||||
Result<uint64_t, CPUUsageWatcherError>
|
||||
GetClockTime(clockid_t clockId) {
|
||||
timespec clockResult;
|
||||
bool success = !clock_gettime(clockId, &clockResult);
|
||||
if (!success) {
|
||||
return Err(ClockGetTimeError);
|
||||
}
|
||||
return ((uint64_t)clockResult.tv_sec) * kNanosecondsPerSecond +
|
||||
(uint64_t)clockResult.tv_nsec;
|
||||
}
|
||||
|
||||
Result<CPUStats, CPUUsageWatcherError>
|
||||
GetProcessCPUStats(int32_t numCPUs) {
|
||||
CPUStats result = {};
|
||||
MOZ_TRY_VAR(result.usageTime, GetClockTime(CLOCK_PROCESS_CPUTIME_ID));
|
||||
MOZ_TRY_VAR(result.updateTime, GetClockTime(CLOCK_MONOTONIC));
|
||||
// CLOCK_PROCESS_CPUTIME_ID will give us the sum of the values across all
|
||||
// of our cores. Divide by the number of CPUs to get an average.
|
||||
result.usageTime /= numCPUs;
|
||||
return result;
|
||||
}
|
||||
|
||||
Result<CPUStats, CPUUsageWatcherError>
|
||||
GetGlobalCPUStats() {
|
||||
CPUStats result = {};
|
||||
host_cpu_load_info_data_t loadInfo;
|
||||
mach_msg_type_number_t loadInfoCount = HOST_CPU_LOAD_INFO_COUNT;
|
||||
kern_return_t statsResult = host_statistics(mach_host_self(),
|
||||
HOST_CPU_LOAD_INFO,
|
||||
(host_info_t)&loadInfo,
|
||||
&loadInfoCount);
|
||||
if (statsResult != KERN_SUCCESS) {
|
||||
return Err(HostStatisticsError);
|
||||
}
|
||||
|
||||
result.usageTime = loadInfo.cpu_ticks[CPU_STATE_USER] +
|
||||
loadInfo.cpu_ticks[CPU_STATE_NICE] +
|
||||
loadInfo.cpu_ticks[CPU_STATE_SYSTEM];
|
||||
result.updateTime = result.usageTime + loadInfo.cpu_ticks[CPU_STATE_IDLE];
|
||||
return result;
|
||||
}
|
||||
|
||||
#endif // XP_MACOSX
|
||||
|
||||
#ifdef XP_WIN
|
||||
|
||||
// A FILETIME represents the number of 100-nanosecond ticks since 1/1/1601 UTC
|
||||
static const uint64_t kFILETIMETicksPerSecond = 10000000;
|
||||
static const uint64_t kCPUCheckInterval = kFILETIMETicksPerSecond / 2;
|
||||
|
||||
uint64_t
|
||||
FiletimeToInteger(FILETIME filetime) {
|
||||
return ((uint64_t)filetime.dwLowDateTime) |
|
||||
(uint64_t)filetime.dwHighDateTime << 32;
|
||||
}
|
||||
|
||||
Result<CPUStats, CPUUsageWatcherError> GetProcessCPUStats(int32_t numCPUs) {
|
||||
CPUStats result = {};
|
||||
FILETIME creationFiletime;
|
||||
FILETIME exitFiletime;
|
||||
FILETIME kernelFiletime;
|
||||
FILETIME userFiletime;
|
||||
bool success = GetProcessTimes(GetCurrentProcess(),
|
||||
&creationFiletime,
|
||||
&exitFiletime,
|
||||
&kernelFiletime,
|
||||
&userFiletime);
|
||||
if (!success) {
|
||||
return Err(GetProcessTimesError);
|
||||
}
|
||||
|
||||
result.usageTime = FiletimeToInteger(kernelFiletime) +
|
||||
FiletimeToInteger(userFiletime);
|
||||
|
||||
FILETIME nowFiletime;
|
||||
GetSystemTimeAsFileTime(&nowFiletime);
|
||||
result.updateTime = FiletimeToInteger(nowFiletime);
|
||||
|
||||
result.usageTime /= numCPUs;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
Result<CPUStats, CPUUsageWatcherError>
|
||||
GetGlobalCPUStats() {
|
||||
CPUStats result = {};
|
||||
FILETIME idleFiletime;
|
||||
FILETIME kernelFiletime;
|
||||
FILETIME userFiletime;
|
||||
bool success = GetSystemTimes(&idleFiletime,
|
||||
&kernelFiletime,
|
||||
&userFiletime);
|
||||
|
||||
if (!success) {
|
||||
return Err(GetSystemTimesError);
|
||||
}
|
||||
|
||||
result.usageTime = FiletimeToInteger(kernelFiletime) +
|
||||
FiletimeToInteger(userFiletime);
|
||||
result.updateTime = result.usageTime + FiletimeToInteger(idleFiletime);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#endif // XP_WIN
|
||||
|
||||
Result<Ok, CPUUsageWatcherError>
|
||||
CPUUsageWatcher::Init()
|
||||
{
|
||||
mNumCPUs = PR_GetNumberOfProcessors();
|
||||
if (mNumCPUs <= 0) {
|
||||
mExternalUsageThreshold = 1.0f;
|
||||
return Err(GetNumberOfProcessorsError);
|
||||
}
|
||||
mExternalUsageThreshold = std::max(1.0f - 1.0f / (float)mNumCPUs,
|
||||
kTolerableExternalCPUUsageFloor);
|
||||
|
||||
#ifdef CPU_USAGE_WATCHER_ACTIVE
|
||||
|
||||
CPUStats processTimes;
|
||||
MOZ_TRY_VAR(processTimes, GetProcessCPUStats(mNumCPUs));
|
||||
mProcessUpdateTime = processTimes.updateTime;
|
||||
mProcessUsageTime = processTimes.usageTime;
|
||||
|
||||
CPUStats globalTimes;
|
||||
MOZ_TRY_VAR(globalTimes, GetGlobalCPUStats());
|
||||
mGlobalUpdateTime = globalTimes.updateTime;
|
||||
mGlobalUsageTime = globalTimes.usageTime;
|
||||
|
||||
mInitialized = true;
|
||||
|
||||
CPUUsageWatcher* self = this;
|
||||
NS_DispatchToMainThread(
|
||||
NS_NewRunnableFunction("CPUUsageWatcher::Init",
|
||||
[=]() { HangMonitor::RegisterAnnotator(*self); }));
|
||||
|
||||
#endif // CPU_USAGE_WATCHER_ACTIVE
|
||||
return Ok();
|
||||
}
|
||||
|
||||
void
|
||||
CPUUsageWatcher::Uninit()
|
||||
{
|
||||
mInitialized = false;
|
||||
|
||||
#ifdef CPU_USAGE_WATCHER_ACTIVE
|
||||
HangMonitor::UnregisterAnnotator(*this);
|
||||
#endif // CPU_USAGE_WATCHER_ACTIVE
|
||||
}
|
||||
|
||||
Result<Ok, CPUUsageWatcherError>
|
||||
CPUUsageWatcher::CollectCPUUsage()
|
||||
{
|
||||
if (!mInitialized) {
|
||||
return Ok();
|
||||
}
|
||||
|
||||
#ifdef CPU_USAGE_WATCHER_ACTIVE
|
||||
mExternalUsageRatio = 0.0f;
|
||||
|
||||
CPUStats processTimes;
|
||||
MOZ_TRY_VAR(processTimes, GetProcessCPUStats(mNumCPUs));
|
||||
CPUStats globalTimes;
|
||||
MOZ_TRY_VAR(globalTimes, GetGlobalCPUStats());
|
||||
|
||||
uint64_t processUsageDelta = processTimes.usageTime - mProcessUsageTime;
|
||||
uint64_t processUpdateDelta = processTimes.updateTime - mProcessUpdateTime;
|
||||
float processUsageNormalized = processUsageDelta > 0 ?
|
||||
(float)processUsageDelta / (float)processUpdateDelta :
|
||||
0.0f;
|
||||
|
||||
uint64_t globalUsageDelta = globalTimes.usageTime - mGlobalUsageTime;
|
||||
uint64_t globalUpdateDelta = globalTimes.updateTime - mGlobalUpdateTime;
|
||||
float globalUsageNormalized = globalUsageDelta > 0 ?
|
||||
(float)globalUsageDelta / (float)globalUpdateDelta :
|
||||
0.0f;
|
||||
|
||||
mProcessUsageTime = processTimes.usageTime;
|
||||
mProcessUpdateTime = processTimes.updateTime;
|
||||
mGlobalUsageTime = globalTimes.usageTime;
|
||||
mGlobalUpdateTime = globalTimes.updateTime;
|
||||
|
||||
mExternalUsageRatio = std::max(0.0f,
|
||||
globalUsageNormalized - processUsageNormalized);
|
||||
#endif // CPU_USAGE_WATCHER_ACTIVE
|
||||
|
||||
return Ok();
|
||||
}
|
||||
|
||||
void
|
||||
CPUUsageWatcher::AnnotateHang(HangMonitor::HangAnnotations& aAnnotations) {
|
||||
if (!mInitialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (mExternalUsageRatio > mExternalUsageThreshold) {
|
||||
aAnnotations.AddAnnotation(NS_LITERAL_STRING("ExternalCPUHigh"), true);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace mozilla
|
82
xpcom/threads/CPUUsageWatcher.h
Normal file
82
xpcom/threads/CPUUsageWatcher.h
Normal file
@ -0,0 +1,82 @@
|
||||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef mozilla_CPUUsageWatcher_h
|
||||
#define mozilla_CPUUsageWatcher_h
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "mozilla/HangAnnotations.h"
|
||||
|
||||
namespace mozilla {
|
||||
|
||||
enum CPUUsageWatcherError : uint8_t
|
||||
{
|
||||
ClockGetTimeError,
|
||||
GetNumberOfProcessorsError,
|
||||
GetProcessTimesError,
|
||||
GetSystemTimesError,
|
||||
HostStatisticsError,
|
||||
ProcStatError,
|
||||
};
|
||||
|
||||
class CPUUsageHangAnnotator
|
||||
: public HangMonitor::Annotator
|
||||
{
|
||||
public:
|
||||
};
|
||||
|
||||
class CPUUsageWatcher
|
||||
: public HangMonitor::Annotator
|
||||
{
|
||||
public:
|
||||
CPUUsageWatcher()
|
||||
: mInitialized(false)
|
||||
, mExternalUsageThreshold(0)
|
||||
, mExternalUsageRatio(0)
|
||||
, mProcessUsageTime(0)
|
||||
, mProcessUpdateTime(0)
|
||||
, mGlobalUsageTime(0)
|
||||
, mGlobalUpdateTime(0)
|
||||
{}
|
||||
|
||||
Result<Ok, CPUUsageWatcherError> Init();
|
||||
|
||||
void Uninit();
|
||||
|
||||
// Updates necessary values to allow AnnotateHang to function. This must be
|
||||
// called on some semi-regular basis, as it will calculate the mean CPU
|
||||
// usage values between now and the last time it was called.
|
||||
Result<Ok, CPUUsageWatcherError> CollectCPUUsage();
|
||||
|
||||
void AnnotateHang(HangMonitor::HangAnnotations& aAnnotations) final;
|
||||
private:
|
||||
bool mInitialized;
|
||||
// The threshold above which we will mark a hang as occurring under high
|
||||
// external CPU usage conditions
|
||||
float mExternalUsageThreshold;
|
||||
// The CPU usage (0-1) external to our process, averaged between the two
|
||||
// most recent monitor thread runs
|
||||
float mExternalUsageRatio;
|
||||
// The total cumulative CPU usage time by our process as of the last
|
||||
// CollectCPUUsage or Startup
|
||||
uint64_t mProcessUsageTime;
|
||||
// A time value in the same units as mProcessUsageTime used to
|
||||
// determine the ratio of CPU usage time to idle time
|
||||
uint64_t mProcessUpdateTime;
|
||||
// The total cumulative CPU usage time by all processes as of the last
|
||||
// CollectCPUUsage or Startup
|
||||
uint64_t mGlobalUsageTime;
|
||||
// A time value in the same units as mGlobalUsageTime used to
|
||||
// determine the ratio of CPU usage time to idle time
|
||||
uint64_t mGlobalUpdateTime;
|
||||
// The number of virtual cores on our machine
|
||||
uint64_t mNumCPUs;
|
||||
};
|
||||
|
||||
} // namespace mozilla
|
||||
|
||||
#endif // mozilla_CPUUsageWatcher_h
|
@ -40,6 +40,7 @@ EXPORTS.mozilla += [
|
||||
'BlockingResourceBase.h',
|
||||
'CondVar.h',
|
||||
'CooperativeThreadPool.h',
|
||||
'CPUUsageWatcher.h',
|
||||
'DeadlockDetector.h',
|
||||
'EventQueue.h',
|
||||
'HangAnnotations.h',
|
||||
@ -77,6 +78,7 @@ UNIFIED_SOURCES += [
|
||||
'AbstractThread.cpp',
|
||||
'BlockingResourceBase.cpp',
|
||||
'CooperativeThreadPool.cpp',
|
||||
'CPUUsageWatcher.cpp',
|
||||
'EventQueue.cpp',
|
||||
'HangAnnotations.cpp',
|
||||
'HangMonitor.cpp',
|
||||
|
Loading…
Reference in New Issue
Block a user