gecko-dev/xpcom/threads/HangMonitor.cpp
Markus Stange 4bdc4ca0fb Bug 1323100 - Register most of the remaining threadfunc threads with the profiler. r=froydnj
As far as I can tell, this covers all the remaining threads which we start
using PR_CreateThread, except the ones that are created inside NSPR or NSS,
and except for the Shutdown Watchdog thread in nsTerminator.cpp and the
CacheIO thread. The Shutdown Watchdog thread stays alive past leak detection
during shutdown (by design), so we'd report leaks if we profiled it. The
CacheIO thread seems to stay alive past shutdown leak detection sometimes as
well.

This adds a AutoProfilerRegister stack class for easy registering and
unregistering. There are a few places where we still call
profiler_register_thread() and profiler_unregister_thread() manually, either
because registration happens conditionally, or because there is a variable that
gets put on the stack before the AutoProfilerRegister (e.g. a dynamically
generated thread name). AutoProfilerRegister needs to be the first object on
the stack because it uses its own `this` pointer as the stack top address.

MozReview-Commit-ID: 3vwhS55Yzt

--HG--
extra : rebase_source : 56dd27282e7bd09a7e7dc7ca09ccfe3a0198e7af
2017-01-05 16:34:26 +01:00

437 lines
12 KiB
C++

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "mozilla/HangMonitor.h"
#include "mozilla/Atomics.h"
#include "mozilla/BackgroundHangMonitor.h"
#include "mozilla/Monitor.h"
#include "mozilla/Preferences.h"
#include "mozilla/ProcessedStack.h"
#include "mozilla/Telemetry.h"
#include "mozilla/StaticPtr.h"
#include "mozilla/UniquePtr.h"
#include "nsReadableUtils.h"
#include "mozilla/StackWalk.h"
#ifdef _WIN64
#include "mozilla/StackWalk_windows.h"
#endif
#include "nsThreadUtils.h"
#include "nsXULAppAPI.h"
#include "GeckoProfiler.h"
#ifdef MOZ_CRASHREPORTER
#include "nsExceptionHandler.h"
#endif
#ifdef XP_WIN
#include <windows.h>
#endif
#if defined(MOZ_ENABLE_PROFILER_SPS) && defined(MOZ_PROFILING) && defined(XP_WIN)
#define REPORT_CHROME_HANGS
#endif
namespace mozilla {
namespace HangMonitor {
/**
* A flag which may be set from within a debugger to disable the hang
* monitor.
*/
volatile bool gDebugDisableHangMonitor = false;
const char kHangMonitorPrefName[] = "hangmonitor.timeout";
#ifdef REPORT_CHROME_HANGS
const char kTelemetryPrefName[] = "toolkit.telemetry.enabled";
#endif
// Monitor protects gShutdown and gTimeout, but not gTimestamp which rely on
// being atomically set by the processor; synchronization doesn't really matter
// in this use case.
Monitor* gMonitor;
// The timeout preference, in seconds.
int32_t gTimeout;
PRThread* gThread;
// Set when shutdown begins to signal the thread to exit immediately.
bool gShutdown;
// The timestamp of the last event notification, or PR_INTERVAL_NO_WAIT if
// we're currently not processing events.
Atomic<PRIntervalTime> gTimestamp(PR_INTERVAL_NO_WAIT);
#ifdef REPORT_CHROME_HANGS
// Main thread ID used in reporting chrome hangs under Windows
static HANDLE winMainThreadHandle = nullptr;
// Default timeout for reporting chrome hangs to Telemetry (5 seconds)
static const int32_t DEFAULT_CHROME_HANG_INTERVAL = 5;
// Maximum number of PCs to gather from the stack
static const int32_t MAX_CALL_STACK_PCS = 400;
#endif
// PrefChangedFunc
void
PrefChanged(const char*, void*)
{
int32_t newval = Preferences::GetInt(kHangMonitorPrefName);
#ifdef REPORT_CHROME_HANGS
// Monitor chrome hangs on the profiling branch if Telemetry enabled
if (newval == 0) {
bool telemetryEnabled = Preferences::GetBool(kTelemetryPrefName);
if (telemetryEnabled) {
newval = DEFAULT_CHROME_HANG_INTERVAL;
}
}
#endif
MonitorAutoLock lock(*gMonitor);
if (newval != gTimeout) {
gTimeout = newval;
lock.Notify();
}
}
void
Crash()
{
if (gDebugDisableHangMonitor) {
return;
}
#ifdef XP_WIN
if (::IsDebuggerPresent()) {
return;
}
#endif
#ifdef MOZ_CRASHREPORTER
// If you change this, you must also deal with the threadsafety of AnnotateCrashReport in
// non-chrome processes!
if (GeckoProcessType_Default == XRE_GetProcessType()) {
CrashReporter::AnnotateCrashReport(NS_LITERAL_CSTRING("Hang"),
NS_LITERAL_CSTRING("1"));
}
#endif
MOZ_CRASH("HangMonitor triggered");
}
#ifdef REPORT_CHROME_HANGS
static void
ChromeStackWalker(uint32_t aFrameNumber, void* aPC, void* aSP, void* aClosure)
{
MOZ_ASSERT(aClosure);
std::vector<uintptr_t>* stack =
static_cast<std::vector<uintptr_t>*>(aClosure);
if (stack->size() == MAX_CALL_STACK_PCS) {
return;
}
MOZ_ASSERT(stack->size() < MAX_CALL_STACK_PCS);
stack->push_back(reinterpret_cast<uintptr_t>(aPC));
}
static void
GetChromeHangReport(Telemetry::ProcessedStack& aStack,
int32_t& aSystemUptime,
int32_t& aFirefoxUptime)
{
MOZ_ASSERT(winMainThreadHandle);
// The thread we're about to suspend might have the alloc lock
// so allocate ahead of time
std::vector<uintptr_t> rawStack;
rawStack.reserve(MAX_CALL_STACK_PCS);
// Workaround possible deadlock where the main thread is running a
// long-standing JS job, and happens to be in the JIT allocator when we
// suspend it. Since, on win 64, this requires holding a process lock that
// MozStackWalk requires, take this "workaround lock" to avoid deadlock.
#ifdef _WIN64
AcquireStackWalkWorkaroundLock();
#endif
DWORD ret = ::SuspendThread(winMainThreadHandle);
bool suspended = false;
if (ret != -1) {
// SuspendThread is asynchronous, so the thread may still be running. Use
// GetThreadContext to ensure it's really suspended.
// See https://blogs.msdn.microsoft.com/oldnewthing/20150205-00/?p=44743.
CONTEXT context;
context.ContextFlags = CONTEXT_CONTROL;
if (::GetThreadContext(winMainThreadHandle, &context)) {
suspended = true;
}
}
#ifdef _WIN64
ReleaseStackWalkWorkaroundLock();
#endif
if (!suspended) {
if (ret != -1) {
MOZ_ALWAYS_TRUE(::ResumeThread(winMainThreadHandle) != DWORD(-1));
}
return;
}
MozStackWalk(ChromeStackWalker, /* skipFrames */ 0, /* maxFrames */ 0,
reinterpret_cast<void*>(&rawStack),
reinterpret_cast<uintptr_t>(winMainThreadHandle), nullptr);
ret = ::ResumeThread(winMainThreadHandle);
if (ret == -1) {
return;
}
aStack = Telemetry::GetStackAndModules(rawStack);
// Record system uptime (in minutes) at the time of the hang
aSystemUptime = ((GetTickCount() / 1000) - (gTimeout * 2)) / 60;
// Record Firefox uptime (in minutes) at the time of the hang
bool error;
TimeStamp processCreation = TimeStamp::ProcessCreation(error);
if (!error) {
TimeDuration td = TimeStamp::Now() - processCreation;
aFirefoxUptime = (static_cast<int32_t>(td.ToSeconds()) - (gTimeout * 2)) / 60;
} else {
aFirefoxUptime = -1;
}
}
#endif
void
ThreadMain(void*)
{
AutoProfilerRegister registerThread("Hang Monitor");
PR_SetCurrentThreadName("Hang Monitor");
MonitorAutoLock lock(*gMonitor);
// In order to avoid issues with the hang monitor incorrectly triggering
// during a general system stop such as sleeping, the monitor thread must
// run twice to trigger hang protection.
PRIntervalTime lastTimestamp = 0;
int waitCount = 0;
#ifdef REPORT_CHROME_HANGS
Telemetry::ProcessedStack stack;
int32_t systemUptime = -1;
int32_t firefoxUptime = -1;
UniquePtr<HangAnnotations> annotations;
#endif
while (true) {
if (gShutdown) {
return; // Exit the thread
}
// avoid rereading the volatile value in this loop
PRIntervalTime timestamp = gTimestamp;
PRIntervalTime now = PR_IntervalNow();
if (timestamp != PR_INTERVAL_NO_WAIT &&
now < timestamp) {
// 32-bit overflow, reset for another waiting period
timestamp = 1; // lowest legal PRInterval value
}
if (timestamp != PR_INTERVAL_NO_WAIT &&
timestamp == lastTimestamp &&
gTimeout > 0) {
++waitCount;
#ifdef REPORT_CHROME_HANGS
// Capture the chrome-hang stack + Firefox & system uptimes after
// the minimum hang duration has been reached (not when the hang ends)
if (waitCount == 2) {
GetChromeHangReport(stack, systemUptime, firefoxUptime);
annotations = ChromeHangAnnotatorCallout();
}
#else
// This is the crash-on-hang feature.
// See bug 867313 for the quirk in the waitCount comparison
if (waitCount >= 2) {
int32_t delay =
int32_t(PR_IntervalToSeconds(now - timestamp));
if (delay >= gTimeout) {
MonitorAutoUnlock unlock(*gMonitor);
Crash();
}
}
#endif
} else {
#ifdef REPORT_CHROME_HANGS
if (waitCount >= 2) {
uint32_t hangDuration = PR_IntervalToSeconds(now - lastTimestamp);
Telemetry::RecordChromeHang(hangDuration, stack, systemUptime,
firefoxUptime, Move(annotations));
stack.Clear();
}
#endif
lastTimestamp = timestamp;
waitCount = 0;
}
PRIntervalTime timeout;
if (gTimeout <= 0) {
timeout = PR_INTERVAL_NO_TIMEOUT;
} else {
timeout = PR_MillisecondsToInterval(gTimeout * 500);
}
lock.Wait(timeout);
}
}
void
Startup()
{
if (GeckoProcessType_Default != XRE_GetProcessType() &&
GeckoProcessType_Content != XRE_GetProcessType()) {
return;
}
MOZ_ASSERT(!gMonitor, "Hang monitor already initialized");
gMonitor = new Monitor("HangMonitor");
Preferences::RegisterCallback(PrefChanged, kHangMonitorPrefName, nullptr);
PrefChanged(nullptr, nullptr);
#ifdef REPORT_CHROME_HANGS
Preferences::RegisterCallback(PrefChanged, kTelemetryPrefName, nullptr);
winMainThreadHandle =
OpenThread(THREAD_ALL_ACCESS, FALSE, GetCurrentThreadId());
if (!winMainThreadHandle) {
return;
}
#endif
// Don't actually start measuring hangs until we hit the main event loop.
// This potentially misses a small class of really early startup hangs,
// but avoids dealing with some xpcshell tests and other situations which
// start XPCOM but don't ever start the event loop.
Suspend();
gThread = PR_CreateThread(PR_USER_THREAD,
ThreadMain,
nullptr, PR_PRIORITY_LOW, PR_GLOBAL_THREAD,
PR_JOINABLE_THREAD, 0);
}
void
Shutdown()
{
if (GeckoProcessType_Default != XRE_GetProcessType() &&
GeckoProcessType_Content != XRE_GetProcessType()) {
return;
}
MOZ_ASSERT(gMonitor, "Hang monitor not started");
{
// Scope the lock we're going to delete later
MonitorAutoLock lock(*gMonitor);
gShutdown = true;
lock.Notify();
}
// thread creation could theoretically fail
if (gThread) {
PR_JoinThread(gThread);
gThread = nullptr;
}
delete gMonitor;
gMonitor = nullptr;
}
static bool
IsUIMessageWaiting()
{
#ifndef XP_WIN
return false;
#else
#define NS_WM_IMEFIRST WM_IME_SETCONTEXT
#define NS_WM_IMELAST WM_IME_KEYUP
BOOL haveUIMessageWaiting = FALSE;
MSG msg;
haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_KEYFIRST,
WM_IME_KEYLAST, PM_NOREMOVE);
haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, NS_WM_IMEFIRST,
NS_WM_IMELAST, PM_NOREMOVE);
haveUIMessageWaiting |= ::PeekMessageW(&msg, nullptr, WM_MOUSEFIRST,
WM_MOUSELAST, PM_NOREMOVE);
return haveUIMessageWaiting;
#endif
}
void
NotifyActivity(ActivityType aActivityType)
{
MOZ_ASSERT(NS_IsMainThread(),
"HangMonitor::Notify called from off the main thread.");
// Determine the activity type more specifically
if (aActivityType == kGeneralActivity) {
aActivityType = IsUIMessageWaiting() ? kActivityUIAVail :
kActivityNoUIAVail;
}
// Calculate the cumulative amount of lag time since the last UI message
static uint32_t cumulativeUILagMS = 0;
switch (aActivityType) {
case kActivityNoUIAVail:
cumulativeUILagMS = 0;
break;
case kActivityUIAVail:
case kUIActivity:
if (gTimestamp != PR_INTERVAL_NO_WAIT) {
cumulativeUILagMS += PR_IntervalToMilliseconds(PR_IntervalNow() -
gTimestamp);
}
break;
default:
break;
}
// This is not a locked activity because PRTimeStamp is a 32-bit quantity
// which can be read/written atomically, and we don't want to pay locking
// penalties here.
gTimestamp = PR_IntervalNow();
// If we have UI activity we should reset the timer and report it
if (aActivityType == kUIActivity) {
mozilla::Telemetry::Accumulate(mozilla::Telemetry::EVENTLOOP_UI_ACTIVITY_EXP_MS,
cumulativeUILagMS);
cumulativeUILagMS = 0;
}
if (gThread && !gShutdown) {
mozilla::BackgroundHangMonitor().NotifyActivity();
}
}
void
Suspend()
{
MOZ_ASSERT(NS_IsMainThread(),
"HangMonitor::Suspend called from off the main thread.");
// Because gTimestamp changes this resets the wait count.
gTimestamp = PR_INTERVAL_NO_WAIT;
if (gThread && !gShutdown) {
mozilla::BackgroundHangMonitor().NotifyWait();
}
}
} // namespace HangMonitor
} // namespace mozilla