Bug 1880503 - Generate paired minidump when GPU process is killed following IPC timeout. r=aosmond,gsvelto

When sync IPC under the top-level PCompositorManager protocol does not
reply within a certain time threshold we purposefully kill the GPU
process. While this allows the user to recover from a stuck GPU
process, we have little visibility about the underlying cause.

This patch makes it so that we generate a paired minidump for the GPU
and parent processes prior to killing the GPU process in
GPUProcessHost::KillHard(). The implementation roughly follows the
equivalent for content processes in ContentParent::KillHard().

As the GPU process can be purposefully killed during normal operation,
and because generating minidumps can be expensive, we are careful to
only do so when the new argument aGenerateMinidump is true. We
additionally remove the aReason argument as it is unused (and
currently innacurate in some places).

As these minidumps may not automatically submitted we limit the
minidumps generation to twice per session in order to avoid
accumulating a large number of unsubmitted minidumps on disk.

Differential Revision: https://phabricator.services.mozilla.com/D202166
This commit is contained in:
Jamie Nicol 2024-05-14 12:58:35 +00:00
parent 1f35ae62aa
commit 0056a7ee56
9 changed files with 80 additions and 22 deletions

View File

@ -4513,7 +4513,7 @@ void ContentParent::GeneratePairedMinidump(const char* aReason) {
CrashReporter::Annotation::ipc_channel_error, reason);
// Generate the report and insert into the queue for submittal.
if (mCrashReporter->GenerateMinidumpAndPair(this, "browser"_ns)) {
if (mCrashReporter->GenerateMinidumpAndPair(mSubprocess, "browser"_ns)) {
mCrashReporter->FinalizeCrashReport();
mCreatedPairedMinidumps = true;
}

View File

@ -111,6 +111,33 @@ bool GPUChild::EnsureGPUReady() {
void GPUChild::OnUnexpectedShutdown() { mUnexpectedShutdown = true; }
void GPUChild::GeneratePairedMinidump() {
// At most generate the paired minidumps twice per session in order to
// avoid accumulating a large number of unsubmitted minidumps on disk.
if (mCrashReporter && mNumPairedMinidumpsCreated < 2) {
nsAutoCString additionalDumps("browser");
mCrashReporter->AddAnnotationNSCString(
CrashReporter::Annotation::additional_minidumps, additionalDumps);
nsAutoCString reason("GPUProcessKill");
mCrashReporter->AddAnnotationNSCString(
CrashReporter::Annotation::ipc_channel_error, reason);
if (mCrashReporter->GenerateMinidumpAndPair(mHost, "browser"_ns)) {
mCrashReporter->FinalizeCrashReport();
mCreatedPairedMinidumps = true;
mNumPairedMinidumpsCreated++;
}
}
}
void GPUChild::DeletePairedMinidump() {
if (mCrashReporter && mCreatedPairedMinidumps) {
mCrashReporter->DeleteCrashReport();
mCreatedPairedMinidumps = false;
}
}
mozilla::ipc::IPCResult GPUChild::RecvInitComplete(const GPUDeviceData& aData) {
// We synchronously requested GPU parameters before this arrived.
if (mGPUReady) {
@ -292,14 +319,18 @@ mozilla::ipc::IPCResult GPUChild::RecvAddMemoryReport(
void GPUChild::ActorDestroy(ActorDestroyReason aWhy) {
if (aWhy == AbnormalShutdown || mUnexpectedShutdown) {
nsAutoString dumpId;
GenerateCrashReport(OtherPid(), &dumpId);
Telemetry::Accumulate(
Telemetry::SUBPROCESS_ABNORMAL_ABORT,
nsDependentCString(XRE_GeckoProcessTypeToString(GeckoProcessType_GPU)),
1);
nsAutoString dumpId;
if (!mCreatedPairedMinidumps) {
GenerateCrashReport(OtherPid(), &dumpId);
} else if (mCrashReporter) {
dumpId = mCrashReporter->MinidumpID();
}
// Notify the Telemetry environment so that we can refresh and do a
// subsession split. This also notifies the crash reporter on geckoview.
if (nsCOMPtr<nsIObserverService> obsvc = services::GetObserverService()) {

View File

@ -43,6 +43,14 @@ class GPUChild final : public ipc::CrashReporterHelper<GeckoProcessType_GPU>,
// abnormal.
void OnUnexpectedShutdown();
// Generates a minidump for the GPU process paired with one for the main
// process. Called prior to force-killing the process.
void GeneratePairedMinidump();
// Deletes a minidump created with GeneratePairedMinidump(). Should be called
// if killing the process fails after generating the minidump.
void DeletePairedMinidump();
// gfxVarReceiver overrides.
void OnVarChanged(const GfxVarUpdate& aVar) override;
@ -102,6 +110,13 @@ class GPUChild final : public ipc::CrashReporterHelper<GeckoProcessType_GPU>,
bool mGPUReady;
bool mWaitForVarUpdate = false;
bool mUnexpectedShutdown = false;
// Whether a paired minidump has already been generated, meaning we do not
// need to create a crash report in ActorDestroy().
bool mCreatedPairedMinidumps = false;
// The number of paired minidumps that have been created during this session.
// Used to ensure we do not accumulate a large number of minidumps on disk
// that may never be submitted.
int mNumPairedMinidumpsCreated = 0;
};
} // namespace gfx

View File

@ -181,7 +181,7 @@ void GPUProcessHost::Shutdown(bool aUnexpectedShutdown) {
#ifndef NS_FREE_PERMANENT_DATA
// No need to communicate shutdown, the GPU process doesn't need to
// communicate anything back.
KillHard("NormalShutdown");
KillHard(/* aGenerateMinidump */ false);
#endif
// If we're shutting down unexpectedly, we're in the middle of handling an
@ -211,9 +211,16 @@ void GPUProcessHost::OnChannelClosed() {
MOZ_ASSERT(!mGPUChild);
}
void GPUProcessHost::KillHard(const char* aReason) {
ProcessHandle handle = GetChildProcessHandle();
void GPUProcessHost::KillHard(bool aGenerateMinidump) {
if (mGPUChild && aGenerateMinidump) {
mGPUChild->GeneratePairedMinidump();
}
const ProcessHandle handle = GetChildProcessHandle();
if (!base::KillProcess(handle, base::PROCESS_END_KILLED_BY_USER)) {
if (mGPUChild) {
mGPUChild->DeletePairedMinidump();
}
NS_WARNING("failed to kill subprocess!");
}
@ -222,7 +229,9 @@ void GPUProcessHost::KillHard(const char* aReason) {
uint64_t GPUProcessHost::GetProcessToken() const { return mProcessToken; }
void GPUProcessHost::KillProcess() { KillHard("DiagnosticKill"); }
void GPUProcessHost::KillProcess(bool aGenerateMinidump) {
KillHard(aGenerateMinidump);
}
void GPUProcessHost::CrashProcess() { mGPUChild->SendCrashProcess(); }

View File

@ -107,8 +107,9 @@ class GPUProcessHost final : public mozilla::ipc::GeckoChildProcessHost {
void SetListener(Listener* aListener);
// Kills the GPU process. Used for tests and diagnostics
void KillProcess();
// Kills the GPU process. Used in normal operation to recover from an error,
// as well as for tests and diagnostics.
void KillProcess(bool aGenerateMinidump);
// Causes the GPU process to crash. Used for tests and diagnostics
void CrashProcess();
@ -131,7 +132,7 @@ class GPUProcessHost final : public mozilla::ipc::GeckoChildProcessHost {
void OnChannelClosed();
// Kill the remote process, triggering IPC shutdown.
void KillHard(const char* aReason);
void KillHard(bool aGenerateMinidump);
void DestroyProcess();

View File

@ -715,7 +715,7 @@ bool GPUProcessManager::DisableWebRenderConfig(wr::WebRenderError aError,
// hopefully alleviate the situation.
if (IsProcessStable(TimeStamp::Now())) {
if (mProcess) {
mProcess->KillProcess();
mProcess->KillProcess(/* aGenerateMinidump */ false);
} else {
SimulateDeviceReset();
}
@ -1121,12 +1121,12 @@ void GPUProcessManager::CleanShutdown() {
mVsyncIOThread = nullptr;
}
void GPUProcessManager::KillProcess() {
void GPUProcessManager::KillProcess(bool aGenerateMinidump) {
if (!mProcess) {
return;
}
mProcess->KillProcess();
mProcess->KillProcess(aGenerateMinidump);
}
void GPUProcessManager::CrashProcess() {

View File

@ -184,8 +184,9 @@ class GPUProcessManager final : public GPUProcessHost::Listener {
// true if the message was sent, false if not.
bool NotifyGpuObservers(const char* aTopic);
// Kills the GPU process. Used for tests and diagnostics
void KillProcess();
// Kills the GPU process. Used in normal operation to recover from an error,
// as well as for tests and diagnostics.
void KillProcess(bool aGenerateMinidump = false);
// Causes the GPU process to crash. Used for tests and diagnostics
void CrashProcess();

View File

@ -249,7 +249,7 @@ bool CompositorManagerChild::ShouldContinueFromReplyTimeout() {
if (XRE_IsParentProcess()) {
gfxCriticalNote << "Killing GPU process due to IPC reply timeout";
MOZ_DIAGNOSTIC_ASSERT(GPUProcessManager::Get()->GetGPUChild());
GPUProcessManager::Get()->KillProcess();
GPUProcessManager::Get()->KillProcess(/* aGenerateMinidump */ true);
}
return false;
}

View File

@ -12,7 +12,9 @@
#include "mozilla/UniquePtr.h"
#include "base/process.h"
#include "nsExceptionHandler.h"
#include "nsIFile.h"
#include "nsThreadUtils.h"
#include "mozilla/ipc/GeckoChildProcessHost.h"
#include "mozilla/ipc/ProtocolUtils.h"
namespace mozilla {
@ -55,8 +57,7 @@ class CrashReporterHost {
// GenerateCrashReport does. After this, FinalizeCrashReport may be called.
//
// This calls TakeCrashedChildMinidump and FinalizeCrashReport.
template <typename Toplevel>
bool GenerateMinidumpAndPair(Toplevel* aToplevelProtocol,
bool GenerateMinidumpAndPair(GeckoChildProcessHost* aChildProcessHost,
const nsACString& aPairName) {
auto childHandle = base::kInvalidProcessHandle;
const auto cleanup = MakeScopeExit([&]() {
@ -65,10 +66,10 @@ class CrashReporterHost {
}
});
#ifdef XP_MACOSX
childHandle = aToplevelProtocol->Process()->GetChildTask();
childHandle = aChildProcessHost->GetChildTask();
#else
if (!base::OpenPrivilegedProcessHandle(aToplevelProtocol->OtherPid(),
&childHandle)) {
if (!base::OpenPrivilegedProcessHandle(
aChildProcessHost->GetChildProcessId(), &childHandle)) {
NS_WARNING("Failed to open child process handle.");
return false;
}