Skip to content

Commit

Permalink
Bug 1880503 - Generate paired minidump when GPU process is killed fol…
Browse files Browse the repository at this point in the history
…lowing IPC timeout. r=aosmond,gsvelto

When sync IPC under the top-level PCompositorManager protocol does not
reply within a certain time threshold we purposefully kill the GPU
process. While this allows the user to recover from a stuck GPU
process, we have little visibility about the underlying cause.

This patch makes it so that we generate a paired minidump for the GPU
and parent processes prior to killing the GPU process in
GPUProcessHost::KillHard(). The implementation roughly follows the
equivalent for content processes in ContentParent::KillHard().

As the GPU process can be purposefully killed during normal operation,
and because generating minidumps can be expensive, we are careful to
only do so when the new argument aGenerateMinidump is true. We
additionally remove the aReason argument as it is unused (and
currently innacurate in some places).

As these minidumps may not automatically submitted we limit the
minidumps generation to twice per session in order to avoid
accumulating a large number of unsubmitted minidumps on disk.

Differential Revision: https://phabricator.services.mozilla.com/D202166
  • Loading branch information
jamienicol committed May 14, 2024
1 parent 73cb441 commit c8d2c70
Show file tree
Hide file tree
Showing 9 changed files with 80 additions and 22 deletions.
2 changes: 1 addition & 1 deletion dom/ipc/ContentParent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4513,7 +4513,7 @@ void ContentParent::GeneratePairedMinidump(const char* aReason) {
CrashReporter::Annotation::ipc_channel_error, reason);

// Generate the report and insert into the queue for submittal.
if (mCrashReporter->GenerateMinidumpAndPair(this, "browser"_ns)) {
if (mCrashReporter->GenerateMinidumpAndPair(mSubprocess, "browser"_ns)) {
mCrashReporter->FinalizeCrashReport();
mCreatedPairedMinidumps = true;
}
Expand Down
37 changes: 34 additions & 3 deletions gfx/ipc/GPUChild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,33 @@ bool GPUChild::EnsureGPUReady() {

void GPUChild::OnUnexpectedShutdown() { mUnexpectedShutdown = true; }

void GPUChild::GeneratePairedMinidump() {
// At most generate the paired minidumps twice per session in order to
// avoid accumulating a large number of unsubmitted minidumps on disk.
if (mCrashReporter && mNumPairedMinidumpsCreated < 2) {
nsAutoCString additionalDumps("browser");
mCrashReporter->AddAnnotationNSCString(
CrashReporter::Annotation::additional_minidumps, additionalDumps);

nsAutoCString reason("GPUProcessKill");
mCrashReporter->AddAnnotationNSCString(
CrashReporter::Annotation::ipc_channel_error, reason);

if (mCrashReporter->GenerateMinidumpAndPair(mHost, "browser"_ns)) {
mCrashReporter->FinalizeCrashReport();
mCreatedPairedMinidumps = true;
mNumPairedMinidumpsCreated++;
}
}
}

void GPUChild::DeletePairedMinidump() {
if (mCrashReporter && mCreatedPairedMinidumps) {
mCrashReporter->DeleteCrashReport();
mCreatedPairedMinidumps = false;
}
}

mozilla::ipc::IPCResult GPUChild::RecvInitComplete(const GPUDeviceData& aData) {
// We synchronously requested GPU parameters before this arrived.
if (mGPUReady) {
Expand Down Expand Up @@ -292,14 +319,18 @@ mozilla::ipc::IPCResult GPUChild::RecvAddMemoryReport(

void GPUChild::ActorDestroy(ActorDestroyReason aWhy) {
if (aWhy == AbnormalShutdown || mUnexpectedShutdown) {
nsAutoString dumpId;
GenerateCrashReport(OtherPid(), &dumpId);

Telemetry::Accumulate(
Telemetry::SUBPROCESS_ABNORMAL_ABORT,
nsDependentCString(XRE_GeckoProcessTypeToString(GeckoProcessType_GPU)),
1);

nsAutoString dumpId;
if (!mCreatedPairedMinidumps) {
GenerateCrashReport(OtherPid(), &dumpId);
} else if (mCrashReporter) {
dumpId = mCrashReporter->MinidumpID();
}

// Notify the Telemetry environment so that we can refresh and do a
// subsession split. This also notifies the crash reporter on geckoview.
if (nsCOMPtr<nsIObserverService> obsvc = services::GetObserverService()) {
Expand Down
15 changes: 15 additions & 0 deletions gfx/ipc/GPUChild.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ class GPUChild final : public ipc::CrashReporterHelper<GeckoProcessType_GPU>,
// abnormal.
void OnUnexpectedShutdown();

// Generates a minidump for the GPU process paired with one for the main
// process. Called prior to force-killing the process.
void GeneratePairedMinidump();

// Deletes a minidump created with GeneratePairedMinidump(). Should be called
// if killing the process fails after generating the minidump.
void DeletePairedMinidump();

// gfxVarReceiver overrides.
void OnVarChanged(const GfxVarUpdate& aVar) override;

Expand Down Expand Up @@ -102,6 +110,13 @@ class GPUChild final : public ipc::CrashReporterHelper<GeckoProcessType_GPU>,
bool mGPUReady;
bool mWaitForVarUpdate = false;
bool mUnexpectedShutdown = false;
// Whether a paired minidump has already been generated, meaning we do not
// need to create a crash report in ActorDestroy().
bool mCreatedPairedMinidumps = false;
// The number of paired minidumps that have been created during this session.
// Used to ensure we do not accumulate a large number of minidumps on disk
// that may never be submitted.
int mNumPairedMinidumpsCreated = 0;
};

} // namespace gfx
Expand Down
17 changes: 13 additions & 4 deletions gfx/ipc/GPUProcessHost.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ void GPUProcessHost::Shutdown(bool aUnexpectedShutdown) {
#ifndef NS_FREE_PERMANENT_DATA
// No need to communicate shutdown, the GPU process doesn't need to
// communicate anything back.
KillHard("NormalShutdown");
KillHard(/* aGenerateMinidump */ false);
#endif

// If we're shutting down unexpectedly, we're in the middle of handling an
Expand Down Expand Up @@ -211,9 +211,16 @@ void GPUProcessHost::OnChannelClosed() {
MOZ_ASSERT(!mGPUChild);
}

void GPUProcessHost::KillHard(const char* aReason) {
ProcessHandle handle = GetChildProcessHandle();
void GPUProcessHost::KillHard(bool aGenerateMinidump) {
if (mGPUChild && aGenerateMinidump) {
mGPUChild->GeneratePairedMinidump();
}

const ProcessHandle handle = GetChildProcessHandle();
if (!base::KillProcess(handle, base::PROCESS_END_KILLED_BY_USER)) {
if (mGPUChild) {
mGPUChild->DeletePairedMinidump();
}
NS_WARNING("failed to kill subprocess!");
}

Expand All @@ -222,7 +229,9 @@ void GPUProcessHost::KillHard(const char* aReason) {

uint64_t GPUProcessHost::GetProcessToken() const { return mProcessToken; }

void GPUProcessHost::KillProcess() { KillHard("DiagnosticKill"); }
void GPUProcessHost::KillProcess(bool aGenerateMinidump) {
KillHard(aGenerateMinidump);
}

void GPUProcessHost::CrashProcess() { mGPUChild->SendCrashProcess(); }

Expand Down
7 changes: 4 additions & 3 deletions gfx/ipc/GPUProcessHost.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,9 @@ class GPUProcessHost final : public mozilla::ipc::GeckoChildProcessHost {

void SetListener(Listener* aListener);

// Kills the GPU process. Used for tests and diagnostics
void KillProcess();
// Kills the GPU process. Used in normal operation to recover from an error,
// as well as for tests and diagnostics.
void KillProcess(bool aGenerateMinidump);

// Causes the GPU process to crash. Used for tests and diagnostics
void CrashProcess();
Expand All @@ -131,7 +132,7 @@ class GPUProcessHost final : public mozilla::ipc::GeckoChildProcessHost {
void OnChannelClosed();

// Kill the remote process, triggering IPC shutdown.
void KillHard(const char* aReason);
void KillHard(bool aGenerateMinidump);

void DestroyProcess();

Expand Down
6 changes: 3 additions & 3 deletions gfx/ipc/GPUProcessManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ bool GPUProcessManager::DisableWebRenderConfig(wr::WebRenderError aError,
// hopefully alleviate the situation.
if (IsProcessStable(TimeStamp::Now())) {
if (mProcess) {
mProcess->KillProcess();
mProcess->KillProcess(/* aGenerateMinidump */ false);
} else {
SimulateDeviceReset();
}
Expand Down Expand Up @@ -1121,12 +1121,12 @@ void GPUProcessManager::CleanShutdown() {
mVsyncIOThread = nullptr;
}

void GPUProcessManager::KillProcess() {
void GPUProcessManager::KillProcess(bool aGenerateMinidump) {
if (!mProcess) {
return;
}

mProcess->KillProcess();
mProcess->KillProcess(aGenerateMinidump);
}

void GPUProcessManager::CrashProcess() {
Expand Down
5 changes: 3 additions & 2 deletions gfx/ipc/GPUProcessManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,9 @@ class GPUProcessManager final : public GPUProcessHost::Listener {
// true if the message was sent, false if not.
bool NotifyGpuObservers(const char* aTopic);

// Kills the GPU process. Used for tests and diagnostics
void KillProcess();
// Kills the GPU process. Used in normal operation to recover from an error,
// as well as for tests and diagnostics.
void KillProcess(bool aGenerateMinidump = false);

// Causes the GPU process to crash. Used for tests and diagnostics
void CrashProcess();
Expand Down
2 changes: 1 addition & 1 deletion gfx/layers/ipc/CompositorManagerChild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ bool CompositorManagerChild::ShouldContinueFromReplyTimeout() {
if (XRE_IsParentProcess()) {
gfxCriticalNote << "Killing GPU process due to IPC reply timeout";
MOZ_DIAGNOSTIC_ASSERT(GPUProcessManager::Get()->GetGPUChild());
GPUProcessManager::Get()->KillProcess();
GPUProcessManager::Get()->KillProcess(/* aGenerateMinidump */ true);
}
return false;
}
Expand Down
11 changes: 6 additions & 5 deletions ipc/glue/CrashReporterHost.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
#include "mozilla/UniquePtr.h"
#include "base/process.h"
#include "nsExceptionHandler.h"
#include "nsIFile.h"
#include "nsThreadUtils.h"
#include "mozilla/ipc/GeckoChildProcessHost.h"
#include "mozilla/ipc/ProtocolUtils.h"

namespace mozilla {
Expand Down Expand Up @@ -55,8 +57,7 @@ class CrashReporterHost {
// GenerateCrashReport does. After this, FinalizeCrashReport may be called.
//
// This calls TakeCrashedChildMinidump and FinalizeCrashReport.
template <typename Toplevel>
bool GenerateMinidumpAndPair(Toplevel* aToplevelProtocol,
bool GenerateMinidumpAndPair(GeckoChildProcessHost* aChildProcessHost,
const nsACString& aPairName) {
auto childHandle = base::kInvalidProcessHandle;
const auto cleanup = MakeScopeExit([&]() {
Expand All @@ -65,10 +66,10 @@ class CrashReporterHost {
}
});
#ifdef XP_MACOSX
childHandle = aToplevelProtocol->Process()->GetChildTask();
childHandle = aChildProcessHost->GetChildTask();
#else
if (!base::OpenPrivilegedProcessHandle(aToplevelProtocol->OtherPid(),
&childHandle)) {
if (!base::OpenPrivilegedProcessHandle(
aChildProcessHost->GetChildProcessId(), &childHandle)) {
NS_WARNING("Failed to open child process handle.");
return false;
}
Expand Down

0 comments on commit c8d2c70

Please sign in to comment.