From c8d2c7079e780538c0ed4fc324972bcc7d975e3b Mon Sep 17 00:00:00 2001 From: Jamie Nicol Date: Tue, 14 May 2024 12:58:35 +0000 Subject: [PATCH] Bug 1880503 - Generate paired minidump when GPU process is killed following IPC timeout. r=aosmond,gsvelto When sync IPC under the top-level PCompositorManager protocol does not reply within a certain time threshold we purposefully kill the GPU process. While this allows the user to recover from a stuck GPU process, we have little visibility about the underlying cause. This patch makes it so that we generate a paired minidump for the GPU and parent processes prior to killing the GPU process in GPUProcessHost::KillHard(). The implementation roughly follows the equivalent for content processes in ContentParent::KillHard(). As the GPU process can be purposefully killed during normal operation, and because generating minidumps can be expensive, we are careful to only do so when the new argument aGenerateMinidump is true. We additionally remove the aReason argument as it is unused (and currently innacurate in some places). As these minidumps may not automatically submitted we limit the minidumps generation to twice per session in order to avoid accumulating a large number of unsubmitted minidumps on disk. Differential Revision: https://phabricator.services.mozilla.com/D202166 --- dom/ipc/ContentParent.cpp | 2 +- gfx/ipc/GPUChild.cpp | 37 +++++++++++++++++++++-- gfx/ipc/GPUChild.h | 15 +++++++++ gfx/ipc/GPUProcessHost.cpp | 17 ++++++++--- gfx/ipc/GPUProcessHost.h | 7 +++-- gfx/ipc/GPUProcessManager.cpp | 6 ++-- gfx/ipc/GPUProcessManager.h | 5 +-- gfx/layers/ipc/CompositorManagerChild.cpp | 2 +- ipc/glue/CrashReporterHost.h | 11 ++++--- 9 files changed, 80 insertions(+), 22 deletions(-) diff --git a/dom/ipc/ContentParent.cpp b/dom/ipc/ContentParent.cpp index 2070f4ab3c44..583f1b6192af 100644 --- a/dom/ipc/ContentParent.cpp +++ b/dom/ipc/ContentParent.cpp @@ -4513,7 +4513,7 @@ void ContentParent::GeneratePairedMinidump(const char* aReason) { CrashReporter::Annotation::ipc_channel_error, reason); // Generate the report and insert into the queue for submittal. - if (mCrashReporter->GenerateMinidumpAndPair(this, "browser"_ns)) { + if (mCrashReporter->GenerateMinidumpAndPair(mSubprocess, "browser"_ns)) { mCrashReporter->FinalizeCrashReport(); mCreatedPairedMinidumps = true; } diff --git a/gfx/ipc/GPUChild.cpp b/gfx/ipc/GPUChild.cpp index 2034b206e329..ea7025977436 100644 --- a/gfx/ipc/GPUChild.cpp +++ b/gfx/ipc/GPUChild.cpp @@ -111,6 +111,33 @@ bool GPUChild::EnsureGPUReady() { void GPUChild::OnUnexpectedShutdown() { mUnexpectedShutdown = true; } +void GPUChild::GeneratePairedMinidump() { + // At most generate the paired minidumps twice per session in order to + // avoid accumulating a large number of unsubmitted minidumps on disk. + if (mCrashReporter && mNumPairedMinidumpsCreated < 2) { + nsAutoCString additionalDumps("browser"); + mCrashReporter->AddAnnotationNSCString( + CrashReporter::Annotation::additional_minidumps, additionalDumps); + + nsAutoCString reason("GPUProcessKill"); + mCrashReporter->AddAnnotationNSCString( + CrashReporter::Annotation::ipc_channel_error, reason); + + if (mCrashReporter->GenerateMinidumpAndPair(mHost, "browser"_ns)) { + mCrashReporter->FinalizeCrashReport(); + mCreatedPairedMinidumps = true; + mNumPairedMinidumpsCreated++; + } + } +} + +void GPUChild::DeletePairedMinidump() { + if (mCrashReporter && mCreatedPairedMinidumps) { + mCrashReporter->DeleteCrashReport(); + mCreatedPairedMinidumps = false; + } +} + mozilla::ipc::IPCResult GPUChild::RecvInitComplete(const GPUDeviceData& aData) { // We synchronously requested GPU parameters before this arrived. if (mGPUReady) { @@ -292,14 +319,18 @@ mozilla::ipc::IPCResult GPUChild::RecvAddMemoryReport( void GPUChild::ActorDestroy(ActorDestroyReason aWhy) { if (aWhy == AbnormalShutdown || mUnexpectedShutdown) { - nsAutoString dumpId; - GenerateCrashReport(OtherPid(), &dumpId); - Telemetry::Accumulate( Telemetry::SUBPROCESS_ABNORMAL_ABORT, nsDependentCString(XRE_GeckoProcessTypeToString(GeckoProcessType_GPU)), 1); + nsAutoString dumpId; + if (!mCreatedPairedMinidumps) { + GenerateCrashReport(OtherPid(), &dumpId); + } else if (mCrashReporter) { + dumpId = mCrashReporter->MinidumpID(); + } + // Notify the Telemetry environment so that we can refresh and do a // subsession split. This also notifies the crash reporter on geckoview. if (nsCOMPtr obsvc = services::GetObserverService()) { diff --git a/gfx/ipc/GPUChild.h b/gfx/ipc/GPUChild.h index 4756370b8e8b..79d35ff8284c 100644 --- a/gfx/ipc/GPUChild.h +++ b/gfx/ipc/GPUChild.h @@ -43,6 +43,14 @@ class GPUChild final : public ipc::CrashReporterHelper, // abnormal. void OnUnexpectedShutdown(); + // Generates a minidump for the GPU process paired with one for the main + // process. Called prior to force-killing the process. + void GeneratePairedMinidump(); + + // Deletes a minidump created with GeneratePairedMinidump(). Should be called + // if killing the process fails after generating the minidump. + void DeletePairedMinidump(); + // gfxVarReceiver overrides. void OnVarChanged(const GfxVarUpdate& aVar) override; @@ -102,6 +110,13 @@ class GPUChild final : public ipc::CrashReporterHelper, bool mGPUReady; bool mWaitForVarUpdate = false; bool mUnexpectedShutdown = false; + // Whether a paired minidump has already been generated, meaning we do not + // need to create a crash report in ActorDestroy(). + bool mCreatedPairedMinidumps = false; + // The number of paired minidumps that have been created during this session. + // Used to ensure we do not accumulate a large number of minidumps on disk + // that may never be submitted. + int mNumPairedMinidumpsCreated = 0; }; } // namespace gfx diff --git a/gfx/ipc/GPUProcessHost.cpp b/gfx/ipc/GPUProcessHost.cpp index 77cfcf300f89..75f68f04740c 100644 --- a/gfx/ipc/GPUProcessHost.cpp +++ b/gfx/ipc/GPUProcessHost.cpp @@ -181,7 +181,7 @@ void GPUProcessHost::Shutdown(bool aUnexpectedShutdown) { #ifndef NS_FREE_PERMANENT_DATA // No need to communicate shutdown, the GPU process doesn't need to // communicate anything back. - KillHard("NormalShutdown"); + KillHard(/* aGenerateMinidump */ false); #endif // If we're shutting down unexpectedly, we're in the middle of handling an @@ -211,9 +211,16 @@ void GPUProcessHost::OnChannelClosed() { MOZ_ASSERT(!mGPUChild); } -void GPUProcessHost::KillHard(const char* aReason) { - ProcessHandle handle = GetChildProcessHandle(); +void GPUProcessHost::KillHard(bool aGenerateMinidump) { + if (mGPUChild && aGenerateMinidump) { + mGPUChild->GeneratePairedMinidump(); + } + + const ProcessHandle handle = GetChildProcessHandle(); if (!base::KillProcess(handle, base::PROCESS_END_KILLED_BY_USER)) { + if (mGPUChild) { + mGPUChild->DeletePairedMinidump(); + } NS_WARNING("failed to kill subprocess!"); } @@ -222,7 +229,9 @@ void GPUProcessHost::KillHard(const char* aReason) { uint64_t GPUProcessHost::GetProcessToken() const { return mProcessToken; } -void GPUProcessHost::KillProcess() { KillHard("DiagnosticKill"); } +void GPUProcessHost::KillProcess(bool aGenerateMinidump) { + KillHard(aGenerateMinidump); +} void GPUProcessHost::CrashProcess() { mGPUChild->SendCrashProcess(); } diff --git a/gfx/ipc/GPUProcessHost.h b/gfx/ipc/GPUProcessHost.h index 64ca1429d9a0..e811c3c17e1c 100644 --- a/gfx/ipc/GPUProcessHost.h +++ b/gfx/ipc/GPUProcessHost.h @@ -107,8 +107,9 @@ class GPUProcessHost final : public mozilla::ipc::GeckoChildProcessHost { void SetListener(Listener* aListener); - // Kills the GPU process. Used for tests and diagnostics - void KillProcess(); + // Kills the GPU process. Used in normal operation to recover from an error, + // as well as for tests and diagnostics. + void KillProcess(bool aGenerateMinidump); // Causes the GPU process to crash. Used for tests and diagnostics void CrashProcess(); @@ -131,7 +132,7 @@ class GPUProcessHost final : public mozilla::ipc::GeckoChildProcessHost { void OnChannelClosed(); // Kill the remote process, triggering IPC shutdown. - void KillHard(const char* aReason); + void KillHard(bool aGenerateMinidump); void DestroyProcess(); diff --git a/gfx/ipc/GPUProcessManager.cpp b/gfx/ipc/GPUProcessManager.cpp index 9a70a4a60319..21ea6f39c24e 100644 --- a/gfx/ipc/GPUProcessManager.cpp +++ b/gfx/ipc/GPUProcessManager.cpp @@ -715,7 +715,7 @@ bool GPUProcessManager::DisableWebRenderConfig(wr::WebRenderError aError, // hopefully alleviate the situation. if (IsProcessStable(TimeStamp::Now())) { if (mProcess) { - mProcess->KillProcess(); + mProcess->KillProcess(/* aGenerateMinidump */ false); } else { SimulateDeviceReset(); } @@ -1121,12 +1121,12 @@ void GPUProcessManager::CleanShutdown() { mVsyncIOThread = nullptr; } -void GPUProcessManager::KillProcess() { +void GPUProcessManager::KillProcess(bool aGenerateMinidump) { if (!mProcess) { return; } - mProcess->KillProcess(); + mProcess->KillProcess(aGenerateMinidump); } void GPUProcessManager::CrashProcess() { diff --git a/gfx/ipc/GPUProcessManager.h b/gfx/ipc/GPUProcessManager.h index e6f916ec67d9..0c92706b95de 100644 --- a/gfx/ipc/GPUProcessManager.h +++ b/gfx/ipc/GPUProcessManager.h @@ -184,8 +184,9 @@ class GPUProcessManager final : public GPUProcessHost::Listener { // true if the message was sent, false if not. bool NotifyGpuObservers(const char* aTopic); - // Kills the GPU process. Used for tests and diagnostics - void KillProcess(); + // Kills the GPU process. Used in normal operation to recover from an error, + // as well as for tests and diagnostics. + void KillProcess(bool aGenerateMinidump = false); // Causes the GPU process to crash. Used for tests and diagnostics void CrashProcess(); diff --git a/gfx/layers/ipc/CompositorManagerChild.cpp b/gfx/layers/ipc/CompositorManagerChild.cpp index 8d8933937366..242e6d28348f 100644 --- a/gfx/layers/ipc/CompositorManagerChild.cpp +++ b/gfx/layers/ipc/CompositorManagerChild.cpp @@ -249,7 +249,7 @@ bool CompositorManagerChild::ShouldContinueFromReplyTimeout() { if (XRE_IsParentProcess()) { gfxCriticalNote << "Killing GPU process due to IPC reply timeout"; MOZ_DIAGNOSTIC_ASSERT(GPUProcessManager::Get()->GetGPUChild()); - GPUProcessManager::Get()->KillProcess(); + GPUProcessManager::Get()->KillProcess(/* aGenerateMinidump */ true); } return false; } diff --git a/ipc/glue/CrashReporterHost.h b/ipc/glue/CrashReporterHost.h index df0af17d435a..ed6fcb801f68 100644 --- a/ipc/glue/CrashReporterHost.h +++ b/ipc/glue/CrashReporterHost.h @@ -12,7 +12,9 @@ #include "mozilla/UniquePtr.h" #include "base/process.h" #include "nsExceptionHandler.h" +#include "nsIFile.h" #include "nsThreadUtils.h" +#include "mozilla/ipc/GeckoChildProcessHost.h" #include "mozilla/ipc/ProtocolUtils.h" namespace mozilla { @@ -55,8 +57,7 @@ class CrashReporterHost { // GenerateCrashReport does. After this, FinalizeCrashReport may be called. // // This calls TakeCrashedChildMinidump and FinalizeCrashReport. - template - bool GenerateMinidumpAndPair(Toplevel* aToplevelProtocol, + bool GenerateMinidumpAndPair(GeckoChildProcessHost* aChildProcessHost, const nsACString& aPairName) { auto childHandle = base::kInvalidProcessHandle; const auto cleanup = MakeScopeExit([&]() { @@ -65,10 +66,10 @@ class CrashReporterHost { } }); #ifdef XP_MACOSX - childHandle = aToplevelProtocol->Process()->GetChildTask(); + childHandle = aChildProcessHost->GetChildTask(); #else - if (!base::OpenPrivilegedProcessHandle(aToplevelProtocol->OtherPid(), - &childHandle)) { + if (!base::OpenPrivilegedProcessHandle( + aChildProcessHost->GetChildProcessId(), &childHandle)) { NS_WARNING("Failed to open child process handle."); return false; }