From ae991ab175f4e4c1de1f9b386ba39f536baa7573 Mon Sep 17 00:00:00 2001 From: Jamie Nicol Date: Tue, 30 Apr 2024 15:06:21 +0000 Subject: [PATCH] Bug 1880503 - Generate paired minidump when GPU process is killed following IPC timeout. r=aosmond,gsvelto When sync IPC under the top-level PCompositorManager protocol does not reply within a certain time threshold we purposefully kill the GPU process. While this allows the user to recover from a stuck GPU process, we have little visibility about the underlying cause. This patch makes it so that we generate a paired minidump for the GPU and parent processes prior to killing the GPU process in GPUProcessHost::KillHard(). The implementation roughly follows the equivalent for content processes in ContentParent::KillHard(). As the GPU process can be purposefully killed during normal operation, and because generating minidumps can be expensive, we are careful to only do so when the new argument aGenerateMinidump is true. We additionally remove the aReason argument as it is unused (and currently innacurate in some places). As these minidumps may not automatically submitted we limit the minidumps generation to twice per session in order to avoid accumulating a large number of unsubmitted minidumps on disk. Differential Revision: https://phabricator.services.mozilla.com/D202166 --- gfx/ipc/GPUChild.cpp | 37 +++++++++++++++++++++-- gfx/ipc/GPUChild.h | 15 +++++++++ gfx/ipc/GPUProcessHost.cpp | 17 ++++++++--- gfx/ipc/GPUProcessHost.h | 7 +++-- gfx/ipc/GPUProcessManager.cpp | 6 ++-- gfx/ipc/GPUProcessManager.h | 5 +-- gfx/layers/ipc/CompositorManagerChild.cpp | 2 +- 7 files changed, 73 insertions(+), 16 deletions(-) diff --git a/gfx/ipc/GPUChild.cpp b/gfx/ipc/GPUChild.cpp index 2a9fe2362806..2207072b2a13 100644 --- a/gfx/ipc/GPUChild.cpp +++ b/gfx/ipc/GPUChild.cpp @@ -111,6 +111,33 @@ bool GPUChild::EnsureGPUReady() { void GPUChild::OnUnexpectedShutdown() { mUnexpectedShutdown = true; } +void GPUChild::GeneratePairedMinidump() { + // At most generate the paired minidumps twice per session in order to + // avoid accumulating a large number of unsubmitted minidumps on disk. + if (mCrashReporter && mNumPairedMinidumpsCreated < 2) { + nsAutoCString additionalDumps("browser"); + mCrashReporter->AddAnnotationNSCString( + CrashReporter::Annotation::additional_minidumps, additionalDumps); + + nsAutoCString reason("GPUProcessKill"); + mCrashReporter->AddAnnotationNSCString( + CrashReporter::Annotation::ipc_channel_error, reason); + + if (mCrashReporter->GenerateMinidumpAndPair(this, "browser"_ns)) { + mCrashReporter->FinalizeCrashReport(); + mCreatedPairedMinidumps = true; + mNumPairedMinidumpsCreated++; + } + } +} + +void GPUChild::DeletePairedMinidump() { + if (mCrashReporter && mCreatedPairedMinidumps) { + mCrashReporter->DeleteCrashReport(); + mCreatedPairedMinidumps = false; + } +} + mozilla::ipc::IPCResult GPUChild::RecvInitComplete(const GPUDeviceData& aData) { // We synchronously requested GPU parameters before this arrived. if (mGPUReady) { @@ -291,14 +318,18 @@ mozilla::ipc::IPCResult GPUChild::RecvAddMemoryReport( void GPUChild::ActorDestroy(ActorDestroyReason aWhy) { if (aWhy == AbnormalShutdown || mUnexpectedShutdown) { - nsAutoString dumpId; - GenerateCrashReport(OtherPid(), &dumpId); - Telemetry::Accumulate( Telemetry::SUBPROCESS_ABNORMAL_ABORT, nsDependentCString(XRE_GeckoProcessTypeToString(GeckoProcessType_GPU)), 1); + nsAutoString dumpId; + if (!mCreatedPairedMinidumps) { + GenerateCrashReport(OtherPid(), &dumpId); + } else if (mCrashReporter) { + dumpId = mCrashReporter->MinidumpID(); + } + // Notify the Telemetry environment so that we can refresh and do a // subsession split. This also notifies the crash reporter on geckoview. if (nsCOMPtr obsvc = services::GetObserverService()) { diff --git a/gfx/ipc/GPUChild.h b/gfx/ipc/GPUChild.h index 6c2765fcc71f..49e58d204926 100644 --- a/gfx/ipc/GPUChild.h +++ b/gfx/ipc/GPUChild.h @@ -43,6 +43,14 @@ class GPUChild final : public ipc::CrashReporterHelper, // abnormal. void OnUnexpectedShutdown(); + // Generates a minidump for the GPU process paired with one for the main + // process. Called prior to force-killing the process. + void GeneratePairedMinidump(); + + // Deletes a minidump created with GeneratePairedMinidump(). Should be called + // if killing the process fails after generating the minidump. + void DeletePairedMinidump(); + // gfxVarReceiver overrides. void OnVarChanged(const GfxVarUpdate& aVar) override; @@ -100,6 +108,13 @@ class GPUChild final : public ipc::CrashReporterHelper, bool mGPUReady; bool mWaitForVarUpdate = false; bool mUnexpectedShutdown = false; + // Whether a paired minidump has already been generated, meaning we do not + // need to create a crash report in ActorDestroy(). + bool mCreatedPairedMinidumps = false; + // The number of paired minidumps that have been created during this session. + // Used to ensure we do not accumulate a large number of minidumps on disk + // that may never be submitted. + int mNumPairedMinidumpsCreated = 0; }; } // namespace gfx diff --git a/gfx/ipc/GPUProcessHost.cpp b/gfx/ipc/GPUProcessHost.cpp index 77cfcf300f89..75f68f04740c 100644 --- a/gfx/ipc/GPUProcessHost.cpp +++ b/gfx/ipc/GPUProcessHost.cpp @@ -181,7 +181,7 @@ void GPUProcessHost::Shutdown(bool aUnexpectedShutdown) { #ifndef NS_FREE_PERMANENT_DATA // No need to communicate shutdown, the GPU process doesn't need to // communicate anything back. - KillHard("NormalShutdown"); + KillHard(/* aGenerateMinidump */ false); #endif // If we're shutting down unexpectedly, we're in the middle of handling an @@ -211,9 +211,16 @@ void GPUProcessHost::OnChannelClosed() { MOZ_ASSERT(!mGPUChild); } -void GPUProcessHost::KillHard(const char* aReason) { - ProcessHandle handle = GetChildProcessHandle(); +void GPUProcessHost::KillHard(bool aGenerateMinidump) { + if (mGPUChild && aGenerateMinidump) { + mGPUChild->GeneratePairedMinidump(); + } + + const ProcessHandle handle = GetChildProcessHandle(); if (!base::KillProcess(handle, base::PROCESS_END_KILLED_BY_USER)) { + if (mGPUChild) { + mGPUChild->DeletePairedMinidump(); + } NS_WARNING("failed to kill subprocess!"); } @@ -222,7 +229,9 @@ void GPUProcessHost::KillHard(const char* aReason) { uint64_t GPUProcessHost::GetProcessToken() const { return mProcessToken; } -void GPUProcessHost::KillProcess() { KillHard("DiagnosticKill"); } +void GPUProcessHost::KillProcess(bool aGenerateMinidump) { + KillHard(aGenerateMinidump); +} void GPUProcessHost::CrashProcess() { mGPUChild->SendCrashProcess(); } diff --git a/gfx/ipc/GPUProcessHost.h b/gfx/ipc/GPUProcessHost.h index 4f84d1e296f3..74084e72d26e 100644 --- a/gfx/ipc/GPUProcessHost.h +++ b/gfx/ipc/GPUProcessHost.h @@ -104,8 +104,9 @@ class GPUProcessHost final : public mozilla::ipc::GeckoChildProcessHost { void SetListener(Listener* aListener); - // Kills the GPU process. Used for tests and diagnostics - void KillProcess(); + // Kills the GPU process. Used in normal operation to recover from an error, + // as well as for tests and diagnostics. + void KillProcess(bool aGenerateMinidump); // Causes the GPU process to crash. Used for tests and diagnostics void CrashProcess(); @@ -128,7 +129,7 @@ class GPUProcessHost final : public mozilla::ipc::GeckoChildProcessHost { void OnChannelClosed(); // Kill the remote process, triggering IPC shutdown. - void KillHard(const char* aReason); + void KillHard(bool aGenerateMinidump); void DestroyProcess(); diff --git a/gfx/ipc/GPUProcessManager.cpp b/gfx/ipc/GPUProcessManager.cpp index 97f4d4711164..91b19118b52c 100644 --- a/gfx/ipc/GPUProcessManager.cpp +++ b/gfx/ipc/GPUProcessManager.cpp @@ -714,7 +714,7 @@ bool GPUProcessManager::DisableWebRenderConfig(wr::WebRenderError aError, // hopefully alleviate the situation. if (IsProcessStable(TimeStamp::Now())) { if (mProcess) { - mProcess->KillProcess(); + mProcess->KillProcess(/* aGenerateMinidump */ false); } else { SimulateDeviceReset(); } @@ -1070,12 +1070,12 @@ void GPUProcessManager::CleanShutdown() { mVsyncIOThread = nullptr; } -void GPUProcessManager::KillProcess() { +void GPUProcessManager::KillProcess(bool aGenerateMinidump) { if (!mProcess) { return; } - mProcess->KillProcess(); + mProcess->KillProcess(aGenerateMinidump); } void GPUProcessManager::CrashProcess() { diff --git a/gfx/ipc/GPUProcessManager.h b/gfx/ipc/GPUProcessManager.h index 8b58d15b544d..e499cc529b50 100644 --- a/gfx/ipc/GPUProcessManager.h +++ b/gfx/ipc/GPUProcessManager.h @@ -178,8 +178,9 @@ class GPUProcessManager final : public GPUProcessHost::Listener { // true if the message was sent, false if not. bool NotifyGpuObservers(const char* aTopic); - // Kills the GPU process. Used for tests and diagnostics - void KillProcess(); + // Kills the GPU process. Used in normal operation to recover from an error, + // as well as for tests and diagnostics. + void KillProcess(bool aGenerateMinidump = false); // Causes the GPU process to crash. Used for tests and diagnostics void CrashProcess(); diff --git a/gfx/layers/ipc/CompositorManagerChild.cpp b/gfx/layers/ipc/CompositorManagerChild.cpp index 8d8933937366..242e6d28348f 100644 --- a/gfx/layers/ipc/CompositorManagerChild.cpp +++ b/gfx/layers/ipc/CompositorManagerChild.cpp @@ -249,7 +249,7 @@ bool CompositorManagerChild::ShouldContinueFromReplyTimeout() { if (XRE_IsParentProcess()) { gfxCriticalNote << "Killing GPU process due to IPC reply timeout"; MOZ_DIAGNOSTIC_ASSERT(GPUProcessManager::Get()->GetGPUChild()); - GPUProcessManager::Get()->KillProcess(); + GPUProcessManager::Get()->KillProcess(/* aGenerateMinidump */ true); } return false; }