Skip to content

Commit 85af1fd

Browse files
authored
[Infrastructure] Adds support for capturing process dumps for hanging builds on Windows (dotnet#13912)
* Downloads and installs ProcDump as part of the build. * Registers a scheduled job that wakes up after 160 minutes. * Upon waking up, the background job lists all the running processes that match the candidate process names it received. * For each of those processes it captures a full memory dump. * At the end of the build, a separate step checks whether the job ran (the build hung) or the build completed normally, and displays statistics. * If dumps are found, they are collected and made available as artifacts under Windows(_Templates)_Tests_Logs.
1 parent 8b7f662 commit 85af1fd

4 files changed

+256
-0
lines changed

.azure/pipelines/jobs/default-build.yml

+11
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,11 @@ jobs:
126126
steps:
127127
- checkout: self
128128
clean: true
129+
- ${{ if and(eq(parameters.agentOs, 'Windows'), eq(parameters.isTestingJob, true)) }}:
130+
- powershell: ./eng/scripts/InstallProcDump.ps1
131+
displayName: Install ProcDump
132+
- powershell: ./eng/scripts/StartDumpCollectionForHangingBuilds.ps1 $(ProcDumpPath)procdump.exe artifacts/log/ (Get-Date).AddMinutes(160) dotnet
133+
displayName: Start background dump collection
129134
- ${{ if eq(parameters.installNodeJs, 'true') }}:
130135
- task: NodeTool@0
131136
displayName: Install Node 10.x
@@ -165,6 +170,12 @@ jobs:
165170

166171
- ${{ parameters.afterBuild }}
167172

173+
- ${{ if and(eq(parameters.agentOs, 'Windows'), eq(parameters.isTestingJob, true)) }}:
174+
- powershell: ./eng/scripts/FinishDumpCollectionForHangingBuilds.ps1 artifacts/log/
175+
displayName: Finish background dump collection
176+
continueOnError: true
177+
condition: always()
178+
168179
- ${{ if eq(parameters.agentOs, 'Windows') }}:
169180
- powershell: eng\scripts\KillProcesses.ps1
170181
displayName: Kill processes
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
<#
.SYNOPSIS
    Reports on and cleans up the background dump collection job for hanging builds.
.DESCRIPTION
    Reads the job name from 'dump-sentinel.txt' in the dump output folder, lists any
    ProcDump processes that are still running and any captured *.dmp files, prints the
    output of the dump collection job if it ran, and finally removes the job and
    unregisters the scheduled job so that it cannot trigger after the build is over.
.PARAMETER ProcDumpOutputPath
    Folder, relative to the repository root, that contains the sentinel file and the dumps.
#>
param(
    [Parameter(Mandatory = $true)]
    [ValidateNotNullOrEmpty()]
    [string]
    $ProcDumpOutputPath
)

Write-Output "Finishing dump collection for hanging builds.";

$repoRoot = Resolve-Path "$PSScriptRoot\..\..";
$ProcDumpOutputPath = Join-Path $repoRoot $ProcDumpOutputPath;

$sentinelFile = Join-Path $ProcDumpOutputPath "dump-sentinel.txt";
if (-not (Test-Path $sentinelFile)) {
    # The message is concatenated inside parentheses: without them '+' is parsed as a
    # plain argument of Write-Output and the text is emitted as separate fragments.
    Write-Output ("No sentinel file available in '$sentinelFile'. " +
        "StartDumpCollectionForHangingBuilds.ps1 has not been executed, is not correctly configured or failed before creating the sentinel file.");
    return;
}

Get-Process "procdump" -ErrorAction SilentlyContinue | ForEach-Object { Write-Output "ProcDump with PID $($_.Id) is still running."; };

$capturedDumps = Get-ChildItem $ProcDumpOutputPath -Filter *.dmp;
$capturedDumps | ForEach-Object { Write-Output "Found captured dump $_"; };

# The sentinel file is expected to contain exactly one line: the name of the job.
$jobNames = @(Get-Content $sentinelFile);

if ($jobNames.Count -eq 0) {
    Write-Warning "No job name found. This is likely an error.";
    return;
}

if ($jobNames.Count -gt 1) {
    Write-Output "Multiple job names found '$jobNames'.";
    return;
}

$JobName = $jobNames[0];

# Get-ScheduledJob is called first on purpose: using a ScheduledJob cmdlet imports the
# PSScheduledJob module, which Get-Job needs in order to see scheduled job instances.
$registeredJob = Get-ScheduledJob -Name $JobName -ErrorAction SilentlyContinue;
$dumpCollectionJob = Get-Job -Name $JobName -ErrorAction SilentlyContinue;

if ($null -eq $dumpCollectionJob) {
    Write-Output "No job found for '$JobName'. It either didn't run or there is an issue with the job definition.";

    if ($null -eq $registeredJob) {
        Write-Warning "Couldn't find a scheduled job '$JobName'.";
    }
    else {
        # The build finished before the job triggered. Unregister the scheduled job so
        # that it does not wake up later, after this build is gone.
        try {
            Write-Output "Unregistering scheduled job";
            Unregister-ScheduledJob $registeredJob -ErrorAction Stop;
        }
        catch {
            Write-Output "Failed to unregister $JobName";
        }
    }
    return;
}

Write-Output "Listing existing jobs";
Get-Job -Name CaptureDumps*

Write-Output "Listing existing scheduled jobs";
Get-ScheduledJob -Name CaptureDumps*

# Wait before reading the output; otherwise anything the job writes after Receive-Job
# has been called would never be displayed.
Write-Output "Waiting for current job to finish";
Get-Job -ErrorAction SilentlyContinue | Wait-Job;

Write-Output "Displaying job output";
Receive-Job $dumpCollectionJob;

# -ErrorAction Stop makes failures terminating so the catch blocks below run regardless
# of the caller's $ErrorActionPreference.
try {
    Write-Output "Removing collection job";
    Remove-Job $dumpCollectionJob -ErrorAction Stop;
}
catch {
    Write-Output "Failed to remove collection job";
}

try {
    Write-Output "Unregistering scheduled job";
    Unregister-ScheduledJob $registeredJob -ErrorAction Stop;
}
catch {
    Write-Output "Failed to unregister $JobName";
}

eng/scripts/InstallProcDump.ps1

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
<#
.SYNOPSIS
    Installs ProcDump into a folder in this repo.
.DESCRIPTION
    This script downloads ProcDump and extracts it to '.tools\ProcDump\' under the
    repository root. When running in a pipeline (TF_BUILD is set) it also publishes the
    install folder through the 'ProcDumpPath' variable and prepends it to the PATH.
.PARAMETER Force
    Overwrite the existing installation
#>
param(
    [switch]$Force
)
$ErrorActionPreference = 'Stop'
$ProgressPreference = 'SilentlyContinue' # Workaround PowerShell/PowerShell#2138

Set-StrictMode -Version 1

$repoRoot = Resolve-Path "$PSScriptRoot\..\.."
$installDir = "$repoRoot\.tools\ProcDump\"
$tempDir = "$repoRoot\obj"

# Decide whether a download is needed. An existing installation is reused unless -Force
# is given; the pipeline variables below are published in both cases.
$needsInstall = $true
if (Test-Path $installDir) {
    if ($Force) {
        Remove-Item -Force -Recurse $installDir
    }
    else {
        Write-Host "ProcDump already installed to $installDir. Exiting without action. Call this script again with -Force to overwrite."
        $needsInstall = $false
    }
}

if ($needsInstall) {
    # Start from an empty temporary folder, then download and extract the archive there
    # before moving its contents to the install folder.
    Remove-Item -Force -Recurse $tempDir -ErrorAction Ignore | Out-Null
    New-Item -ItemType Directory -Path $tempDir -ErrorAction Ignore | Out-Null
    New-Item -ItemType Directory -Path $installDir -ErrorAction Ignore | Out-Null
    Write-Host "Starting ProcDump download"
    Invoke-WebRequest -UseBasicParsing -Uri "https://download.sysinternals.com/files/Procdump.zip" -OutFile "$tempDir/ProcDump.zip"
    Write-Host "Done downloading ProcDump"
    Expand-Archive "$tempDir/ProcDump.zip" -DestinationPath "$tempDir/ProcDump/"
    Write-Host "Expanded ProcDump to $tempDir"
    Write-Host "Installing ProcDump to $installDir"
    Move-Item "$tempDir/ProcDump/*" $installDir
    Write-Host "Done installing ProcDump to $installDir"
}

# Published even when the installation was already present, so that later pipeline steps
# which use $(ProcDumpPath) always receive a value.
if ($env:TF_BUILD) {
    Write-Host "##vso[task.setvariable variable=ProcDumpPath]$installDir"
    Write-Host "##vso[task.prependpath]$installDir"
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
<#
.SYNOPSIS
    Registers a scheduled job that captures process dumps when a build hangs.
.DESCRIPTION
    Cleans up dump jobs left over from previous runs, writes the name of the new job to
    'dump-sentinel.txt' in the dump output folder and registers a scheduled job that
    triggers once at WakeTime. When it triggers, the job runs ProcDump on every running
    process whose name matches one of the candidate process names.
.PARAMETER ProcDumpPath
    Path to the ProcDump executable.
.PARAMETER ProcDumpOutputPath
    Folder, relative to the repository root, where the sentinel file and dumps are written.
.PARAMETER WakeTime
    Time at which the scheduled job triggers.
.PARAMETER CandidateProcessNames
    Names of the processes to capture dumps for.
#>
param(
    [Parameter(Mandatory = $true)]
    [ValidateNotNullOrEmpty()]
    [string]
    $ProcDumpPath,
    [Parameter(Mandatory = $true)]
    [ValidateNotNullOrEmpty()]
    [string]
    $ProcDumpOutputPath,
    [Parameter(Mandatory = $true)]
    [datetime]
    $WakeTime,
    [Parameter(Mandatory = $true)]
    [ValidateNotNullOrEmpty()]
    [string []]
    $CandidateProcessNames
)

Write-Output "Setting up a scheduled job to capture process dumps.";

if (-not (Test-Path $ProcDumpPath)) {
    Write-Warning "Can't find ProcDump at '$ProcDumpPath'.";
}
else {
    Write-Output "Using ProcDump from '$ProcDumpPath'.";
}

try {
    # Get-ScheduledJob is called first on purpose: using a ScheduledJob cmdlet imports
    # the PSScheduledJob module, which Get-Job needs to see scheduled job instances.
    $previousScheduledJobs = Get-ScheduledJob CaptureDumps* -ErrorAction SilentlyContinue;
    $previousJobs = Get-Job -Name CaptureDumps* -ErrorAction SilentlyContinue;

    if ($previousJobs.Count -ne 0) {
        Write-Output "Found existing dump jobs.";
    }

    if ($previousScheduledJobs.Count -ne 0) {
        Write-Output "Found existing dump jobs.";
    }

    $previousJobs | Stop-Job -PassThru | Remove-Job;
    $previousScheduledJobs | Unregister-ScheduledJob;
}
catch {
    Write-Output "There was an error cleaning up previous jobs.";
    Write-Output $_.Exception.Message;
}

$repoRoot = Resolve-Path "$PSScriptRoot\..\..";
$ProcDumpOutputPath = Join-Path $repoRoot $ProcDumpOutputPath;

Write-Output "Dumps will be placed at '$ProcDumpOutputPath'.";
Write-Output "Watching processes $($CandidateProcessNames -join ', ')";

# This script registers a scheduled job. This scheduled job executes at $WakeTime.
# When the scheduled job executes, it runs procdump on all alive processes whose name matches $CandidateProcessNames.
# The dumps are placed in $ProcDumpOutputPath.
# If the build completes successfully before $WakeTime, a final step unregisters the job.

# Create a unique identifier for the job name
$JobName = "CaptureDumps" + (New-Guid).ToString("N");

# Ensure that the dumps output path exists.
if (-not (Test-Path $ProcDumpOutputPath)) {
    New-Item -ItemType Directory $ProcDumpOutputPath | Out-Null;
}

# We write a sentinel file that we use at the end of the build to
# find the job we started and to determine the results from the scheduled
# job (whether it ran or not, and to display the outputs from the job).
$sentinelFile = Join-Path $ProcDumpOutputPath "dump-sentinel.txt";
Out-File -FilePath $sentinelFile -InputObject $JobName | Out-Null;

# Body of the scheduled job. It receives everything it needs as arguments because it
# does not share variables with this script.
[scriptblock] $ScriptCode = {
    param(
        $ProcDumpPath,
        $ProcDumpOutputPath,
        $CandidateProcessNames)

    Write-Output "Waking up to capture process dumps. Determining hanging processes.";

    [System.Diagnostics.Process []]$AliveProcesses = @();
    foreach ($candidate in $CandidateProcessNames) {
        try {
            # -ErrorAction Stop turns "process not found" into a terminating error;
            # without it the catch block below would never run.
            $candidateProcesses = Get-Process $candidate -ErrorAction Stop;
            $candidateProcesses | ForEach-Object { Write-Output "Found candidate process $candidate with PID '$($_.Id)'." };
            $AliveProcesses += $candidateProcesses;
        }
        catch {
            Write-Output "No process found for $candidate";
        }
    }

    Write-Output "Starting process dump capture.";

    # NOTE(review): PROCESSNAME, PID, YYMMDD and HHMMSS are expected to be substituted by
    # ProcDump itself when it writes the file - confirm against the ProcDump documentation.
    $dumpFullPath = [System.IO.Path]::Combine($ProcDumpOutputPath, "hung_PROCESSNAME_PID_YYMMDD_HHMMSS.dmp");

    Write-Output "Capturing output for $($AliveProcesses.Length) processes.";

    foreach ($process in $AliveProcesses) {

        # Start-Process joins the argument list with spaces, so the dump path is wrapped
        # in quotes to keep it a single argument when it contains spaces.
        $procDumpArgs = @("-accepteula", "-ma", $process.Id, "`"$dumpFullPath`"");
        try {
            Write-Output "Capturing dump for dump for '$($process.Name)' with PID '$($process.Id)'.";
            Start-Process -FilePath $ProcDumpPath -ArgumentList $procDumpArgs -NoNewWindow -Wait;
        }
        catch {
            Write-Output "There was an error capturing a process dump for '$($process.Name)' with PID '$($process.Id)'."
            Write-Warning $_.Exception.Message;
        }
    }

    Write-Output "Done capturing process dumps.";
}

$ScriptTrigger = New-JobTrigger -Once -At $WakeTime;

try {
    # -ErrorAction Stop ensures a registration failure reaches the catch block regardless
    # of the caller's $ErrorActionPreference.
    Register-ScheduledJob -Name $JobName -ScriptBlock $ScriptCode -Trigger $ScriptTrigger -ArgumentList $ProcDumpPath, $ProcDumpOutputPath, $CandidateProcessNames -ErrorAction Stop;
}
catch {
    Write-Warning "Failed to register scheduled job '$JobName'. Dumps will not be captured for build hangs.";
    Write-Warning $_.Exception.Message;
}

0 commit comments

Comments
 (0)