From 7989427f115d9727fab94d60e21518dafd3a3188 Mon Sep 17 00:00:00 2001
From: ivanduplenskikh <115665590+ivanduplenskikh@users.noreply.github.com>
Date: Tue, 24 Jan 2023 11:31:26 +0300
Subject: [PATCH] DockerLogin retries (#4100)

* DockerLogin retries

- added changes from PR microsoft/azure-pipelines-agent#3840
- fixed incorrect variables

* DockerLogin retries

- added step UseDotNet@2

* DockerLogin retries

- added linux to UseDotNet task

* Docker Login retries

Implement retry into DockerLogin method

* Implement feature flag for docker login retries

* revert build-job back

* Added retry for docker start

* fix delay

* update knob description

* Update agent Docker methods

- reworked Docker methods with retries behavior
- changed InvokeWithRetryIfNonZero on to incapsulated logic method

* - renamed local variable as is in ADO

* Added docker retry behavior to docker version method

Co-authored-by: Your Name
Co-authored-by: Kirill Ivlev <102740624+kirill-ivlev@users.noreply.github.com>
Co-authored-by: Merlyn Oppenheim
---
 src/Agent.Sdk/Knob/AgentKnobs.cs              |   9 ++
 .../Container/DockerCommandManager.cs         | 106 ++++++++++++++++--
 .../ContainerOperationProvider.cs             |  33 +++----
 3 files changed, 117 insertions(+), 31 deletions(-)

diff --git a/src/Agent.Sdk/Knob/AgentKnobs.cs b/src/Agent.Sdk/Knob/AgentKnobs.cs
index c276f875b0..ce2c7b8c9b 100644
--- a/src/Agent.Sdk/Knob/AgentKnobs.cs
+++ b/src/Agent.Sdk/Knob/AgentKnobs.cs
@@ -61,6 +61,15 @@ public class AgentKnobs
             new EnvironmentKnobSource("AZP_AGENT_USE_HOST_GROUP_ID"),
             new BuiltInDefaultKnobSource("true"));
 
+        public const string DockerActionRetriesVariableName = "VSTSAGENT_DOCKER_ACTION_RETRIES";
+
+        public static readonly Knob DockerActionRetries = new Knob(
+            nameof(DockerActionRetries),
+            "When enabled, the agent retries docker steps if failed",
+            new RuntimeKnobSource(DockerActionRetriesVariableName),
+            new EnvironmentKnobSource(DockerActionRetriesVariableName),
+            new BuiltInDefaultKnobSource("false"));
+
         // Directory structure
         public static readonly Knob AgentToolsDirectory = new Knob(
             nameof(AgentToolsDirectory),
diff --git a/src/Agent.Worker/Container/DockerCommandManager.cs b/src/Agent.Worker/Container/DockerCommandManager.cs
index fdeee39d51..c68564d57d 100644
--- a/src/Agent.Worker/Container/DockerCommandManager.cs
+++ b/src/Agent.Worker/Container/DockerCommandManager.cs
@@ -57,7 +57,9 @@ public override void Initialize(IHostContext hostContext)
         public async Task<DockerVersion> DockerVersion(IExecutionContext context)
         {
             ArgUtil.NotNull(context, nameof(context));
-            string serverVersionStr = (await ExecuteDockerCommandAsync(context, "version", "--format '{{.Server.APIVersion}}'")).FirstOrDefault();
+            var action = new Func<Task<List<string>>>(async () => await ExecuteDockerCommandAsync(context, "version", "--format '{{.Server.APIVersion}}'"));
+            const string command = "Docker version";
+            string serverVersionStr = (await ExecuteDockerCommandAsyncWithRetries(context, action, command)).FirstOrDefault();
             ArgUtil.NotNullOrEmpty(serverVersionStr, "Docker.Server.Version");
             context.Output($"Docker daemon API version: {serverVersionStr}");
 
@@ -98,12 +100,14 @@ public async Task<int> DockerLogin(IExecutionContext context, string server, str
             ArgUtil.NotNull(username, nameof(username));
             ArgUtil.NotNull(password, nameof(password));
 
-            if (PlatformUtil.RunningOnWindows)
-            {
+            var action = new Func<Task<int>>(async () => PlatformUtil.RunningOnWindows
                 // Wait for 17.07 to switch using stdin for docker registry password.
-                return await ExecuteDockerCommandAsync(context, "login", $"--username \"{username}\" --password \"{password.Replace("\"", "\\\"")}\" {server}", new List<string>() { password }, context.CancellationToken);
-            }
-            return await ExecuteDockerCommandAsync(context, "login", $"--username \"{username}\" --password-stdin {server}", new List<string>() { password }, context.CancellationToken);
+                ? await ExecuteDockerCommandAsync(context, "login", $"--username \"{username}\" --password \"{password.Replace("\"", "\\\"")}\" {server}", new List<string>() { password }, context.CancellationToken)
+                : await ExecuteDockerCommandAsync(context, "login", $"--username \"{username}\" --password-stdin {server}", new List<string>() { password }, context.CancellationToken)
+            );
+
+            const string command = "Docker login";
+            return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
         }
 
         public async Task<int> DockerLogout(IExecutionContext context, string server)
@@ -119,7 +123,9 @@ public async Task<int> DockerPull(IExecutionContext context, string image)
             ArgUtil.NotNull(context, nameof(context));
             ArgUtil.NotNull(image, nameof(image));
 
-            return await ExecuteDockerCommandAsync(context, "pull", image, context.CancellationToken);
+            var action = new Func<Task<int>>(async () => await ExecuteDockerCommandAsync(context, "pull", image, context.CancellationToken));
+            const string command = "Docker pull";
+            return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
         }
 
         public async Task<string> DockerCreate(IExecutionContext context, ContainerInfo container)
@@ -194,7 +200,9 @@ public async Task<int> DockerStart(IExecutionContext context, string containerId
             ArgUtil.NotNull(context, nameof(context));
             ArgUtil.NotNull(containerId, nameof(containerId));
 
-            return await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken);
+            var action = new Func<Task<int>>(async () => await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken));
+            const string command = "Docker start";
+            return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
         }
 
         public async Task<int> DockerRemove(IExecutionContext context, string containerId)
@@ -443,5 +451,87 @@ await processInvoker.ExecuteAsync(
 
             return output;
         }
+
+        // Retry policy shared by both ExecuteDockerCommandAsyncWithRetries overloads.
+        private const int DockerActionMaxAttempts = 3;
+        private static readonly TimeSpan DockerActionRetryDelay = TimeSpan.FromSeconds(10);
+
+        /// <summary>
+        /// Runs a docker command that reports an exit code and, when the DockerActionRetries
+        /// knob is enabled, re-runs it after a non-zero exit code.
+        /// </summary>
+        /// <param name="context">Execution context used for the knob lookup, logging and cancellation.</param>
+        /// <param name="action">Delegate that executes the docker command and returns its exit code.</param>
+        /// <param name="command">Display name of the command, used in the warning message.</param>
+        /// <returns>The exit code of the last attempt.</returns>
+        private static async Task<int> ExecuteDockerCommandAsyncWithRetries(IExecutionContext context, Func<Task<int>> action, string command)
+        {
+            bool dockerActionRetries = AgentKnobs.DockerActionRetries.GetValue(context).AsBoolean();
+            context.Output($"DockerActionRetries variable value: {dockerActionRetries}");
+
+            // With the knob disabled the command runs exactly once.
+            int maxAttempts = dockerActionRetries ? DockerActionMaxAttempts : 1;
+            int exitCode = 0;
+
+            for (int attempt = 1; attempt <= maxAttempts; attempt++)
+            {
+                exitCode = await action();
+
+                // Stop on success, and skip the back-off when no attempt is left.
+                if (exitCode == 0 || attempt == maxAttempts)
+                {
+                    break;
+                }
+
+                context.Warning($"{command} failed with exit code {exitCode}, back off {DockerActionRetryDelay.TotalSeconds} seconds before retry.");
+                await Task.Delay(DockerActionRetryDelay, context.CancellationToken);
+            }
+
+            return exitCode;
+        }
+
+        /// <summary>
+        /// Runs a docker command that returns its output lines and, when the DockerActionRetries
+        /// knob is enabled, re-runs it after a ProcessExitCodeException or an empty result.
+        /// </summary>
+        /// <param name="context">Execution context used for the knob lookup, logging and cancellation.</param>
+        /// <param name="action">Delegate that executes the docker command and returns its output lines.</param>
+        /// <param name="command">Display name of the command, used in the warning message.</param>
+        /// <returns>The output lines of the last attempt.</returns>
+        private static async Task<List<string>> ExecuteDockerCommandAsyncWithRetries(IExecutionContext context, Func<Task<List<string>>> action, string command)
+        {
+            bool dockerActionRetries = AgentKnobs.DockerActionRetries.GetValue(context).AsBoolean();
+            context.Output($"DockerActionRetries variable value: {dockerActionRetries}");
+
+            // With the knob disabled the command runs exactly once.
+            int maxAttempts = dockerActionRetries ? DockerActionMaxAttempts : 1;
+            List<string> output = new List<string>();
+
+            for (int attempt = 1; attempt <= maxAttempts; attempt++)
+            {
+                string failure = "returned no output";
+
+                try
+                {
+                    output = await action();
+
+                    // Non-empty output is success; after the last attempt return whatever came back.
+                    if ((output != null && output.Count != 0) || attempt == maxAttempts)
+                    {
+                        break;
+                    }
+                }
+                catch (ProcessExitCodeException ex) when (attempt < maxAttempts)
+                {
+                    // Caught only while attempts remain; the final failure propagates to the caller.
+                    failure = $"failed ({ex.Message})";
+                }
+
+                context.Warning($"{command} {failure}, back off {DockerActionRetryDelay.TotalSeconds} seconds before retry.");
+                await Task.Delay(DockerActionRetryDelay, context.CancellationToken);
+            }
+
+            return output;
+        }
     }
 }
diff --git a/src/Agent.Worker/ContainerOperationProvider.cs b/src/Agent.Worker/ContainerOperationProvider.cs
index a24ceddcd1..73c0f5c7dd 100644
--- a/src/Agent.Worker/ContainerOperationProvider.cs
+++ b/src/Agent.Worker/ContainerOperationProvider.cs
@@ -317,7 +317,12 @@ private async Task PullContainerAsync(IExecutionContext executionContext, Contai
                     ArgUtil.NotNullOrEmpty(username, nameof(username));
                     ArgUtil.NotNullOrEmpty(password, nameof(password));
 
-                    int loginExitCode = await _dockerManger.DockerLogin(executionContext, registryServer, username, password);
+                    int loginExitCode = await _dockerManger.DockerLogin(
+                        executionContext,
+                        registryServer,
+                        username,
+                        password);
+
                     if (loginExitCode != 0)
                     {
                         throw new InvalidOperationException($"Docker login fail with exit code {loginExitCode}");
@@ -338,29 +343,11 @@ private async Task PullContainerAsync(IExecutionContext executionContext, Contai
                     }
                 }
 
-                // Pull down docker image with retry up to 3 times
-                int retryCount = 0;
-                int pullExitCode = 0;
-                while (retryCount < 3)
-                {
-                    pullExitCode = await _dockerManger.DockerPull(executionContext, container.ContainerImage);
-                    if (pullExitCode == 0)
-                    {
-                        break;
-                    }
-                    else
-                    {
-                        retryCount++;
-                        if (retryCount < 3)
-                        {
-                            var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10));
-                            executionContext.Warning($"Docker pull failed with exit code {pullExitCode}, back off {backOff.TotalSeconds} seconds before retry.");
-                            await Task.Delay(backOff);
-                        }
-                    }
-                }
+                int pullExitCode = await _dockerManger.DockerPull(
+                    executionContext,
+                    container.ContainerImage);
 
-                if (retryCount == 3 && pullExitCode != 0)
+                if (pullExitCode != 0)
                 {
                     throw new InvalidOperationException($"Docker pull failed with exit code {pullExitCode}");
                 }