Skip to content

Commit

Permalink
DockerLogin retries (#4100)
Browse files Browse the repository at this point in the history
* DockerLogin retries
- added changes from PR #3840
- fixed incorrect variables

* DockerLogin retries
- added step UseDotNet@2

* DockerLogin retries
- added linux to UseDotNet task

* Docker Login retries
Implement retries in the DockerLogin method

* Implement feature flag for docker login retries

* revert build-job back

* Added retry for docker start

* fix delay

* update knob description

* Update agent Docker methods
- reworked Docker methods with retries behavior
- replaced InvokeWithRetryIfNonZero with a method that encapsulates the retry logic

* Renamed local variable to match the name used in ADO

* Added docker retry behavior to docker version method

Co-authored-by: Your Name <you@example.com>
Co-authored-by: Kirill Ivlev <102740624+kirill-ivlev@users.noreply.github.com>
Co-authored-by: Merlyn Oppenheim <merlynop@microsoft.com>
  • Loading branch information
4 people committed Nov 8, 2023
1 parent 48c295b commit 7989427
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 31 deletions.
9 changes: 9 additions & 0 deletions src/Agent.Sdk/Knob/AgentKnobs.cs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ public class AgentKnobs
new EnvironmentKnobSource("AZP_AGENT_USE_HOST_GROUP_ID"),
new BuiltInDefaultKnobSource("true"));

// Name shared by the runtime-variable source and the environment-variable source
// of the DockerActionRetries knob declared below.
public const string DockerActionRetriesVariableName = "VSTSAGENT_DOCKER_ACTION_RETRIES";

// Feature flag for retrying Docker commands. The built-in default is "false", so
// retries are off unless one of the other two sources supplies a value.
// NOTE(review): presumably the sources are consulted in the order listed
// (runtime, then environment, then built-in default) — confirm against Knob.
public static readonly Knob DockerActionRetries = new Knob(
nameof(DockerActionRetries),
"When enabled, the agent retries docker steps if failed",
new RuntimeKnobSource(DockerActionRetriesVariableName),
new EnvironmentKnobSource(DockerActionRetriesVariableName),
new BuiltInDefaultKnobSource("false"));

// Directory structure
public static readonly Knob AgentToolsDirectory = new Knob(
nameof(AgentToolsDirectory),
Expand Down
89 changes: 81 additions & 8 deletions src/Agent.Worker/Container/DockerCommandManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ public override void Initialize(IHostContext hostContext)
public async Task<DockerVersion> DockerVersion(IExecutionContext context)
{
ArgUtil.NotNull(context, nameof(context));
string serverVersionStr = (await ExecuteDockerCommandAsync(context, "version", "--format '{{.Server.APIVersion}}'")).FirstOrDefault();
var action = new Func<Task<List<string>>>(async () => await ExecuteDockerCommandAsync(context, "version", "--format '{{.Server.APIVersion}}'"));
const string command = "Docker version";
string serverVersionStr = (await ExecuteDockerCommandAsyncWithRetries(context, action, command)).FirstOrDefault();
ArgUtil.NotNullOrEmpty(serverVersionStr, "Docker.Server.Version");
context.Output($"Docker daemon API version: {serverVersionStr}");

Expand Down Expand Up @@ -98,12 +100,14 @@ public async Task<int> DockerLogin(IExecutionContext context, string server, str
ArgUtil.NotNull(username, nameof(username));
ArgUtil.NotNull(password, nameof(password));

if (PlatformUtil.RunningOnWindows)
{
var action = new Func<Task<int>>(async () => PlatformUtil.RunningOnWindows
// Wait for 17.07 to switch using stdin for docker registry password.
return await ExecuteDockerCommandAsync(context, "login", $"--username \"{username}\" --password \"{password.Replace("\"", "\\\"")}\" {server}", new List<string>() { password }, context.CancellationToken);
}
return await ExecuteDockerCommandAsync(context, "login", $"--username \"{username}\" --password-stdin {server}", new List<string>() { password }, context.CancellationToken);
? await ExecuteDockerCommandAsync(context, "login", $"--username \"{username}\" --password \"{password.Replace("\"", "\\\"")}\" {server}", new List<string>() { password }, context.CancellationToken)
: await ExecuteDockerCommandAsync(context, "login", $"--username \"{username}\" --password-stdin {server}", new List<string>() { password }, context.CancellationToken)
);

const string command = "Docker login";
return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
}

public async Task<int> DockerLogout(IExecutionContext context, string server)
Expand All @@ -119,7 +123,9 @@ public async Task<int> DockerPull(IExecutionContext context, string image)
ArgUtil.NotNull(context, nameof(context));
ArgUtil.NotNull(image, nameof(image));

return await ExecuteDockerCommandAsync(context, "pull", image, context.CancellationToken);
var action = new Func<Task<int>>(async () => await ExecuteDockerCommandAsync(context, "pull", image, context.CancellationToken));
const string command = "Docker pull";
return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
}

public async Task<string> DockerCreate(IExecutionContext context, ContainerInfo container)
Expand Down Expand Up @@ -194,7 +200,9 @@ public async Task<int> DockerStart(IExecutionContext context, string containerId
ArgUtil.NotNull(context, nameof(context));
ArgUtil.NotNull(containerId, nameof(containerId));

return await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken);
var action = new Func<Task<int>>(async () => await ExecuteDockerCommandAsync(context, "start", containerId, context.CancellationToken));
const string command = "Docker start";
return await ExecuteDockerCommandAsyncWithRetries(context, action, command);
}

public async Task<int> DockerRemove(IExecutionContext context, string containerId)
Expand Down Expand Up @@ -443,5 +451,70 @@ await processInvoker.ExecuteAsync(

return output;
}

/// <summary>
/// Runs a Docker command that reports its result as an exit code and, when the
/// <c>DockerActionRetries</c> knob is enabled, repeats it on a non-zero exit code.
/// </summary>
/// <param name="context">Execution context used to read the knob, write output/warnings and observe cancellation.</param>
/// <param name="action">Delegate that executes the Docker command and returns its exit code.</param>
/// <param name="command">Human-readable command name used in the warning message.</param>
/// <returns>The exit code of the last attempt (0 as soon as an attempt succeeds).</returns>
private static async Task<int> ExecuteDockerCommandAsyncWithRetries(IExecutionContext context, Func<Task<int>> action, string command)
{
    bool dockerActionRetries = AgentKnobs.DockerActionRetries.GetValue(context).AsBoolean();
    context.Output($"DockerActionRetries variable value: {dockerActionRetries}");

    // Total number of attempts (not additional retries), matching the original loop bound.
    const int maxRetries = 3;
    TimeSpan backOff = TimeSpan.FromSeconds(10);
    int exitCode = 0;

    for (int attempt = 1; attempt <= maxRetries; attempt++)
    {
        exitCode = await action();

        // Stop on success, when retries are disabled, or when no attempt is left.
        // Checking the last attempt here avoids a misleading "before retry" warning
        // and a pointless back-off delay right before returning the failure.
        if (exitCode == 0 || !dockerActionRetries || attempt == maxRetries)
        {
            break;
        }

        // TotalSeconds prints "10" rather than the TimeSpan's "00:00:10".
        context.Warning($"{command} failed with exit code {exitCode}, back off {backOff.TotalSeconds} seconds before retry.");

        // Honor job cancellation while waiting instead of always sleeping the full back-off.
        await Task.Delay(backOff, context.CancellationToken);
    }

    return exitCode;
}

/// <summary>
/// Runs a Docker command that returns its output lines and, when the
/// <c>DockerActionRetries</c> knob is enabled, repeats it if it throws
/// <c>ProcessExitCodeException</c> or produces no output.
/// </summary>
/// <param name="context">Execution context used to read the knob, write output/warnings and observe cancellation.</param>
/// <param name="action">Delegate that executes the Docker command and returns its output lines.</param>
/// <param name="command">Human-readable command name used in the warning message.</param>
/// <returns>
/// The output of the first attempt that produced any lines; otherwise the (empty or null)
/// output of the last attempt.
/// </returns>
/// <remarks>
/// A <c>ProcessExitCodeException</c> from the last permitted attempt — or from the only attempt
/// when retries are disabled — propagates to the caller.
/// </remarks>
private static async Task<List<string>> ExecuteDockerCommandAsyncWithRetries(IExecutionContext context, Func<Task<List<string>>> action, string command)
{
    bool dockerActionRetries = AgentKnobs.DockerActionRetries.GetValue(context).AsBoolean();
    context.Output($"DockerActionRetries variable value: {dockerActionRetries}");

    // One initial attempt plus up to maxRetries retries, as in the original loop bound.
    const int maxRetries = 3;
    TimeSpan backOff = TimeSpan.FromSeconds(10);
    List<string> output = new List<string>();

    for (int retryCount = 0; retryCount <= maxRetries; retryCount++)
    {
        // With the knob off the very first attempt is also the last one, so the
        // command is never re-executed unless the feature flag is enabled.
        bool lastAttempt = !dockerActionRetries || retryCount == maxRetries;

        try
        {
            output = await action();

            if (lastAttempt || (output != null && output.Count != 0))
            {
                break;
            }

            // Empty output with retries enabled: fall through and try again
            // immediately (no back-off, as before).
        }
        catch (ProcessExitCodeException) when (!lastAttempt)
        {
            // TotalSeconds prints "10" rather than the TimeSpan's "00:00:10".
            context.Warning($"{command} failed, back off {backOff.TotalSeconds} seconds before retry.");

            // Honor job cancellation while waiting instead of always sleeping the full back-off.
            await Task.Delay(backOff, context.CancellationToken);
        }
    }

    return output;
}
}
}
33 changes: 10 additions & 23 deletions src/Agent.Worker/ContainerOperationProvider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,12 @@ private async Task PullContainerAsync(IExecutionContext executionContext, Contai
ArgUtil.NotNullOrEmpty(username, nameof(username));
ArgUtil.NotNullOrEmpty(password, nameof(password));

int loginExitCode = await _dockerManger.DockerLogin(executionContext, registryServer, username, password);
int loginExitCode = await _dockerManger.DockerLogin(
executionContext,
registryServer,
username,
password);

if (loginExitCode != 0)
{
throw new InvalidOperationException($"Docker login fail with exit code {loginExitCode}");
Expand All @@ -338,29 +343,11 @@ private async Task PullContainerAsync(IExecutionContext executionContext, Contai
}
}

// Pull down docker image with retry up to 3 times
int retryCount = 0;
int pullExitCode = 0;
while (retryCount < 3)
{
pullExitCode = await _dockerManger.DockerPull(executionContext, container.ContainerImage);
if (pullExitCode == 0)
{
break;
}
else
{
retryCount++;
if (retryCount < 3)
{
var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10));
executionContext.Warning($"Docker pull failed with exit code {pullExitCode}, back off {backOff.TotalSeconds} seconds before retry.");
await Task.Delay(backOff);
}
}
}
int pullExitCode = await _dockerManger.DockerPull(
executionContext,
container.ContainerImage);

if (retryCount == 3 && pullExitCode != 0)
if (pullExitCode != 0)
{
throw new InvalidOperationException($"Docker pull failed with exit code {pullExitCode}");
}
Expand Down

0 comments on commit 7989427

Please sign in to comment.