Skip to content

[Bug Fix] Refactor Linux container management to avoid race condition that leads the host to initialize placeholder (warmup) function #10848

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Mar 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions release_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
<!-- Please add your release notes in the following format:
- My change description (#PR)
-->

- Update Java Worker Version to [2.18.1](https://github.com/Azure/azure-functions-java-worker/releases/tag/2.18.1)
- Introduced support for response compression, which can be enabled through explicit opt-in (#10870)
- Add support for the new feature flag `EnableAzureMonitorTimeIsoFormat`, which enables ISO time format for Azure Monitor logs on Linux Dedicated/EP SKUs. (#10684)
Expand All @@ -14,3 +15,4 @@
- Adding support for faas.invoke_duration metric and other spec related updates (#10929)
- Increased the GC allocation budget value to improve cold start (#10953)
- Fixed bug that could result in "Binding names must be unique" error (#10938)
- Fix race condition that leads the host to initialize placeholder (warmup) function in Linux environments (#10848)
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the MIT License. See License.txt in the project root for license information.

using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Azure.WebJobs.Script.WebHost.Management;
Expand Down Expand Up @@ -51,8 +52,15 @@ private async Task ApplyStartContextIfPresent(CancellationToken cancellationToke
var assignmentContext = _startupContextProvider.SetContext(encryptedAssignmentContext);
await SpecializeMSISideCar(assignmentContext);

bool success = _instanceManager.StartAssignment(assignmentContext);
_logger.LogDebug($"StartAssignment invoked (Success={success})");
try
{
bool success = await _instanceManager.AssignInstanceAsync(assignmentContext);
_logger.LogDebug("AssignInstanceAsync was invoked (Success={success})", success);
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to assign instance.");
}
}
else
{
Expand Down
12 changes: 12 additions & 0 deletions src/WebJobs.Script.WebHost/Management/IInstanceManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,18 @@ public interface IInstanceManager

Task<string> ValidateContext(HostAssignmentContext assignmentContext);

/// <summary>
/// Asynchronously assigns a host instance and completes only once the assignment work has finished.
/// </summary>
/// <param name="assignmentContext">The <see cref="HostAssignmentContext"/> that will be applied to the instance being assigned to the application.</param>
/// <returns>
/// A task whose result is <see langword="true"/> if environment validation succeeds and the assignment
/// (or warmup request) has been processed; otherwise <see langword="false"/>.
/// NOTE(review): the Linux implementation in this change also returns <see langword="false"/> when a
/// different assignment context has already been stored - confirm other implementations agree.
/// </returns>
/// <remarks>
/// Unlike <see cref="StartAssignment"/>, the returned task is intended to be awaited by the caller
/// rather than left running in the background.
/// </remarks>
Task<bool> AssignInstanceAsync(HostAssignmentContext assignmentContext);

/// <summary>
/// Validates the assignment context and begins the assignment process in a "fire and forget" pattern.
/// </summary>
/// <param name="assignmentContext">The <see cref="HostAssignmentContext"/> that will be applied to the instance being assigned to the application.</param>
/// <returns>
/// <see langword="true"/> if environment validation succeeds and the assignment was started; otherwise <see langword="false"/>.
/// NOTE(review): in the Linux implementation in this change, when a context has already been stored the
/// result is whether the supplied context equals the stored one - confirm other implementations agree.
/// </returns>
/// <remarks>
/// The method returns without waiting for the assignment to finish, so a <see langword="true"/> result
/// does not indicate that specialization has completed.
/// </remarks>
bool StartAssignment(HostAssignmentContext assignmentContext);

Task<string> SpecializeMSISidecar(HostAssignmentContext assignmentContext);
Expand Down
123 changes: 81 additions & 42 deletions src/WebJobs.Script.WebHost/Management/LinuxInstanceManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,11 @@

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Azure.Storage;
using Microsoft.Azure.Storage.File;
using Microsoft.Azure.WebJobs.Script.Diagnostics;
using Microsoft.Azure.WebJobs.Script.WebHost.Configuration;
using Microsoft.Azure.WebJobs.Script.WebHost.Management.LinuxSpecialization;
using Microsoft.Azure.WebJobs.Script.WebHost.Models;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Newtonsoft.Json;

namespace Microsoft.Azure.WebJobs.Script.WebHost.Management
{
Expand All @@ -29,6 +21,7 @@ public abstract class LinuxInstanceManager : IInstanceManager
private readonly IEnvironment _environment;
private readonly HttpClient _client;
private readonly IScriptWebHostEnvironment _webHostEnvironment;
private Task _assignment;

private HostAssignmentContext _assignmentContext;

Expand All @@ -44,70 +37,101 @@ public LinuxInstanceManager(IHttpClientFactory httpClientFactory, IScriptWebHost

public abstract Task<string> SpecializeMSISidecar(HostAssignmentContext context);

public bool StartAssignment(HostAssignmentContext context)
public async Task<bool> AssignInstanceAsync(HostAssignmentContext context)
{
if (!_webHostEnvironment.InStandbyMode)
if (!IsValidEnvironment(context))
{
// This is only true when specializing pinned containers.
if (!context.Environment.TryGetValue(EnvironmentSettingNames.ContainerStartContext, out string startContext))
return false;
}

if (context.IsWarmupRequest)
{
await HandleWarmupRequestAsync(context);
return true;
}

lock (_assignmentLock)
{
if (_assignmentContext == null)
{
_assignmentContext = context;
_assignment = AssignAsync(context);
}
else if (!_assignmentContext.Equals(context))
{
_logger.LogError("Assign called while host is not in placeholder mode and start context is not present.");
return false;
}
}

if (_environment.IsContainerReady())
await _assignment;
return true;
}

public bool StartAssignment(HostAssignmentContext context)
{
if (!IsValidEnvironment(context))
{
_logger.LogError("Assign called while container is marked as specialized.");
return false;
}

if (context.IsWarmupRequest)
{
// Based on profiling download code jit-ing holds up cold start.
// Pre-jit to avoid paying the cost later.
Task.Run(async () => await DownloadWarmupAsync(context.GetRunFromPkgContext()));
Task.Run(async () => await HandleWarmupRequestAsync(context));
return true;
}
else if (_assignmentContext == null)

lock (_assignmentLock)
{
lock (_assignmentLock)
if (_assignmentContext != null)
{
if (_assignmentContext != null)
{
return _assignmentContext.Equals(context);
}
_assignmentContext = context;
return _assignmentContext.Equals(context);
}
_assignmentContext = context;
_assignment = AssignAsync(context);
}

_logger.LogInformation($"Starting Assignment. Cloud Name: {_environment.GetCloudName()}");

// set a flag which will cause any incoming http requests to buffer
// until specialization is complete
// the host is guaranteed not to receive any requests until AFTER assign
// has been initiated, so setting this flag here is sufficient to ensure
// that any subsequent incoming requests while the assign is in progress
// will be delayed until complete
_webHostEnvironment.DelayRequests();
return true;
}

// start the specialization process in the background
Task.Run(async () => await AssignAsync(context));
public abstract Task<string> ValidateContext(HostAssignmentContext assignmentContext);

return true;
private bool IsValidEnvironment(HostAssignmentContext context)
{
if (!_webHostEnvironment.InStandbyMode)
{
// This is only true when specializing pinned containers.
if (!context.Environment.TryGetValue(EnvironmentSettingNames.ContainerStartContext, out string startContext))
{
_logger.LogError("Assign called while host is not in placeholder mode and start context is not present.");
return false;
}
}
else

if (_environment.IsContainerReady())
{
// No lock needed here since _assignmentContext is not null when we are here
return _assignmentContext.Equals(context);
_logger.LogError("Assign called while container is marked as specialized.");
return false;
}
}

public abstract Task<string> ValidateContext(HostAssignmentContext assignmentContext);
return true;
}

private async Task AssignAsync(HostAssignmentContext assignmentContext)
{
await Task.Yield(); // This may be called from within a lock. When AssignAsync is awaited, control flow will return to the caller and the lock will be released when it exits the lock scope.

try
{
_logger.LogInformation($"Starting Assignment. Cloud Name: {_environment.GetCloudName()}");

// set a flag which will cause any incoming http requests to buffer
// until specialization is complete
// the host is guaranteed not to receive any requests until AFTER assign
// has been initiated, so setting this flag here is sufficient to ensure
// that any subsequent incoming requests while the assign is in progress
// will be delayed until complete
_webHostEnvironment.DelayRequests();

// first make all environment and file system changes required for
// the host to be specialized
_logger.LogInformation("Applying {environmentCount} app setting(s)", assignmentContext.Environment.Count);
Expand All @@ -133,6 +157,21 @@ private async Task AssignAsync(HostAssignmentContext assignmentContext)
}
}

/// <summary>
/// Processes a warmup request by downloading the package described by the assignment context.
/// </summary>
/// <param name="assignmentContext">
/// The warmup <see cref="HostAssignmentContext"/>; its run-from-package context is handed to
/// <see cref="DownloadWarmupAsync"/>.
/// </param>
/// <returns>A task that completes when the warmup download has finished.</returns>
/// <remarks>
/// If the download throws, the failure is logged, a warning health event is sent, and the original
/// exception is rethrown to the caller.
/// </remarks>
private async Task HandleWarmupRequestAsync(HostAssignmentContext assignmentContext)
{
    try
    {
        await DownloadWarmupAsync(assignmentContext.GetRunFromPkgContext());
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Warmup download failed");

        try
        {
            await _meshServiceClient.NotifyHealthEvent(ContainerHealthEventType.Warning, GetType(), "Warmup download failed");
        }
        catch (Exception notifyEx)
        {
            // Health reporting is best-effort: if it fails, log it and fall through so the
            // caller still observes the original download exception rather than this one.
            _logger.LogError(notifyEx, "Failed to report warmup download failure");
        }

        // Rethrows the download exception caught by the outer catch, preserving its stack trace.
        throw;
    }
}

protected abstract Task ApplyContextAsync(HostAssignmentContext assignmentContext);

protected abstract Task<string> DownloadWarmupAsync(RunFromPackageContext context);
Expand Down
Loading
Loading