Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add continuous resource monitoring to AutoML.IMonitor #6520

Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
9647027
Fix a typo
andrasfuchs Nov 25, 2022
269b1bd
Fix trial cancellation bug
andrasfuchs Nov 25, 2022
a2c5781
Move performance related properties to TrialPerformanceMetrics and ad…
andrasfuchs Nov 25, 2022
e3fd992
Add new class and property explanations
andrasfuchs Nov 25, 2022
88fdefa
Revert "Fix trial cancellation bug"
andrasfuchs Dec 2, 2022
b03e46a
Remove pipeline info from the IMonitor Running event
andrasfuchs Dec 6, 2022
bf69dd2
Remove FreeSpaceOnDrives from TrialPerformanceMetrics
andrasfuchs Dec 6, 2022
38cf838
Change the default resource check interval to 5 seconds
andrasfuchs Dec 6, 2022
7f40df5
Remove StartedAtUtc property from TrialSettings
andrasfuchs Dec 22, 2022
8aa0ad8
move ReportTrialResourceUsage to IPerformanceMonitor
LittleLittleCloud Jan 3, 2023
739d865
Update AutoMLExperimentExtension.cs
LittleLittleCloud Jan 3, 2023
aeb651c
Merge pull request #2 from LittleLittleCloud/u/xiaoyun/add-cancellati…
andrasfuchs Jan 5, 2023
fc82c4c
Pause the performance monitor if the trial is not running
andrasfuchs Jan 6, 2023
d0ce0cd
Add StartedAtUtc and EndedAtUtc to TrialSettings
andrasfuchs Jan 8, 2023
4149a4b
cancel trial when as is
LittleLittleCloud Feb 6, 2023
7d3257a
fix tests
LittleLittleCloud Feb 6, 2023
c5c2d83
Merge branch 'main' into add-cancellation-and-resource-monitoring-to-…
LittleLittleCloud Feb 7, 2023
3919324
fix tests
LittleLittleCloud Feb 7, 2023
13ba949
fix tests
LittleLittleCloud Feb 7, 2023
488ff20
use workingset to evaluate memory usage
LittleLittleCloud Feb 8, 2023
49ac8ae
remove handler
LittleLittleCloud Feb 8, 2023
3722dcb
add handler back
LittleLittleCloud Feb 8, 2023
ff55857
add more logging
LittleLittleCloud Feb 8, 2023
509f963
add more logger
LittleLittleCloud Feb 8, 2023
1240335
add logging
LittleLittleCloud Feb 9, 2023
5a27af4
fix tests
LittleLittleCloud Feb 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 37 additions & 2 deletions src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs
Original file line number Diff line number Diff line change
Expand Up @@ -149,18 +149,53 @@ public static AutoMLExperiment SetPipeline(this AutoMLExperiment experiment, Swe
return experiment;
}

/// <summary>
/// Set <see cref="DefaultPerformanceMonitor"/> as <see cref="IPerformanceMonitor"/> for <see cref="AutoMLExperiment"/>.
/// </summary>
/// <param name="experiment"><see cref="AutoMLExperiment"/></param>
/// <param name="checkIntervalInMilliseconds">the interval in milliseconds for <see cref="DefaultPerformanceMonitor"/> to sample <see cref="TrialPerformanceMetrics"/></param>
/// <returns></returns>
public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment, int checkIntervalInMilliseconds = 1000)
{
experiment.SetPerformanceMonitor((service) =>
{
var channel = service.GetService<IChannel>();

return new DefaultPerformanceMonitor(channel, checkIntervalInMilliseconds);
var settings = service.GetRequiredService<AutoMLExperiment.AutoMLExperimentSettings>();
return new DefaultPerformanceMonitor(settings, channel, checkIntervalInMilliseconds);
});

return experiment;
}

/// <summary>
/// Register a factory-created performance monitor as the <see cref="IPerformanceMonitor"/> used by <see cref="AutoMLExperiment"/>.
/// </summary>
/// <typeparam name="TPerformanceMonitor">concrete <see cref="IPerformanceMonitor"/> type produced by <paramref name="factory"/>.</typeparam>
/// <param name="experiment"><see cref="AutoMLExperiment"/> whose service collection receives the registration.</param>
/// <param name="factory">callback that builds the monitor from an <see cref="IServiceProvider"/>; it is registered with transient lifetime.</param>
/// <returns>the same <paramref name="experiment"/> instance, so calls can be chained.</returns>
public static AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(this AutoMLExperiment experiment, Func<IServiceProvider, TPerformanceMonitor> factory)
    where TPerformanceMonitor : class, IPerformanceMonitor
{
    // Register against the interface so consumers resolve IPerformanceMonitor and get the factory's result.
    var services = experiment.ServiceCollection;
    services.AddTransient<IPerformanceMonitor>(factory);

    return experiment;
}

/// <summary>
/// Register <typeparamref name="TPerformanceMonitor"/> as the <see cref="IPerformanceMonitor"/> used by <see cref="AutoMLExperiment"/>.
/// </summary>
/// <typeparam name="TPerformanceMonitor">concrete <see cref="IPerformanceMonitor"/> implementation to register.</typeparam>
/// <param name="experiment"><see cref="AutoMLExperiment"/> whose service collection receives the registration.</param>
/// <returns>the same <paramref name="experiment"/> instance, so calls can be chained.</returns>
public static AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(this AutoMLExperiment experiment)
    where TPerformanceMonitor : class, IPerformanceMonitor
{
    // Transient lifetime: the container constructs the monitor itself each time it is resolved.
    var services = experiment.ServiceCollection;
    services.AddTransient<IPerformanceMonitor, TPerformanceMonitor>();

    return experiment;
}

/// <summary>
/// Set <see cref="SmacTuner"/> as tuner for hyper-parameter optimization. The performance of smac is to a large extent determined
/// by <paramref name="numberOfTrees"/>, <paramref name="nMinForSpit"/> and <paramref name="splitRatio"/>, which are used to fit smac's inner
Expand Down
60 changes: 21 additions & 39 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information.

using System;
using System.Diagnostics;
using System.Linq;
using System.Text.Json;
using System.Threading;
Expand Down Expand Up @@ -193,22 +194,6 @@ public AutoMLExperiment SetTuner<TTuner>()
return this;
}

internal AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>()
where TPerformanceMonitor : class, IPerformanceMonitor
{
_serviceCollection.AddTransient<IPerformanceMonitor, TPerformanceMonitor>();

return this;
}

internal AutoMLExperiment SetPerformanceMonitor<TPerformanceMonitor>(Func<IServiceProvider, TPerformanceMonitor> factory)
where TPerformanceMonitor : class, IPerformanceMonitor
{
_serviceCollection.AddTransient<IPerformanceMonitor>(factory);

return this;
}

/// <summary>
/// Run experiment and return the best trial result synchronously.
/// </summary>
Expand Down Expand Up @@ -249,25 +234,25 @@ public async Task<TrialResult> RunAsync(CancellationToken ct = default)
var trialNum = trialResultManager?.GetAllTrialResults().Max(t => t.TrialSettings?.TrialId) + 1 ?? 0;
var tuner = serviceProvider.GetService<ITuner>();
Contracts.Assert(tuner != null, "tuner can't be null");

while (!aggregateTrainingStopManager.IsStopTrainingRequested())
{
var setting = new TrialSettings()
var trialSettings = new TrialSettings()
{
TrialId = trialNum++,
Parameter = Parameter.CreateNestedParameter(),
StartedAtUtc = DateTime.UtcNow,
andrasfuchs marked this conversation as resolved.
Show resolved Hide resolved
};
var parameter = tuner.Propose(setting);
setting.Parameter = parameter;
var parameter = tuner.Propose(trialSettings);
trialSettings.Parameter = parameter;

monitor?.ReportRunningTrial(setting);
using (var trialCancellationTokenSource = new CancellationTokenSource())
{
monitor?.ReportRunningTrial(trialSettings);

void handler(object o, EventArgs e)
{
// only force-canceling running trials when there's completed trials.
// otherwise, wait for the current running trial to be completed.
if (_bestTrialResult != null)
trialCancellationTokenSource.Cancel();
trialCancellationTokenSource.Cancel();
}
try
{
Expand All @@ -276,27 +261,23 @@ void handler(object o, EventArgs e)
{
aggregateTrainingStopManager.OnStopTraining += handler;

performanceMonitor.MemoryUsageInMegaByte += (o, m) =>
performanceMonitor.PerformanceMetricsUpdated += (o, metrics) =>
{
if (_settings.MaximumMemoryUsageInMegaByte is double d && m > d && !trialCancellationTokenSource.IsCancellationRequested)
{
logger.Trace($"cancel current trial {setting.TrialId} because it uses {m} mb memory and the maximum memory usage is {d}");
trialCancellationTokenSource.Cancel();

GC.AddMemoryPressure(Convert.ToInt64(m) * 1024 * 1024);
GC.Collect();
}
performanceMonitor.OnPerformanceMetricsUpdatedHandler(trialSettings, metrics, trialCancellationTokenSource);
};

var trialTask = runner.RunAsync(trialSettings, trialCancellationTokenSource.Token);
performanceMonitor.Start();
logger.Trace($"trial setting - {JsonSerializer.Serialize(setting)}");
var trialResult = await runner.RunAsync(setting, trialCancellationTokenSource.Token);
logger.Trace($"trial setting - {JsonSerializer.Serialize(trialSettings)}");
var trialResult = await trialTask;

var peakCpu = performanceMonitor?.GetPeakCpuUsage();
var peakMemoryInMB = performanceMonitor?.GetPeakMemoryUsageInMegaByte();
trialResult.PeakCpu = peakCpu;
trialResult.PeakMemoryInMegaByte = peakMemoryInMB;
trialResult.TrialSettings.EndedAtUtc = DateTime.UtcNow;

performanceMonitor.Pause();
monitor?.ReportCompletedTrial(trialResult);
tuner.Update(trialResult);
trialResultManager?.AddOrUpdateTrialResult(trialResult);
Expand All @@ -313,10 +294,11 @@ void handler(object o, EventArgs e)
}
catch (OperationCanceledException ex) when (aggregateTrainingStopManager.IsStopTrainingRequested() == false)
{
monitor?.ReportFailTrial(setting, ex);
trialSettings.EndedAtUtc = DateTime.UtcNow;
monitor?.ReportFailTrial(trialSettings, ex);
var result = new TrialResult
{
TrialSettings = setting,
TrialSettings = trialSettings,
Loss = double.MaxValue,
};

Expand All @@ -329,7 +311,8 @@ void handler(object o, EventArgs e)
}
catch (Exception ex)
{
monitor?.ReportFailTrial(setting, ex);
trialSettings.EndedAtUtc = DateTime.UtcNow;
monitor?.ReportFailTrial(trialSettings, ex);

if (!aggregateTrainingStopManager.IsStopTrainingRequested() && _bestTrialResult == null)
{
Expand All @@ -343,7 +326,6 @@ void handler(object o, EventArgs e)
finally
{
aggregateTrainingStopManager.OnStopTraining -= handler;

}
}
}
Expand Down
5 changes: 3 additions & 2 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public interface IMonitor

void ReportFailTrial(TrialSettings settings, Exception exception = null);

void ReportRunningTrial(TrialSettings setting);
void ReportRunningTrial(TrialSettings settings);
}

/// <summary>
Expand All @@ -30,6 +30,7 @@ internal class MLContextMonitor : IMonitor
private readonly IChannel _logger;
private readonly List<TrialResult> _completedTrials;
private readonly SweepablePipeline _pipeline;

public MLContextMonitor(IChannel logger, SweepablePipeline pipeline)
{
_logger = logger;
Expand All @@ -55,7 +56,7 @@ public virtual void ReportFailTrial(TrialSettings settings, Exception exception

public virtual void ReportRunningTrial(TrialSettings setting)
{
_logger.Info($"Update Running Trial - Id: {setting.TrialId} - Pipeline: {_pipeline.ToString(setting.Parameter)}");
_logger.Info($"Update Running Trial - Id: {setting.TrialId}");
}
}

Expand Down
62 changes: 51 additions & 11 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,47 +5,57 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Timers;
using Microsoft.ML.Runtime;
using Timer = System.Timers.Timer;

namespace Microsoft.ML.AutoML
{
internal interface IPerformanceMonitor : IDisposable
public interface IPerformanceMonitor : IDisposable
{
void Start();

void Pause();

void Stop();

double? GetPeakMemoryUsageInMegaByte();

double? GetPeakCpuUsage();

public event EventHandler<double> CpuUsage;
/// <summary>
/// The handler invoked every time <see cref="PerformanceMetricsUpdated"/> is fired.
/// </summary>
void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource);


public event EventHandler<double> MemoryUsageInMegaByte;
public event EventHandler<TrialPerformanceMetrics> PerformanceMetricsUpdated;
andrasfuchs marked this conversation as resolved.
Show resolved Hide resolved
}

internal class DefaultPerformanceMonitor : IPerformanceMonitor
public class DefaultPerformanceMonitor : IPerformanceMonitor
{
private readonly IChannel _logger;
private readonly AutoMLExperiment.AutoMLExperimentSettings _settings;
private Timer _timer;
private double? _peakCpuUsage;
private double? _peakMemoryUsage;
private readonly int _checkIntervalInMilliseconds;
private TimeSpan _totalCpuProcessorTime;

public DefaultPerformanceMonitor(IChannel logger, int checkIntervalInMilliseconds)
public DefaultPerformanceMonitor(AutoMLExperiment.AutoMLExperimentSettings settings, IChannel logger, int checkIntervalInMilliseconds)
{
_settings = settings;
_logger = logger;
_checkIntervalInMilliseconds = checkIntervalInMilliseconds;
}


public event EventHandler<double> CpuUsage;

public event EventHandler<double> MemoryUsageInMegaByte;
public event EventHandler<TrialPerformanceMetrics> PerformanceMetricsUpdated;


public void Dispose()
Expand All @@ -71,9 +81,18 @@ public void Start()
_totalCpuProcessorTime = Process.GetCurrentProcess().TotalProcessorTime;
_timer.Elapsed += OnCheckCpuAndMemoryUsage;
_timer.AutoReset = true;
_timer.Enabled = true;
_logger?.Trace($"{typeof(DefaultPerformanceMonitor)} has been started");
}

// trigger the PerformanceMetricsUpdated event and (re)start the timer
_timer.Enabled = false;
SampleCpuAndMemoryUsage();
_timer.Enabled = true;
}

/// <summary>
/// Suspend sampling by disabling the timer. The timer itself is kept, so sampling
/// resumes once the timer is enabled again.
/// </summary>
public void Pause()
{
    // The constructor does not create the timer, so it can still be null when
    // Pause is called on a monitor that was never started; treat that as a no-op
    // instead of throwing a NullReferenceException.
    if (_timer != null)
    {
        _timer.Enabled = false;
    }
}

public void Stop()
Expand Down Expand Up @@ -110,9 +129,30 @@ private void SampleCpuAndMemoryUsage()
// calculate Memory Usage in MB
var memoryUsage = process.PrivateMemorySize64 * 1.0 / (1024 * 1024);
_peakMemoryUsage = Math.Max(memoryUsage, _peakMemoryUsage ?? 0);

andrasfuchs marked this conversation as resolved.
Show resolved Hide resolved
var metrics = new TrialPerformanceMetrics()
{
CpuUsage = cpuUsageInTotal,
MemoryUsage = memoryUsage,
PeakCpuUsage = _peakCpuUsage,
PeakMemoryUsage = _peakMemoryUsage
};

_logger?.Trace($"current CPU: {cpuUsageInTotal}, current Memory(mb): {memoryUsage}");
MemoryUsageInMegaByte?.Invoke(this, memoryUsage);
CpuUsage?.Invoke(this, cpuUsageInTotal);

PerformanceMetricsUpdated?.Invoke(this, metrics);
}
}

/// <summary>
/// Cancel the current trial when its peak memory usage exceeds the configured
/// <c>MaximumMemoryUsageInMegaByte</c> limit and the trial has not been cancelled yet.
/// </summary>
/// <param name="trialSettings">settings of the trial being monitored; only its id is used, for logging.</param>
/// <param name="metrics">the latest sampled metrics; <see cref="TrialPerformanceMetrics.PeakMemoryUsage"/> is compared against the limit.</param>
/// <param name="trialCancellationTokenSource">token source that is cancelled when the limit is exceeded.</param>
public virtual void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource)
{
    // No limit configured (null) or a null peak value makes the comparison false, so nothing happens.
    if (_settings.MaximumMemoryUsageInMegaByte is double d && metrics.PeakMemoryUsage > d && !trialCancellationTokenSource.IsCancellationRequested)
    {
        // The logger is optional (the rest of this class uses `_logger?.`), so a missing
        // logger must not prevent the over-limit trial from being cancelled.
        _logger?.Trace($"cancel current trial {trialSettings.TrialId} because it uses {metrics.PeakMemoryUsage} mb memory and the maximum memory usage is {d}");
        trialCancellationTokenSource.Cancel();

        // NOTE(review): AddMemoryPressure is never balanced by RemoveMemoryPressure here, and the
        // forced GC.Collect() is presumably meant to reclaim the cancelled trial's memory quickly — confirm intent.
        GC.AddMemoryPressure(Convert.ToInt64(metrics.PeakMemoryUsage) * 1024 * 1024);
        GC.Collect();
    }
}
}
Expand Down
13 changes: 12 additions & 1 deletion src/Microsoft.ML.AutoML/AutoMLExperiment/IStopTrainingManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,21 @@ public void AddTrainingStopManager(IStopTrainingManager manager)
_managers.Add(manager);
manager.OnStopTraining += (o, e) =>
{
OnStopTraining?.Invoke(this, e);
if (_managers.Exists(manager.Equals))
{
OnStopTraining?.Invoke(this, e);
}
};
}

/// <summary>
/// Remove every registered entry equal to <paramref name="manager"/>; does nothing when no entry matches.
/// </summary>
/// <param name="manager">the stop-training manager to unregister.</param>
public void RemoveTrainingStopManagerIfExist(IStopTrainingManager manager)
{
    // RemoveAll already leaves the list untouched when nothing matches,
    // so no separate existence check is needed.
    _managers.RemoveAll(manager.Equals);
}

public void Update(TrialResult result)
{
foreach (var manager in _managers)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Text;

namespace Microsoft.ML.AutoML
{
/// <summary>
/// Performance metrics for a trial: the most recent CPU/memory sample together with
/// the peak values observed so far.
/// </summary>
public class TrialPerformanceMetrics
{
/// <summary>
/// Peak memory usage during the trial, in megabytes. Nullable; presumably <see langword="null"/> until a sample has been recorded.
/// </summary>
public double? PeakMemoryUsage { get; set; }
/// <summary>
/// Peak CPU usage during the trial. Nullable; presumably <see langword="null"/> until a sample has been recorded.
/// </summary>
public double? PeakCpuUsage { get; set; }
/// <summary>
/// Current CPU usage of the runner process. Can only be set from within this assembly.
/// </summary>
public double CpuUsage { get; internal set; }
/// <summary>
/// Current memory usage of the runner process, in megabytes. Can only be set from within this assembly.
/// </summary>
public double MemoryUsage { get; internal set; }
}
}
Loading