AutoML graph output changes and training metrics exposure #148

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
12 changes: 8 additions & 4 deletions src/Microsoft.ML.PipelineInference/AutoInference.cs
@@ -172,12 +172,14 @@ private bool GetDataVariableName(IExceptionContext ectx, string nameOfData, JTok
public sealed class RunSummary
{
public double MetricValue { get; }
public double TrainingMetricValue { get; }
public int NumRowsInTraining { get; }
public long RunTimeMilliseconds { get; }

public RunSummary(double metricValue, int numRows, long runTimeMilliseconds)
public RunSummary(double metricValue, int numRows, long runTimeMilliseconds, double trainingMetricValue)
{
MetricValue = metricValue;
TrainingMetricValue = trainingMetricValue;
NumRowsInTraining = numRows;
RunTimeMilliseconds = runTimeMilliseconds;
}
@@ -303,7 +305,7 @@ private void MainLearningLoop(int batchSize, int numOfTrainingRows)
var stopwatch = new Stopwatch();
var probabilityUtils = new Sweeper.Algorithms.SweeperProbabilityUtils(_host);

while (!_terminator.ShouldTerminate(_history))
while (!_terminator.ShouldTerminate(_history))
{
// Get next set of candidates
var currentBatchSize = batchSize;
@@ -341,16 +343,18 @@ private void ProcessPipeline(Sweeper.Algorithms.SweeperProbabilityUtils utils, S

// Run pipeline, and time how long it takes
stopwatch.Restart();
double d = candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows),
Tuple<double, double> result = candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows),
_testData, Metric, TrainerKind);
stopwatch.Stop();
double d = result.Item1;
double d2 = result.Item2;

// Handle key collisions on sorted list
while (_sortedSampledElements.ContainsKey(d))
d += 1e-10;

// Save performance score
candidate.PerformanceSummary = new RunSummary(d, randomizedNumberOfRows, stopwatch.ElapsedMilliseconds);
candidate.PerformanceSummary = new RunSummary(d, randomizedNumberOfRows, stopwatch.ElapsedMilliseconds, d2);
_sortedSampledElements.Add(candidate.PerformanceSummary.MetricValue, candidate);
_history.Add(candidate);
}
24 changes: 20 additions & 4 deletions src/Microsoft.ML.PipelineInference/AutoMlUtils.cs
@@ -15,21 +15,35 @@ namespace Microsoft.ML.Runtime.PipelineInference
{
public static class AutoMlUtils
{
public static AutoInference.RunSummary ExtractRunSummary(IHostEnvironment env, IDataView data, string metricColumnName)
public static AutoInference.RunSummary ExtractRunSummary(IHostEnvironment env, IDataView result, string metricColumnName, IDataView trainResult = null)
{
double metricValue = 0;
double trainingMetricValue = -1d;
int numRows = 0;
var schema = data.Schema;
var schema = result.Schema;
schema.TryGetColumnIndex(metricColumnName, out var metricCol);

using (var cursor = data.GetRowCursor(col => col == metricCol))
using (var cursor = result.GetRowCursor(col => col == metricCol))
{
var getter = cursor.GetGetter<double>(metricCol);
cursor.MoveNext();
getter(ref metricValue);
}

return new AutoInference.RunSummary(metricValue, numRows, 0);
if (trainResult != null)
{
var trainSchema = trainResult.Schema;
trainSchema.TryGetColumnIndex(metricColumnName, out var trainingMetricCol);

using (var cursor = trainResult.GetRowCursor(col => col == trainingMetricCol))
{
var getter = cursor.GetGetter<double>(trainingMetricCol);
cursor.MoveNext();
getter(ref trainingMetricValue);
}
}

return new AutoInference.RunSummary(metricValue, numRows, 0, trainingMetricValue);
}

public static CommonInputs.IEvaluatorInput CloneEvaluatorInstance(CommonInputs.IEvaluatorInput evalInput) =>
@@ -618,5 +632,7 @@ public static Tuple<string, string[]>[] ConvertToSweepArgumentStrings(TlcModule.
}
return results;
}

public static string GenerateOverallTrainingMetricVarName(Guid id) => $"Var_Training_OM_{id:N}";
}
}
19 changes: 14 additions & 5 deletions src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs
@@ -65,18 +65,24 @@ public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input)
var col1 = new KeyValuePair<string, ColumnType>("Graph", TextType.Instance);
var col2 = new KeyValuePair<string, ColumnType>("MetricValue", PrimitiveType.FromKind(DataKind.R8));
var col3 = new KeyValuePair<string, ColumnType>("PipelineId", TextType.Instance);
var col4 = new KeyValuePair<string, ColumnType>("TrainingMetricValue", PrimitiveType.FromKind(DataKind.R8));
var col5 = new KeyValuePair<string, ColumnType>("FirstInput", TextType.Instance);
var col6 = new KeyValuePair<string, ColumnType>("PredictorModel", TextType.Instance);

if (rows.Count == 0)
{
var host = env.Register("ExtractSweepResult");
outputView = new EmptyDataView(host, new SimpleSchema(host, col1, col2, col3));
outputView = new EmptyDataView(host, new SimpleSchema(host, col1, col2, col3, col4, col5, col6));
}
else
{
var builder = new ArrayDataViewBuilder(env);
builder.AddColumn(col1.Key, (PrimitiveType)col1.Value, rows.Select(r => new DvText(r.GraphJson)).ToArray());
builder.AddColumn(col2.Key, (PrimitiveType)col2.Value, rows.Select(r => r.MetricValue).ToArray());
builder.AddColumn(col3.Key, (PrimitiveType)col3.Value, rows.Select(r => new DvText(r.PipelineId)).ToArray());
builder.AddColumn(col4.Key, (PrimitiveType)col4.Value, rows.Select(r => r.TrainingMetricValue).ToArray());
builder.AddColumn(col5.Key, (PrimitiveType)col5.Value, rows.Select(r => new DvText(r.FirstInput)).ToArray());
builder.AddColumn(col6.Key, (PrimitiveType)col6.Value, rows.Select(r => new DvText(r.PredictorModel)).ToArray());
outputView = builder.GetDataView();
}
return new Output { Results = outputView, State = autoMlState };
@@ -132,11 +138,11 @@ public static CommonOutputs.MacroOutput<Output> PipelineSweep(
// Extract performance summaries and assign to previous candidate pipelines.
foreach (var pipeline in autoMlState.BatchCandidates)
{
if (node.Context.TryGetVariable(ExperimentUtils.GenerateOverallMetricVarName(pipeline.UniqueId),
out var v))
if (node.Context.TryGetVariable(ExperimentUtils.GenerateOverallMetricVarName(pipeline.UniqueId), out var v) &&
node.Context.TryGetVariable(AutoMlUtils.GenerateOverallTrainingMetricVarName(pipeline.UniqueId), out var v2))
{
pipeline.PerformanceSummary =
AutoMlUtils.ExtractRunSummary(env, (IDataView)v.Value, autoMlState.Metric.Name);
AutoMlUtils.ExtractRunSummary(env, (IDataView)v.Value, autoMlState.Metric.Name, (IDataView)v2.Value);
autoMlState.AddEvaluated(pipeline);
}
}
@@ -168,14 +174,17 @@ public static CommonOutputs.MacroOutput<Output> PipelineSweep(
{
// Add train test experiments to current graph for candidate pipeline
var subgraph = new Experiment(env);
var trainTestOutput = p.AddAsTrainTest(training, testing, autoMlState.TrainerKind, subgraph);
var trainTestOutput = p.AddAsTrainTest(training, testing, autoMlState.TrainerKind, subgraph, true);

// Change variable name to reference pipeline ID in output map, context and entrypoint output.
var uniqueName = ExperimentUtils.GenerateOverallMetricVarName(p.UniqueId);
var uniqueNameTraining = AutoMlUtils.GenerateOverallTrainingMetricVarName(p.UniqueId);
var sgNode = EntryPointNode.ValidateNodes(env, node.Context,
new JArray(subgraph.GetNodes().Last()), node.Catalog).Last();
sgNode.RenameOutputVariable(trainTestOutput.OverallMetrics.VarName, uniqueName, cascadeChanges: true);
sgNode.RenameOutputVariable(trainTestOutput.TrainingOverallMetrics.VarName, uniqueNameTraining, cascadeChanges: true);
trainTestOutput.OverallMetrics.VarName = uniqueName;
trainTestOutput.TrainingOverallMetrics.VarName = uniqueNameTraining;
expNodes.Add(sgNode);

// Store indicators, to pass to next iteration of macro.
1 change: 1 addition & 0 deletions src/Microsoft.ML.PipelineInference/Microsoft.ML.PipelineInference.csproj
@@ -17,6 +17,7 @@
<ProjectReference Include="..\Microsoft.ML.Core\Microsoft.ML.Core.csproj" />
<ProjectReference Include="..\Microsoft.ML.StandardLearners\Microsoft.ML.StandardLearners.csproj" />
<ProjectReference Include="..\Microsoft.ML.Sweeper\Microsoft.ML.Sweeper.csproj" />
<ProjectReference Include="..\Microsoft.ML\Microsoft.ML.csproj" />
</ItemGroup>

</Project>
78 changes: 64 additions & 14 deletions src/Microsoft.ML.PipelineInference/PipelinePattern.cs
@@ -21,16 +21,24 @@ public sealed class PipelineResultRow
{
public string GraphJson { get; }
public double MetricValue { get; }
public double TrainingMetricValue { get; }
public string PipelineId { get; }
public string FirstInput { get; }
public string PredictorModel { get; }

public PipelineResultRow()
{ }

public PipelineResultRow(string graphJson, double metricValue, string pipelineId)
public PipelineResultRow(string graphJson, double metricValue,
string pipelineId, double trainingMetricValue, string firstInput,
string predictorModel)
{
GraphJson = graphJson;
MetricValue = metricValue;
PipelineId = pipelineId;
TrainingMetricValue = trainingMetricValue;
FirstInput = firstInput;
PredictorModel = predictorModel;
}
}

@@ -111,7 +119,8 @@ public AutoInference.EntryPointGraphDef ToEntryPointGraph(Experiment experiment
public bool Equals(PipelinePattern obj) => obj != null && UniqueId == obj.UniqueId;

// REVIEW: We may want to allow for sweeping with CV in the future, so we will need to add new methods like this, or refactor these in that case.
public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testData, MacroUtils.TrainerKinds trainerKind, out Models.TrainTestEvaluator.Output resultsOutput)
public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testData, MacroUtils.TrainerKinds trainerKind,
bool includeTrainingMetrics, out Models.TrainTestEvaluator.Output resultsOutput)
{
var graphDef = ToEntryPointGraph();
var subGraph = graphDef.Graph;
@@ -136,7 +145,8 @@ public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testD
Model = finalOutput
},
PipelineId = UniqueId.ToString("N"),
Kind = MacroUtils.TrainerKindApiValue<Models.MacroUtilsTrainerKinds>(trainerKind)
Kind = MacroUtils.TrainerKindApiValue<Models.MacroUtilsTrainerKinds>(trainerKind),
IncludeTrainingMetrics = includeTrainingMetrics
};

var experiment = _env.CreateExperiment();
@@ -150,7 +160,7 @@ public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testD
}

public Models.TrainTestEvaluator.Output AddAsTrainTest(Var<IDataView> trainData, Var<IDataView> testData,
MacroUtils.TrainerKinds trainerKind, Experiment experiment = null)
MacroUtils.TrainerKinds trainerKind, Experiment experiment = null, bool includeTrainingMetrics = false)
{
experiment = experiment ?? _env.CreateExperiment();
var graphDef = ToEntryPointGraph(experiment);
@@ -174,7 +184,8 @@ public Models.TrainTestEvaluator.Output AddAsTrainTest(Var<IDataView> trainData,
TrainingData = trainData,
TestingData = testData,
Kind = MacroUtils.TrainerKindApiValue<Models.MacroUtilsTrainerKinds>(trainerKind),
PipelineId = UniqueId.ToString("N")
PipelineId = UniqueId.ToString("N"),
IncludeTrainingMetrics = includeTrainingMetrics
};
var trainTestOutput = experiment.Add(trainTestInput);
return trainTestOutput;
@@ -183,34 +194,55 @@ public Models.TrainTestEvaluator.Output AddAsTrainTest(Var<IDataView> trainData,
/// <summary>
/// Runs a train-test experiment on the current pipeline, through entrypoints.
/// </summary>
public double RunTrainTestExperiment(IDataView trainData, IDataView testData, AutoInference.SupportedMetric metric, MacroUtils.TrainerKinds trainerKind)
public Tuple<double, double> RunTrainTestExperiment(IDataView trainData, IDataView testData, AutoInference.SupportedMetric metric, MacroUtils.TrainerKinds trainerKind)
Contributor commented on Tuple<double, double>:

Hi George, now that this is returning multiple values, do you think it might be nice to restructure this a little so that it uses out parameters? My reasoning is that this would make the code clearer.

I'll tell you why I think this: I started in AutoInference.cs, where I saw this tuple being created. I wondered what was in it. But the usage there didn't help me, specifically:

double d = result.Item1;
double d2 = result.Item2;

I eventually made my way here, but there was no documentation. I read through the method and finally found it.

If there were two out parameters, one named metricValue and one named trainingMetricValue, then the method would be (to some extent) more self-documenting.

For this reason and some others we've tended to avoid the use of Tuple. (There happens to be a lovely new concept of tuples in C# 7 that solves most of what we think of as Tuple's problems, but using it is a bit more involved, requiring subscriptions to NuGet packages and the like that we may not want to commit to right now.)
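For reference, the C# 7 value-tuple shape the reviewer alludes to would look roughly like this. This is illustrative only, not code from this PR, and at the time it would have pulled in the System.ValueTuple package:

public (double metricValue, double trainingMetricValue) RunTrainTestExperiment(
    IDataView trainData, IDataView testData,
    AutoInference.SupportedMetric metric, MacroUtils.TrainerKinds trainerKind)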

Contributor Author replied:

Good idea. I'll make the suggested change.

{
var experiment = CreateTrainTestExperiment(trainData, testData, trainerKind, out var trainTestOutput);
var experiment = CreateTrainTestExperiment(trainData, testData, trainerKind, true, out var trainTestOutput);
experiment.Run();

var dataOut = experiment.GetOutput(trainTestOutput.OverallMetrics);
var schema = dataOut.Schema;
schema.TryGetColumnIndex(metric.Name, out var metricCol);
double metricValue = 0;
double trainingMetricValue = 0;

using (var cursor = dataOut.GetRowCursor(col => col == metricCol))
{
var getter = cursor.GetGetter<double>(metricCol);
double metricValue = 0;
cursor.MoveNext();
getter(ref metricValue);
return metricValue;
}

dataOut = experiment.GetOutput(trainTestOutput.TrainingOverallMetrics);
schema = dataOut.Schema;
schema.TryGetColumnIndex(metric.Name, out metricCol);

using (var cursor = dataOut.GetRowCursor(col => col == metricCol))
{
var getter = cursor.GetGetter<double>(metricCol);
cursor.MoveNext();
getter(ref trainingMetricValue);
return new Tuple<double, double>(metricValue, trainingMetricValue);
}
}
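As a follow-up to the review thread above, here is a minimal sketch of the out-parameter refactor the reviewer suggests. It is illustrative only, not the code in this PR: the GetFirstMetric helper is hypothetical, though its cursor logic mirrors the method above.

// Sketch of the suggested shape: named out parameters instead of Tuple<double, double>.
public void RunTrainTestExperiment(IDataView trainData, IDataView testData,
    AutoInference.SupportedMetric metric, MacroUtils.TrainerKinds trainerKind,
    out double metricValue, out double trainingMetricValue)
{
    var experiment = CreateTrainTestExperiment(trainData, testData, trainerKind, true, out var trainTestOutput);
    experiment.Run();

    // Each overall-metrics output is a single-row IDataView; read the requested metric column from each.
    metricValue = GetFirstMetric(experiment.GetOutput(trainTestOutput.OverallMetrics), metric.Name);
    trainingMetricValue = GetFirstMetric(experiment.GetOutput(trainTestOutput.TrainingOverallMetrics), metric.Name);
}

// Hypothetical helper; the cursor logic is the same as in the method above.
private static double GetFirstMetric(IDataView dataOut, string metricName)
{
    dataOut.Schema.TryGetColumnIndex(metricName, out var metricCol);
    double value = 0;
    using (var cursor = dataOut.GetRowCursor(col => col == metricCol))
    {
        var getter = cursor.GetGetter<double>(metricCol);
        cursor.MoveNext();
        getter(ref value);
    }
    return value;
}

With that shape, the call site in AutoInference.ProcessPipeline would read naturally, for example:

candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows), _testData,
    Metric, TrainerKind, out var metricValue, out var trainingMetricValue);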

public static PipelineResultRow[] ExtractResults(IHostEnvironment env, IDataView data, string graphColName, string metricColName, string idColName)
public static PipelineResultRow[] ExtractResults(IHostEnvironment env, IDataView data,
string graphColName, string metricColName, string idColName, string trainingMetricColName,
string firstInputColName, string predictorModelColName)
{
var results = new List<PipelineResultRow>();
var schema = data.Schema;
if (!schema.TryGetColumnIndex(graphColName, out var graphCol))
throw env.ExceptNotSupp($"Column name {graphColName} not found");
if (!schema.TryGetColumnIndex(metricColName, out var metricCol))
throw env.ExceptNotSupp($"Column name {metricColName} not found");
if (!schema.TryGetColumnIndex(trainingMetricColName, out var trainingMetricCol))
throw env.ExceptNotSupp($"Column name {trainingMetricColName} not found");
if (!schema.TryGetColumnIndex(idColName, out var pipelineIdCol))
throw env.ExceptNotSupp($"Column name {idColName} not found");
if (!schema.TryGetColumnIndex(firstInputColName, out var firstInputCol))
throw env.ExceptNotSupp($"Column name {firstInputColName} not found");
if (!schema.TryGetColumnIndex(predictorModelColName, out var predictorModelCol))
throw env.ExceptNotSupp($"Column name {predictorModelColName} not found");

using (var cursor = data.GetRowCursor(col => true))
{
@@ -225,15 +257,33 @@ public static PipelineResultRow[] ExtractResults(IHostEnvironment env, IDataView
var getter3 = cursor.GetGetter<DvText>(pipelineIdCol);
DvText pipelineId = new DvText();
getter3(ref pipelineId);
results.Add(new PipelineResultRow(graphJson.ToString(), metricValue, pipelineId.ToString()));
var getter4 = cursor.GetGetter<double>(trainingMetricCol);
double trainingMetricValue = 0;
getter4(ref trainingMetricValue);
var getter5 = cursor.GetGetter<DvText>(firstInputCol);
DvText firstInput = new DvText();
getter5(ref firstInput);
var getter6 = cursor.GetGetter<DvText>(predictorModelCol);
DvText predictorModel = new DvText();
getter6(ref predictorModel);

results.Add(new PipelineResultRow(graphJson.ToString(),
metricValue, pipelineId.ToString(), trainingMetricValue,
firstInput.ToString(), predictorModel.ToString()));
}
}

return results.ToArray();
}

public PipelineResultRow ToResultRow() =>
new PipelineResultRow(ToEntryPointGraph().Graph.ToJsonString(),
PerformanceSummary?.MetricValue ?? -1d, UniqueId.ToString("N"));
public PipelineResultRow ToResultRow()
{
var graphDef = ToEntryPointGraph();

return new PipelineResultRow($"{{'Nodes' : [{graphDef.Graph.ToJsonString()}]}}",
PerformanceSummary?.MetricValue ?? -1d, UniqueId.ToString("N"),
PerformanceSummary?.TrainingMetricValue ?? -1d,
graphDef.GetSubgraphFirstNodeDataVarName(_env),
graphDef.ModelOutput.VarName);
}
}
}