Skip to content

Commit

Permalink
Role mapped improvements (#496)
Browse files Browse the repository at this point in the history
* Change RoleMappedSchema/RoleMappedData to use constructors
* Nuke all Create/CreateOpt factory methods and the pointless "no-roles" constructor.
* Nuke TrainUtils.CreateExamples/CreateExamplesOpt
* Opportunistically improve code quality and reporting of KMeans++
  • Loading branch information
TomFinley authored Jul 5, 2018
1 parent 52cc874 commit f85e722
Show file tree
Hide file tree
Showing 57 changed files with 387 additions and 437 deletions.
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Api/ComponentCreation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public static RoleMappedData CreateExamples(this IHostEnvironment env, IDataView
env.CheckValueOrNull(weight);
env.CheckValueOrNull(custom);

return TrainUtils.CreateExamples(data, label, features, group, weight, name: null, custom: custom);
return new RoleMappedData(data, label, features, group, weight, name: null, custom: custom);
}

/// <summary>
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Api/GenerateCodeCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ public void Run()
{
var roles = ModelFileUtils.LoadRoleMappingsOrNull(_host, fs);
scorer = roles != null
? _host.CreateDefaultScorer(RoleMappedData.CreateOpt(transformPipe, roles), pred)
: _host.CreateDefaultScorer(_host.CreateExamples(transformPipe, "Features"), pred);
? _host.CreateDefaultScorer(new RoleMappedData(transformPipe, roles, opt: true), pred)
: _host.CreateDefaultScorer(new RoleMappedData(transformPipe, label: null, "Features"), pred);
}

var nonScoreSb = new StringBuilder();
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Api/PredictionEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ internal BatchPredictionEngine(IHostEnvironment env, Stream modelStream, bool ig
{
var roles = ModelFileUtils.LoadRoleMappingsOrNull(env, modelStream);
pipe = roles != null
? env.CreateDefaultScorer(RoleMappedData.CreateOpt(pipe, roles), predictor)
: env.CreateDefaultScorer(env.CreateExamples(pipe, "Features"), predictor);
? env.CreateDefaultScorer(new RoleMappedData(pipe, roles, opt: true), predictor)
: env.CreateDefaultScorer(new RoleMappedData(pipe, label: null, "Features"), predictor);
}

_pipeEngine = new PipeEngine<TDst>(env, pipe, ignoreMissingColumns, outputSchemaDefinition);
Expand Down
1 change: 0 additions & 1 deletion src/Microsoft.ML.Core/Data/MetadataUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,6 @@ public static bool HasSlotNames(this ISchema schema, int col, int vectorSize)
public static void GetSlotNames(RoleMappedSchema schema, RoleMappedSchema.ColumnRole role, int vectorSize, ref VBuffer<DvText> slotNames)
{
Contracts.CheckValueOrNull(schema);
Contracts.CheckValue(role.Value, nameof(role));
Contracts.CheckParam(vectorSize >= 0, nameof(vectorSize));

IReadOnlyList<ColumnInfo> list;
Expand Down
361 changes: 218 additions & 143 deletions src/Microsoft.ML.Core/Data/RoleMappedSchema.cs

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions src/Microsoft.ML.Data/Commands/CrossValidationCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ private RoleMappedData ApplyAllTransformsToData(IHostEnvironment env, IChannel c
RoleMappedData srcData, IDataView marker)
{
var pipe = ApplyTransformUtils.ApplyAllTransformsToData(env, srcData.Data, dstData, marker);
return RoleMappedData.Create(pipe, srcData.Schema.GetColumnRoleNames());
return new RoleMappedData(pipe, srcData.Schema.GetColumnRoleNames());
}

/// <summary>
Expand All @@ -277,7 +277,7 @@ private RoleMappedData CreateRoleMappedData(IHostEnvironment env, IChannel ch, I
// Training pipe and examples.
var customCols = TrainUtils.CheckAndGenerateCustomColumns(ch, Args.CustomColumn);

return TrainUtils.CreateExamples(data, label, features, group, weight, name, customCols);
return new RoleMappedData(data, label, features, group, weight, name, customCols);
}

private string GetSplitColumn(IChannel ch, IDataView input, ref IDataView output)
Expand Down Expand Up @@ -568,7 +568,7 @@ private FoldResult RunFold(int fold)
{
using (var file = host.CreateOutputFile(modelFileName))
{
var rmd = RoleMappedData.Create(
var rmd = new RoleMappedData(
CompositeDataLoader.ApplyTransform(host, _loader, null, null,
(e, newSource) => ApplyTransformUtils.ApplyAllTransformsToData(e, trainData.Data, newSource)),
trainData.Schema.GetColumnRoleNames());
Expand All @@ -581,17 +581,17 @@ private FoldResult RunFold(int fold)
if (!evalComp.IsGood())
evalComp = EvaluateUtils.GetEvaluatorType(ch, scorePipe.Schema);
var eval = evalComp.CreateInstance(host);
// Note that this doesn't require the provided columns to exist (because of "Opt").
// Note that this doesn't require the provided columns to exist (because of the "opt" parameter).
// We don't normally expect the scorer to drop columns, but if it does, we should not require
// all the columns in the test pipeline to still be present.
var dataEval = RoleMappedData.CreateOpt(scorePipe, testData.Schema.GetColumnRoleNames());
var dataEval = new RoleMappedData(scorePipe, testData.Schema.GetColumnRoleNames(), opt: true);

var dict = eval.Evaluate(dataEval);
RoleMappedData perInstance = null;
if (_savePerInstance)
{
var perInst = eval.GetPerInstanceMetrics(dataEval);
perInstance = RoleMappedData.CreateOpt(perInst, dataEval.Schema.GetColumnRoleNames());
perInstance = new RoleMappedData(perInst, dataEval.Schema.GetColumnRoleNames(), opt: true);
}
ch.Done();
return new FoldResult(dict, dataEval.Schema.Schema, perInstance, trainData.Schema);
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Commands/DataCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ protected void LoadModelObjects(
// can be loaded with no data at all, to get their schemas.
if (trainPipe == null)
trainPipe = ModelFileUtils.LoadLoader(Host, rep, new MultiFileSource(null), loadTransforms: true);
trainSchema = RoleMappedSchema.Create(trainPipe.Schema, trainRoleMappings);
trainSchema = new RoleMappedSchema(trainPipe.Schema, trainRoleMappings);
}
// If the role mappings are null, an alternative would be to fail. However the idea
// is that the scorer should always still succeed, although perhaps with reduced
Expand Down
6 changes: 3 additions & 3 deletions src/Microsoft.ML.Data/Commands/EvaluateCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV
evalComp = EvaluateUtils.GetEvaluatorType(ch, input.Schema);

var eval = evalComp.CreateInstance(env);
var data = TrainUtils.CreateExamples(input, label, null, group, weight, null, customCols);
var data = new RoleMappedData(input, label, null, group, weight, null, customCols);
return eval.GetPerInstanceMetrics(data);
}
}
Expand Down Expand Up @@ -236,7 +236,7 @@ private void RunCore(IChannel ch)
if (!evalComp.IsGood())
evalComp = EvaluateUtils.GetEvaluatorType(ch, view.Schema);
var evaluator = evalComp.CreateInstance(Host);
var data = TrainUtils.CreateExamples(view, label, null, group, weight, name, customCols);
var data = new RoleMappedData(view, label, null, group, weight, name, customCols);
var metrics = evaluator.Evaluate(data);
MetricWriter.PrintWarnings(ch, metrics);
evaluator.PrintFoldResults(ch, metrics);
Expand All @@ -248,7 +248,7 @@ private void RunCore(IChannel ch)
if (!string.IsNullOrWhiteSpace(Args.OutputDataFile))
{
var perInst = evaluator.GetPerInstanceMetrics(data);
var perInstData = TrainUtils.CreateExamples(perInst, label, null, group, weight, name, customCols);
var perInstData = new RoleMappedData(perInst, label, null, group, weight, name, customCols);
var idv = evaluator.GetPerInstanceDataViewToSave(perInstData);
MetricWriter.SavePerInstance(Host, ch, Args.OutputDataFile, idv);
}
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Commands/SavePredictorCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ public static void LoadModel(IHostEnvironment env, Stream modelStream, bool load
if (roles != null)
{
var emptyView = ModelFileUtils.LoadPipeline(env, rep, new MultiFileSource(null));
schema = RoleMappedSchema.CreateOpt(emptyView.Schema, roles);
schema = new RoleMappedSchema(emptyView.Schema, roles, opt: true);
}
else
{
Expand Down
41 changes: 16 additions & 25 deletions src/Microsoft.ML.Data/Commands/ScoreCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,7 @@ private void RunCore(IChannel ch)

ch.Trace("Creating loader");

IPredictor predictor;
IDataLoader loader;
RoleMappedSchema trainSchema;
LoadModelObjects(ch, true, out predictor, true, out trainSchema, out loader);
LoadModelObjects(ch, true, out var predictor, true, out var trainSchema, out var loader);
ch.AssertValue(predictor);
ch.AssertValueOrNull(trainSchema);
ch.AssertValue(loader);
Expand All @@ -116,7 +113,7 @@ private void RunCore(IChannel ch)
string group = TrainUtils.MatchNameOrDefaultOrNull(ch, loader.Schema,
nameof(Args.GroupColumn), Args.GroupColumn, DefaultColumnNames.GroupId);
var customCols = TrainUtils.CheckAndGenerateCustomColumns(ch, Args.CustomColumn);
var schema = TrainUtils.CreateRoleMappedSchemaOpt(loader.Schema, feat, group, customCols);
var schema = new RoleMappedSchema(loader.Schema, label: null, feature: feat, group: group, custom: customCols, opt: true);
var mapper = bindable.Bind(Host, schema);

if (!scorer.IsGood())
Expand Down Expand Up @@ -153,22 +150,20 @@ private void RunCore(IChannel ch)
Args.OutputAllColumns == true || Utils.Size(Args.OutputColumn) == 0;

if (Args.OutputAllColumns == true && Utils.Size(Args.OutputColumn) != 0)
ch.Warning("outputAllColumns=+ always writes all columns irrespective of outputColumn specified.");
ch.Warning(nameof(Args.OutputAllColumns) + "=+ always writes all columns irrespective of " + nameof(Args.OutputColumn) + " specified.");

if (!outputAllColumns && Utils.Size(Args.OutputColumn) != 0)
{
foreach (var outCol in Args.OutputColumn)
{
int dummyColIndex;
if (!loader.Schema.TryGetColumnIndex(outCol, out dummyColIndex))
if (!loader.Schema.TryGetColumnIndex(outCol, out int dummyColIndex))
throw ch.ExceptUserArg(nameof(Arguments.OutputColumn), "Column '{0}' not found.", outCol);
}
}

int colMax;
uint maxScoreId = 0;
if (!outputAllColumns)
maxScoreId = loader.Schema.GetMaxMetadataKind(out colMax, MetadataUtils.Kinds.ScoreColumnSetId);
maxScoreId = loader.Schema.GetMaxMetadataKind(out int colMax, MetadataUtils.Kinds.ScoreColumnSetId);
ch.Assert(outputAllColumns || maxScoreId > 0); // score set IDs are one-based
var cols = new List<int>();
for (int i = 0; i < loader.Schema.ColumnCount; i++)
Expand Down Expand Up @@ -211,12 +206,12 @@ private bool ShouldAddColumn(ISchema schema, int i, uint scoreSet, bool outputNa
{
switch (schema.GetColumnName(i))
{
case "Label":
case "Name":
case "Names":
return true;
default:
break;
case "Label":
case "Name":
case "Names":
return true;
default:
break;
}
}
if (Args.OutputColumn != null && Array.FindIndex(Args.OutputColumn, schema.GetColumnName(i).Equals) >= 0)
Expand All @@ -229,8 +224,7 @@ public static class ScoreUtils
{
public static IDataScorerTransform GetScorer(IPredictor predictor, RoleMappedData data, IHostEnvironment env, RoleMappedSchema trainSchema)
{
ISchemaBoundMapper mapper;
var sc = GetScorerComponentAndMapper(predictor, null, data.Schema, env, out mapper);
var sc = GetScorerComponentAndMapper(predictor, null, data.Schema, env, out var mapper);
return sc.CreateInstance(env, data.Data, mapper, trainSchema);
}

Expand All @@ -247,9 +241,8 @@ public static IDataScorerTransform GetScorer(SubComponent<IDataScorerTransform,
env.CheckValueOrNull(customColumns);
env.CheckValueOrNull(trainSchema);

var schema = TrainUtils.CreateRoleMappedSchemaOpt(input.Schema, featureColName, groupColName, customColumns);
ISchemaBoundMapper mapper;
var sc = GetScorerComponentAndMapper(predictor, scorer, schema, env, out mapper);
var schema = new RoleMappedSchema(input.Schema, label: null, feature: featureColName, group: groupColName, custom: customColumns, opt: true);
var sc = GetScorerComponentAndMapper(predictor, scorer, schema, env, out var mapper);
return sc.CreateInstance(env, input, mapper, trainSchema);
}

Expand Down Expand Up @@ -280,7 +273,7 @@ public static SubComponent<IDataScorerTransform, SignatureDataScorer> GetScorerC
Contracts.AssertValue(mapper);

string loadName = null;
DvText scoreKind = default(DvText);
DvText scoreKind = default;
if (mapper.OutputSchema.ColumnCount > 0 &&
mapper.OutputSchema.TryGetMetadata(TextType.Instance, MetadataUtils.Kinds.ScoreColumnKind, 0, ref scoreKind) &&
scoreKind.HasChars)
Expand Down Expand Up @@ -311,10 +304,8 @@ public static ISchemaBindableMapper GetSchemaBindableMapper(IHostEnvironment env
env.CheckValue(predictor, nameof(predictor));
env.CheckValueOrNull(scorerSettings);

ISchemaBindableMapper bindable;

// See if we can instantiate a mapper using scorer arguments.
if (scorerSettings.IsGood() && TryCreateBindableFromScorer(env, predictor, scorerSettings, out bindable))
if (scorerSettings.IsGood() && TryCreateBindableFromScorer(env, predictor, scorerSettings, out var bindable))
return bindable;

// The easy case is that the predictor implements the interface.
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Commands/TestCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ private void RunCore(IChannel ch)
if (!evalComp.IsGood())
evalComp = EvaluateUtils.GetEvaluatorType(ch, scorePipe.Schema);
var evaluator = evalComp.CreateInstance(Host);
var data = TrainUtils.CreateExamples(scorePipe, label, null, group, weight, name, customCols);
var data = new RoleMappedData(scorePipe, label, null, group, weight, name, customCols);
var metrics = evaluator.Evaluate(data);
MetricWriter.PrintWarnings(ch, metrics);
evaluator.PrintFoldResults(ch, metrics);
Expand All @@ -128,7 +128,7 @@ private void RunCore(IChannel ch)
if (!string.IsNullOrWhiteSpace(Args.OutputDataFile))
{
var perInst = evaluator.GetPerInstanceMetrics(data);
var perInstData = TrainUtils.CreateExamples(perInst, label, null, group, weight, name, customCols);
var perInstData = new RoleMappedData(perInst, label, null, group, weight, name, customCols);
var idv = evaluator.GetPerInstanceDataViewToSave(perInstData);
MetricWriter.SavePerInstance(Host, ch, Args.OutputDataFile, idv);
}
Expand Down
Loading

0 comments on commit f85e722

Please sign in to comment.