Closed
Description
System information
- OS version/distro: Windows 10 Pro x64
- .NET Version (e.g., dotnet --info): .NET Core 3.0
- ML.NET Version: 1.5.0-preview
Issue
I am trying to use a variable number of properties (a dynamic schema) for the trainer by using dataView.SelectColumns. This creates a correct trainer with only 2 features, but the prediction engine still requires the original input model to be specified and uses all 10+ features, even though all features except the 2 selected ones were set to 0.
What did you do?
- use input model with 10 features / properties
- create data view and select only 2 of these features
- use LGBM as a trainer
- create 3 input items labeled Strategy1, Strategy2 and Strategy3, and train the estimator
- try to make prediction providing test item identical to Strategy3
What happened?
- the output schema in CreatePredictionEngine shows that there are 10+ columns, even though I selected only 2 features when I created the data view for training
- the result of the prediction is always the same - Strategy1 - most probably because the trainer always compares 10+ features instead of 2, even though all features except the 2 selected ones were set to 0
What did you expect?
- if estimator was trained to use only 2 features / input properties, then prediction engine should use provided data view schema and should also work only with 2 selected properties
- in the code below I'd like to make sure that properties Contrast, Param1 ... Param5 are ignored by prediction engine
Source code / logs
/// <summary>
/// Input row used for both training and prediction: one text label
/// plus the numeric columns that may be selected as features.
/// </summary>
public class MyInputModel
{
    /// <summary>Label text; loaded from column 0.</summary>
    [ColumnName(nameof(PredictorLabelsEnum.Strategy))]
    [LoadColumn(0)]
    public string Strategy { get; set; }

    /// <summary>Scalar value loaded from column 1.</summary>
    [ColumnName(nameof(InputNamesEnum.Pitch))]
    [LoadColumn(1)]
    public float Pitch { get; set; }

    /// <summary>Scalar value loaded from column 2.</summary>
    [ColumnName(nameof(InputNamesEnum.Energy))]
    [LoadColumn(2)]
    public float Energy { get; set; }

    /// <summary>Six-element vector loaded from columns 3 through 8.</summary>
    [ColumnName(nameof(InputNamesEnum.Contrast))]
    [LoadColumn(3, 8)]
    [VectorType(6)]
    public float[] Contrast { get; set; }

    /// <summary>Scalar value loaded from column 9.</summary>
    [ColumnName(nameof(InputNamesEnum.Param1))]
    [LoadColumn(9)]
    public float Param1 { get; set; }

    /// <summary>Scalar value loaded from column 10.</summary>
    [ColumnName(nameof(InputNamesEnum.Param2))]
    [LoadColumn(10)]
    public float Param2 { get; set; }

    /// <summary>Scalar value loaded from column 11.</summary>
    [ColumnName(nameof(InputNamesEnum.Param3))]
    [LoadColumn(11)]
    public float Param3 { get; set; }

    /// <summary>Scalar value loaded from column 12.</summary>
    [ColumnName(nameof(InputNamesEnum.Param4))]
    [LoadColumn(12)]
    public float Param4 { get; set; }

    /// <summary>Scalar value loaded from column 13.</summary>
    [ColumnName(nameof(InputNamesEnum.Param5))]
    [LoadColumn(13)]
    public float Param5 { get; set; }
}
/// <summary>
/// Builds the feature-preparation pipeline: maps "Strategy" to a key-typed
/// "Label", concatenates the requested columns into "Combination", and
/// min-max normalizes that into "Features".
/// </summary>
/// <param name="columns">Names of the input columns to use as features.</param>
/// <returns>The composed, not yet fitted, estimator chain.</returns>
public IEstimator<ITransformer> GetPipeline(IEnumerable<string> columns)
{
    // use property "strategy" as categorizable label
    var labelToKey = Context.Transforms.Conversion.MapValueToKey(
        new[] { new InputOutputColumnPair("Label", "Strategy") });

    // merge properties selected for analysis into "Combination"
    var combineSelected = Context.Transforms.Concatenate("Combination", columns.ToArray());

    // normalize selected properties as "Features"
    var normalizeFeatures = Context.Transforms.NormalizeMinMax(
        new[] { new InputOutputColumnPair("Features", "Combination") });

    return labelToKey
        .Append(combineSelected)
        .Append(normalizeFeatures);
}
/// <summary>
/// Builds the training stage: a LightGBM multiclass trainer followed by a
/// key-to-value mapping from "PredictedLabel" to "Prediction".
/// </summary>
/// <returns>The composed, not yet fitted, estimator chain.</returns>
public IEstimator<ITransformer> GetEstimator()
{
    var trainer = Context.MulticlassClassification.Trainers.LightGbm();

    // Turn the predicted key back into its original label value.
    var keyToLabel = Context.Transforms.Conversion.MapKeyToValue(
        new[] { new InputOutputColumnPair("Prediction", "PredictedLabel") });

    return trainer.Append(keyToLabel);
}
/// <summary>
/// Trains the model on <paramref name="items"/> using only the "Pitch" and
/// "Energy" columns as features and returns the serialized model.
/// </summary>
/// <param name="items">Labeled training rows.</param>
/// <returns>The trained model, serialized by <c>Context.Model.Save</c>.</returns>
public byte[] SaveModel(IEnumerable<MyInputModel> items)
{
    var columns = new[] { "Pitch", "Energy" };
    var estimator = GetEstimator();
    var pipeline = GetPipeline(columns);
    var sourceInputs = Context.Data.LoadFromEnumerable(items);

    // model has ~10 properties, we select only 2 of them (plus the label source)
    var inputs = Context
        .Transforms
        .SelectColumns(columns.Concat(new[] { "Strategy" }).ToArray())
        .Fit(sourceInputs)
        .Transform(sourceInputs);

    // Fit the whole chain once; a separate fit of the feature pipeline is not needed.
    var estimatorModel = pipeline.Append(estimator).Fit(inputs);

    using (var memoryStream = new MemoryStream())
    {
        // Save expects the schema of the data fed INTO the model, i.e. the view it
        // was fitted on, not the schema of the pipeline's transformed output.
        Context.Model.Save(estimatorModel, inputs.Schema, memoryStream);
        return memoryStream.ToArray();
    }
}
/// <summary>
/// Loads a serialized model and runs a prediction for a hard-coded sample
/// whose Pitch and Energy match the "Strategy3" training item.
/// </summary>
/// <param name="predictor">Serialized model bytes to load.</param>
/// <returns>The predicted label for the sample input.</returns>
public string LoadModelAndEstimate(byte[] predictor)
{
    // let's make input identical to Strategy3, but somehow predicted result is still Strategy1
    var input = new MyInputModel
    {
        Pitch = 50,
        Energy = 10,
        // Float literals are required: an int[] initializer does not convert to float[].
        Contrast = new[] { 0f, 0f, 0f, 0f, 0f, 0f },
        Param1 = 0,
        Param2 = 0,
        Param3 = 0,
        Param4 = 0,
        Param5 = 0
    };

    using (var stream = new MemoryStream(predictor))
    {
        var model = Context.Model.Load(stream, out var schema) as TransformerChain<ITransformer>;

        // Diagnostic inspection of the trained predictor; not used for the prediction itself.
        var chain = (model.LastTransformer as IEnumerable<ITransformer>).First() as MulticlassPredictionTransformer<OneVersusAllModelParameters>;
        var chainModel = chain.Model as OneVersusAllModelParameters; // here I see only 3 properties with weights - Pitch, Energy, Label

        // also tried to specify data view schema from the model explicitly for prediction engine
        // var engine = Context.Model.CreatePredictionEngine<MyInputModel, MyOutputModel>(model, schema);

        // The engine is disposable; release it once the single prediction is done.
        using (var engine = Context.Model.CreatePredictionEngine<MyInputModel, MyOutputModel>(model)) // here output schema shows 10+ columns, even though I expect 3
        {
            // Predict returns a MyOutputModel, not a string.
            // NOTE(review): assumes MyOutputModel exposes the "Prediction" column as a
            // string member named Prediction - confirm against its declaration.
            return engine.Predict(input).Prediction;
        }
    }
}
Example
// Three labeled items, one per strategy. Only Pitch and Energy differ between them.
// NOTE(review): the description says the model is trained on three labeled items and
// then queried with a single item identical to Strategy3, so the names testData and
// trainData look swapped - confirm.
var testData = new[]
{
    new MyInputModel
    {
        Strategy = "Strategy1",
        Pitch = 115,
        Energy = 50,
        Contrast = new[] { 0f, 0f, 0f, 0f, 0f, 0f },
        Param1 = 0, Param2 = 0, Param3 = 0, Param4 = 0, Param5 = 0
    },
    new MyInputModel
    {
        Strategy = "Strategy2",
        Pitch = 90,
        Energy = 30,
        Contrast = new[] { 0f, 0f, 0f, 0f, 0f, 0f },
        Param1 = 0, Param2 = 0, Param3 = 0, Param4 = 0, Param5 = 0
    },
    new MyInputModel
    {
        Strategy = "Strategy3",
        Pitch = 50,
        Energy = 10,
        Contrast = new[] { 0f, 0f, 0f, 0f, 0f, 0f },
        Param1 = 0, Param2 = 0, Param3 = 0, Param4 = 0, Param5 = 0
    }
};
// A single item whose values match the "Strategy3" entry.
// NOTE(review): the description says the model is trained on three labeled items and
// then queried with a single item identical to Strategy3, so the names trainData and
// testData look swapped - confirm.
var trainData = new[]
{
    new MyInputModel
    {
        Strategy = "Strategy3",
        Pitch = 50,
        Energy = 10,
        Contrast = new[] { 0f, 0f, 0f, 0f, 0f, 0f },
        Param1 = 0, Param2 = 0, Param3 = 0, Param4 = 0, Param5 = 0
    }
};