Skip to content

Commit d6e34fb

Browse files
yaeldMS authored and eerhardt committed
Remove label requirement for PCA anomaly detector entry point. (dotnet#221)
* Remove label requirement for PCA anomaly detector entry point.
* Fix EntryPointCatalog test.
1 parent bc40bc7 commit d6e34fb

File tree

5 files changed

+287
-19
lines changed

5 files changed

+287
-19
lines changed

ZBaselines/Common/EntryPoints/core_manifest.json

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10585,18 +10585,6 @@
1058510585
"IsNullable": false,
1058610586
"Default": "Features"
1058710587
},
10588-
{
10589-
"Name": "LabelColumn",
10590-
"Type": "String",
10591-
"Desc": "Column to use for labels",
10592-
"Aliases": [
10593-
"lab"
10594-
],
10595-
"Required": false,
10596-
"SortOrder": 3.0,
10597-
"IsNullable": false,
10598-
"Default": "Label"
10599-
},
1060010588
{
1060110589
"Name": "WeightColumn",
1060210590
"Type": "String",
@@ -10727,8 +10715,6 @@
1072710715
}
1072810716
],
1072910717
"InputKind": [
10730-
"ITrainerInputWithWeight",
10731-
"ITrainerInputWithLabel",
1073210718
"ITrainerInput"
1073310719
],
1073410720
"OutputKind": [

src/Microsoft.ML.PCA/PcaTrainer.cs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public sealed class RandomizedPcaTrainer : TrainerBase<RoleMappedData, PcaPredic
4949
internal const string Summary = "This algorithm trains an approximate PCA using Randomized SVD algorithm. "
5050
+ "This PCA can be made into Kernel PCA by using Random Fourier Features transform.";
5151

52-
public class Arguments : LearnerInputBaseWithWeight
52+
public class Arguments : LearnerInputBase
5353
{
5454
[Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k", SortOrder = 50)]
5555
[TGUI(SuggestedSweeps = "10,20,40,80")]
@@ -62,11 +62,14 @@ public class Arguments : LearnerInputBaseWithWeight
6262
public int Oversampling = 20;
6363

6464
[Argument(ArgumentType.AtMostOnce, HelpText = "If enabled, data is centered to be zero mean", ShortName = "center")]
65-
[TlcModule.SweepableDiscreteParam("Center", null, isBool:true)]
65+
[TlcModule.SweepableDiscreteParam("Center", null, isBool: true)]
6666
public bool Center = true;
6767

6868
[Argument(ArgumentType.AtMostOnce, HelpText = "The seed for random number generation", ShortName = "seed")]
6969
public int? Seed;
70+
71+
[Argument(ArgumentType.AtMostOnce, HelpText = "Column to use for example weight", ShortName = "weight", SortOrder = 4, Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly)]
72+
public Optional<string> WeightColumn = Optional<string>.Implicit(DefaultColumnNames.Weight);
7073
}
7174

7275
private int _dimension;
@@ -294,8 +297,7 @@ public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironm
294297

295298
return LearnerEntryPointsUtils.Train<Arguments, CommonOutputs.AnomalyDetectionOutput>(host, input,
296299
() => new RandomizedPcaTrainer(host, input),
297-
() => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn),
298-
() => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn));
300+
getWeight: () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn));
299301
}
300302
}
301303

src/Microsoft.ML/CSharpApi.cs

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,18 @@ public void Add(Microsoft.ML.Trainers.OrdinaryLeastSquaresRegressor input, Micro
550550
_jsonNodes.Add(Serialize("Trainers.OrdinaryLeastSquaresRegressor", input, output));
551551
}
552552

553+
/// <summary>
/// Convenience overload: creates a new <c>PcaAnomalyDetector.Output</c>, passes it together with
/// <paramref name="input"/> to the two-argument <c>Add</c> overload, and returns that output object.
/// </summary>
/// <param name="input">The PCA anomaly detector entry-point input to add.</param>
/// <returns>The newly created output object that was passed to the two-argument overload.</returns>
public Microsoft.ML.Trainers.PcaAnomalyDetector.Output Add(Microsoft.ML.Trainers.PcaAnomalyDetector input)
554+
{
555+
var output = new Microsoft.ML.Trainers.PcaAnomalyDetector.Output();
556+
Add(input, output);
557+
return output;
558+
}
559+
560+
/// <summary>
/// Serializes the <paramref name="input"/>/<paramref name="output"/> pair under the entry-point name
/// "Trainers.PcaAnomalyDetector" and appends the result to <c>_jsonNodes</c>.
/// </summary>
/// <param name="input">The PCA anomaly detector entry-point input.</param>
/// <param name="output">The output object to associate with <paramref name="input"/>.</param>
public void Add(Microsoft.ML.Trainers.PcaAnomalyDetector input, Microsoft.ML.Trainers.PcaAnomalyDetector.Output output)
561+
{
562+
_jsonNodes.Add(Serialize("Trainers.PcaAnomalyDetector", input, output));
563+
}
564+
553565
public Microsoft.ML.Trainers.PoissonRegressor.Output Add(Microsoft.ML.Trainers.PoissonRegressor input)
554566
{
555567
var output = new Microsoft.ML.Trainers.PoissonRegressor.Output();
@@ -1090,6 +1102,18 @@ public void Add(Microsoft.ML.Transforms.OptionalColumnCreator input, Microsoft.M
10901102
_jsonNodes.Add(Serialize("Transforms.OptionalColumnCreator", input, output));
10911103
}
10921104

1105+
/// <summary>
/// Convenience overload: creates a new <c>PcaCalculator.Output</c>, passes it together with
/// <paramref name="input"/> to the two-argument <c>Add</c> overload, and returns that output object.
/// </summary>
/// <param name="input">The PCA calculator entry-point input to add.</param>
/// <returns>The newly created output object that was passed to the two-argument overload.</returns>
public Microsoft.ML.Transforms.PcaCalculator.Output Add(Microsoft.ML.Transforms.PcaCalculator input)
1106+
{
1107+
var output = new Microsoft.ML.Transforms.PcaCalculator.Output();
1108+
Add(input, output);
1109+
return output;
1110+
}
1111+
1112+
/// <summary>
/// Serializes the <paramref name="input"/>/<paramref name="output"/> pair under the entry-point name
/// "Transforms.PcaCalculator" and appends the result to <c>_jsonNodes</c>.
/// </summary>
/// <param name="input">The PCA calculator entry-point input.</param>
/// <param name="output">The output object to associate with <paramref name="input"/>.</param>
public void Add(Microsoft.ML.Transforms.PcaCalculator input, Microsoft.ML.Transforms.PcaCalculator.Output output)
1113+
{
1114+
_jsonNodes.Add(Serialize("Transforms.PcaCalculator", input, output));
1115+
}
1116+
10931117
public Microsoft.ML.Transforms.PredictedLabelColumnOriginalValueConverter.Output Add(Microsoft.ML.Transforms.PredictedLabelColumnOriginalValueConverter input)
10941118
{
10951119
var output = new Microsoft.ML.Transforms.PredictedLabelColumnOriginalValueConverter.Output();
@@ -6739,6 +6763,97 @@ public OrdinaryLeastSquaresRegressorPipelineStep(Output output)
67396763
}
67406764
}
67416765

6766+
namespace Trainers
6767+
{
6768+
6769+
/// <summary>
6770+
/// Train a PCA anomaly model.
6771+
/// </summary>
/// <remarks>
/// Implements <c>ITrainerInput</c> and <c>ILearningPipelineItem</c>. The class declares no label column
/// property; the weight column is exposed as an <c>Optional&lt;string&gt;</c>.
/// </remarks>
6772+
public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
6773+
{
6774+
6775+
6776+
/// <summary>
6777+
/// The number of components in the PCA. Defaults to 20; sweep candidates are 10, 20, 40, 80.
6778+
/// </summary>
6779+
[TlcModule.SweepableDiscreteParamAttribute("Rank", new object[]{10, 20, 40, 80})]
6780+
public int Rank { get; set; } = 20;
6781+
6782+
/// <summary>
6783+
/// Oversampling parameter for randomized PCA training. Defaults to 20; sweep candidates are 10, 20, 40.
6784+
/// </summary>
6785+
[TlcModule.SweepableDiscreteParamAttribute("Oversampling", new object[]{10, 20, 40})]
6786+
public int Oversampling { get; set; } = 20;
6787+
6788+
/// <summary>
6789+
/// If enabled, data is centered to be zero mean. Defaults to true.
6790+
/// </summary>
6791+
[TlcModule.SweepableDiscreteParamAttribute("Center", new object[]{false, true})]
6792+
public bool Center { get; set; } = true;
6793+
6794+
/// <summary>
6795+
/// The seed for random number generation. Nullable, with no initializer.
6796+
/// </summary>
6797+
public int? Seed { get; set; }
6798+
6799+
/// <summary>
6800+
/// Column to use for example weight. No initializer is assigned here.
6801+
/// </summary>
6802+
public Microsoft.ML.Runtime.EntryPoints.Optional<string> WeightColumn { get; set; }
6803+
6804+
/// <summary>
6805+
/// The data to be used for training. Overwritten by <c>ApplyStep</c> with the previous step's data.
6806+
/// </summary>
6807+
public Var<Microsoft.ML.Runtime.Data.IDataView> TrainingData { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();
6808+
6809+
/// <summary>
6810+
/// Column to use for features. Defaults to "Features".
6811+
/// </summary>
6812+
public string FeatureColumn { get; set; } = "Features";
6813+
6814+
/// <summary>
6815+
/// Normalize option for the feature column. Defaults to <c>Models.NormalizeOption.Auto</c>.
6816+
/// </summary>
6817+
public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto;
6818+
6819+
/// <summary>
6820+
/// Whether learner should cache input training data. Defaults to <c>Models.CachingOptions.Auto</c>.
6821+
/// </summary>
6822+
public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto;
6823+
6824+
6825+
/// <summary>
/// Output of the PCA anomaly detector entry point; carries the trained predictor model variable.
/// </summary>
public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IAnomalyDetectionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput
6826+
{
6827+
/// <summary>
6828+
/// The trained model
6829+
/// </summary>
6830+
public Var<Microsoft.ML.Runtime.EntryPoints.IPredictorModel> PredictorModel { get; set; } = new Var<Microsoft.ML.Runtime.EntryPoints.IPredictorModel>();
6831+
6832+
}
6833+
/// <summary>
/// Wires this trainer into a learning pipeline: takes the data of <paramref name="previousStep"/> as
/// <c>TrainingData</c>, adds this item to <paramref name="experiment"/>, and returns a predictor step
/// exposing the resulting model variable.
/// </summary>
/// <param name="previousStep">The preceding pipeline step; must be an <c>ILearningPipelineDataStep</c>.</param>
/// <param name="experiment">The experiment this item is added to.</param>
/// <returns>A step whose <c>Model</c> is the <c>PredictorModel</c> of the output returned by <c>experiment.Add</c>.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when <paramref name="previousStep"/> is not an <c>ILearningPipelineDataStep</c>.
/// </exception>
public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
6834+
{
6835+
// Only a data-producing step can feed this item; anything else is rejected.
if (!(previousStep is ILearningPipelineDataStep dataStep))
6836+
{
6837+
throw new InvalidOperationException($"{ nameof(PcaAnomalyDetector)} only supports an { nameof(ILearningPipelineDataStep)} as an input.");
6838+
}
6839+
6840+
// TrainingData must be assigned before Add(this), which receives this object as its input.
TrainingData = dataStep.Data;
6841+
Output output = experiment.Add(this);
6842+
return new PcaAnomalyDetectorPipelineStep(output);
6843+
}
6844+
6845+
/// <summary>
/// Pipeline step returned by <c>ApplyStep</c>; exposes the predictor model variable taken from the output.
/// </summary>
private class PcaAnomalyDetectorPipelineStep : ILearningPipelinePredictorStep
6846+
{
6847+
public PcaAnomalyDetectorPipelineStep(Output output)
6848+
{
6849+
Model = output.PredictorModel;
6850+
}
6851+
6852+
public Var<IPredictorModel> Model { get; }
6853+
}
6854+
}
6855+
}
6856+
67426857
namespace Trainers
67436858
{
67446859

@@ -11417,6 +11532,170 @@ public OptionalColumnCreatorPipelineStep(Output output)
1141711532
}
1141811533
}
1141911534

11535+
namespace Transforms
11536+
{
11537+
11538+
/// <summary>
/// Column definition used by <c>PcaCalculator.Column</c>: a name/source pair plus optional PCA settings.
/// </summary>
/// <remarks>
/// Rank, Oversampling, Center and Seed are nullable here.
/// NOTE(review): presumably an unset value falls back to the corresponding PcaCalculator-level setting; confirm.
/// </remarks>
public sealed partial class PcaTransformColumn : OneToOneColumn<PcaTransformColumn>, IOneToOneColumn
11539+
{
11540+
/// <summary>
11541+
/// The name of the weight column
11542+
/// </summary>
11543+
public string WeightColumn { get; set; }
11544+
11545+
/// <summary>
11546+
/// The number of components in the PCA
11547+
/// </summary>
11548+
public int? Rank { get; set; }
11549+
11550+
/// <summary>
11551+
/// Oversampling parameter for randomized PCA training
11552+
/// </summary>
11553+
public int? Oversampling { get; set; }
11554+
11555+
/// <summary>
11556+
/// If enabled, data is centered to be zero mean
11557+
/// </summary>
11558+
public bool? Center { get; set; }
11559+
11560+
/// <summary>
11561+
/// The seed for random number generation
11562+
/// </summary>
11563+
public int? Seed { get; set; }
11564+
11565+
/// <summary>
11566+
/// Name of the new column
11567+
/// </summary>
11568+
public string Name { get; set; }
11569+
11570+
/// <summary>
11571+
/// Name of the source column
11572+
/// </summary>
11573+
public string Source { get; set; }
11574+
11575+
}
11576+
11577+
/// <summary>
11578+
/// Input for the "Transforms.PcaCalculator" entry point: a PCA transform over one or more columns.
11579+
/// </summary>
/// <remarks>
/// NOTE(review): the generated summary previously read "Train an PCA Anomaly model.", which appears to be
/// copied from the anomaly detector. This class implements <c>ITransformInput</c> and its output is a
/// transformed dataset plus a transform model, so the source description should be corrected as well.
/// </remarks>
11580+
public sealed partial class PcaCalculator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
11581+
{
11582+
11583+
/// <summary>
/// Creates a calculator with no columns; <c>Column</c> stays null until a column is added.
/// </summary>
public PcaCalculator()
11584+
{
11585+
}
11586+
11587+
/// <summary>
/// Creates a calculator and calls <c>AddColumn(source)</c> once for each entry of
/// <paramref name="inputColumns"/>. A null array adds nothing.
/// </summary>
public PcaCalculator(params string[] inputColumns)
11588+
{
11589+
if (inputColumns != null)
11590+
{
11591+
foreach (string input in inputColumns)
11592+
{
11593+
AddColumn(input);
11594+
}
11595+
}
11596+
}
11597+
11598+
/// <summary>
/// Creates a calculator from (source, name) tuples: <c>Item1</c> is passed as the source column and
/// <c>Item2</c> as the new column name. A null array adds nothing.
/// </summary>
public PcaCalculator(params ValueTuple<string, string>[] inputOutputColumns)
11599+
{
11600+
if (inputOutputColumns != null)
11601+
{
11602+
foreach (ValueTuple<string, string> inputOutput in inputOutputColumns)
11603+
{
11604+
// AddColumn takes (name, source), so the tuple items are swapped on purpose.
AddColumn(inputOutput.Item2, inputOutput.Item1);
11605+
}
11606+
}
11607+
}
11608+
11609+
/// <summary>
/// Appends a column created from <paramref name="source"/> alone. <c>Column</c> is replaced by a new
/// array containing the existing entries (if any) followed by the new one.
/// </summary>
public void AddColumn(string source)
11610+
{
11611+
var list = Column == null ? new List<Transforms.PcaTransformColumn>() : new List<Transforms.PcaTransformColumn>(Column);
11612+
list.Add(OneToOneColumn<Transforms.PcaTransformColumn>.Create(source));
11613+
Column = list.ToArray();
11614+
}
11615+
11616+
/// <summary>
/// Appends a column created from <paramref name="name"/> and <paramref name="source"/>. <c>Column</c> is
/// replaced by a new array containing the existing entries (if any) followed by the new one.
/// </summary>
public void AddColumn(string name, string source)
11617+
{
11618+
var list = Column == null ? new List<Transforms.PcaTransformColumn>() : new List<Transforms.PcaTransformColumn>(Column);
11619+
list.Add(OneToOneColumn<Transforms.PcaTransformColumn>.Create(name, source));
11620+
Column = list.ToArray();
11621+
}
11622+
11623+
11624+
/// <summary>
11625+
/// New column definition(s) (optional form: name:src)
11626+
/// </summary>
11627+
public Transforms.PcaTransformColumn[] Column { get; set; }
11628+
11629+
/// <summary>
11630+
/// The name of the weight column
11631+
/// </summary>
11632+
public string WeightColumn { get; set; }
11633+
11634+
/// <summary>
11635+
/// The number of components in the PCA. Defaults to 20.
11636+
/// </summary>
11637+
public int Rank { get; set; } = 20;
11638+
11639+
/// <summary>
11640+
/// Oversampling parameter for randomized PCA training. Defaults to 20.
11641+
/// </summary>
11642+
public int Oversampling { get; set; } = 20;
11643+
11644+
/// <summary>
11645+
/// If enabled, data is centered to be zero mean. Defaults to true.
11646+
/// </summary>
11647+
public bool Center { get; set; } = true;
11648+
11649+
/// <summary>
11650+
/// The seed for random number generation. Non-nullable here, with no explicit initializer.
11651+
/// </summary>
11652+
public int Seed { get; set; }
11653+
11654+
/// <summary>
11655+
/// Input dataset. Overwritten by <c>ApplyStep</c> with the previous step's data.
11656+
/// </summary>
11657+
public Var<Microsoft.ML.Runtime.Data.IDataView> Data { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();
11658+
11659+
11660+
/// <summary>
/// Output of the PCA calculator entry point: the transformed dataset and the transform model variables.
/// </summary>
public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput
11661+
{
11662+
/// <summary>
11663+
/// Transformed dataset
11664+
/// </summary>
11665+
public Var<Microsoft.ML.Runtime.Data.IDataView> OutputData { get; set; } = new Var<Microsoft.ML.Runtime.Data.IDataView>();
11666+
11667+
/// <summary>
11668+
/// Transform model
11669+
/// </summary>
11670+
public Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel> Model { get; set; } = new Var<Microsoft.ML.Runtime.EntryPoints.ITransformModel>();
11671+
11672+
}
11673+
/// <summary>
/// Wires this transform into a learning pipeline: takes the data of <paramref name="previousStep"/> as
/// <c>Data</c>, adds this item to <paramref name="experiment"/>, and returns a data step exposing the
/// transformed data and the transform model.
/// </summary>
/// <param name="previousStep">The preceding pipeline step; must be an <c>ILearningPipelineDataStep</c>.</param>
/// <param name="experiment">The experiment this item is added to.</param>
/// <returns>A step built from the output returned by <c>experiment.Add</c>.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when <paramref name="previousStep"/> is not an <c>ILearningPipelineDataStep</c>.
/// </exception>
public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment)
11674+
{
11675+
// Only a data-producing step can feed this item; anything else is rejected.
if (!(previousStep is ILearningPipelineDataStep dataStep))
11676+
{
11677+
throw new InvalidOperationException($"{ nameof(PcaCalculator)} only supports an { nameof(ILearningPipelineDataStep)} as an input.");
11678+
}
11679+
11680+
// Data must be assigned before Add(this), which receives this object as its input.
Data = dataStep.Data;
11681+
Output output = experiment.Add(this);
11682+
return new PcaCalculatorPipelineStep(output);
11683+
}
11684+
11685+
/// <summary>
/// Pipeline step returned by <c>ApplyStep</c>; exposes the output's data and model variables.
/// </summary>
private class PcaCalculatorPipelineStep : ILearningPipelineDataStep
11686+
{
11687+
public PcaCalculatorPipelineStep(Output output)
11688+
{
11689+
Data = output.OutputData;
11690+
Model = output.Model;
11691+
}
11692+
11693+
public Var<IDataView> Data { get; }
11694+
public Var<ITransformModel> Model { get; }
11695+
}
11696+
}
11697+
}
11698+
1142011699
namespace Transforms
1142111700
{
1142211701

test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1086,7 +1086,7 @@ public void EntryPointLogisticRegressionMultiClass()
10861086
[Fact]
10871087
public void EntryPointPcaAnomaly()
10881088
{
1089-
TestEntryPointRoutine("MNIST.Train.0-class.tiny.txt", "Trainers.PcaAnomalyDetector");
1089+
TestEntryPointRoutine("MNIST.Train.0-class.tiny.txt", "Trainers.PcaAnomalyDetector", "col=Features:R4:1-784");
10901090
}
10911091

10921092
[Fact]

test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
</PropertyGroup>
55

66
<ItemGroup>
7+
<ProjectReference Include="..\..\src\Microsoft.ML.PCA\Microsoft.ML.PCA.csproj" />
78
<ProjectReference Include="..\..\src\Microsoft.ML.PipelineInference\Microsoft.ML.PipelineInference.csproj" />
89
<ProjectReference Include="..\..\src\Microsoft.ML.StandardLearners\Microsoft.ML.StandardLearners.csproj" />
910
<ProjectReference Include="..\..\src\Microsoft.ML\Microsoft.ML.csproj" />

0 commit comments

Comments
 (0)