Skip to content

RobustScalingNormalizer entrypoint added #5310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions src/Microsoft.ML.Data/Transforms/NormalizeColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
// Loadable-class registration for the binning normalizer: pairs NormalizeTransform with
// NormalizeTransform.BinArguments under SignatureDataTransform.
[assembly: LoadableClass(NormalizeTransform.BinNormalizerSummary, typeof(IDataTransform), typeof(NormalizeTransform), typeof(NormalizeTransform.BinArguments), typeof(SignatureDataTransform),
NormalizeTransform.BinNormalizerUserName, "BinNormalizer", NormalizeTransform.BinNormalizerShortName)]

// Loadable-class registration for the robust scaling normalizer: pairs NormalizeTransform with
// NormalizeTransform.RobustScalingArguments under SignatureDataTransform, following the same
// shape as the BinNormalizer registration.
[assembly: LoadableClass(NormalizeTransform.RobustScalingNormalizerSummary, typeof(IDataTransform), typeof(NormalizeTransform), typeof(NormalizeTransform.RobustScalingArguments), typeof(SignatureDataTransform),
NormalizeTransform.RobustScalingNormalizerUserName, "RobustScalingNormalizer", NormalizeTransform.RobustScalingNormalizerShortName)]

// Loadable-class registration for the affine column function under SignatureLoadColumnFunction,
// using AffineNormSerializationUtils.LoaderSignature.
[assembly: LoadableClass(typeof(NormalizeTransform.AffineColumnFunction), null, typeof(SignatureLoadColumnFunction),
"Affine Normalizer", AffineNormSerializationUtils.LoaderSignature)]

Expand Down Expand Up @@ -266,18 +269,22 @@ public sealed class RobustScalingArguments : AffineArgumentsBase
// Summary strings: human-readable descriptions of what each normalizer does. The robust scaling
// summary is passed to both the LoadableClass registration and the entry point's Desc.
internal const string BinNormalizerSummary = "The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins.";
internal const string SupervisedBinNormalizerSummary = "Similar to BinNormalizer, but calculates bins based on correlation with the label column, not equi-density. "
+ "The new value is bin_number / number_of_bins.";
internal const string RobustScalingNormalizerSummary = "Optionally centers the data and scales based on the range of data and the quantile min and max values provided. "
+ "This method is more robust to outliers.";

// User names: display names for each normalizer.
internal const string MinMaxNormalizerUserName = "Min-Max Normalizer";
internal const string MeanVarNormalizerUserName = "MeanVar Normalizer";
internal const string LogMeanVarNormalizerUserName = "LogMeanVar Normalizer";
internal const string BinNormalizerUserName = "Binning Normalizer";
internal const string SupervisedBinNormalizerUserName = "Supervised Binning Normalizer";
internal const string RobustScalingNormalizerUserName = "Robust Scaling Normalizer";

// Short names: abbreviated identifiers for each normalizer.
internal const string MinMaxNormalizerShortName = "MinMax";
internal const string MeanVarNormalizerShortName = "MeanVar";
internal const string LogMeanVarNormalizerShortName = "LogMeanVar";
internal const string BinNormalizerShortName = "Bin";
internal const string SupervisedBinNormalizerShortName = "SupBin";
internal const string RobustScalingNormalizerShortName = "RobScal";

/// <summary>
/// A helper method to create a MinMax normalizer.
Expand Down Expand Up @@ -373,6 +380,28 @@ internal static IDataTransform Create(IHostEnvironment env, BinArguments args, I
return normalizer.Fit(input).MakeDataTransform(input);
}

/// <summary>
/// Factory method corresponding to SignatureDataTransform for the robust scaling normalizer.
/// Translates each column in <paramref name="args"/> into robust scaling column options,
/// fits a <see cref="NormalizingEstimator"/> on <paramref name="input"/>, and returns the
/// fitted normalizer as a data transform over that same input.
/// </summary>
/// <param name="env">Host environment; checked for null before anything else.</param>
/// <param name="args">Robust scaling arguments; both the object and its columns must be non-null.</param>
/// <param name="input">Data used for fitting and as the source of the returned transform.</param>
internal static IDataTransform Create(IHostEnvironment env, RobustScalingArguments args, IDataView input)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(args, nameof(args));
    env.CheckValue(args.Columns, nameof(args.Columns));

    // The source falls back to the column's own name, and a per-column example cap takes
    // precedence over the transform-wide one. Centering and quantile bounds are only
    // specified at the transform level.
    var columnOptions =
        (from column in args.Columns
         select new NormalizingEstimator.RobustScalingColumnOptions(
             column.Name,
             column.Source ?? column.Name,
             column.MaximumExampleCount ?? args.MaximumExampleCount,
             args.CenterData,
             args.QuantileMin,
             args.QuantileMax)).ToArray();

    var fitted = new NormalizingEstimator(env, columnOptions).Fit(input);
    return fitted.MakeDataTransform(input);
}

internal abstract partial class AffineColumnFunction : IColumnFunction
{
protected readonly IHost Host;
Expand Down
12 changes: 12 additions & 0 deletions src/Microsoft.ML.Data/Transforms/NormalizeUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,18 @@ public static CommonOutputs.TransformOutput Bin(IHostEnvironment env, NormalizeT
return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, input.Data), OutputData = xf };
}

/// <summary>
/// Entry point "Transforms.RobustScalingNormalizer": validates the inputs, builds the robust
/// scaling transform over <c>input.Data</c> via <see cref="NormalizeTransform"/>, and returns
/// both the transformed data and a transform model wrapping it.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="input">Robust scaling arguments, including the input data view.</param>
[TlcModule.EntryPoint(
    Name = "Transforms.RobustScalingNormalizer",
    Desc = NormalizeTransform.RobustScalingNormalizerSummary,
    UserName = NormalizeTransform.RobustScalingNormalizerUserName,
    ShortName = NormalizeTransform.RobustScalingNormalizerShortName)]
public static CommonOutputs.TransformOutput RobustScaling(IHostEnvironment env, NormalizeTransform.RobustScalingArguments input)
{
    Contracts.CheckValue(env, nameof(env));
    var robustHost = env.Register("RobustScaling");
    robustHost.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(robustHost, input);

    var transform = NormalizeTransform.Create(robustHost, input, input.Data);

    // The model is built with the caller's environment (not the registered host),
    // the same way the Bin entry point does it.
    var result = new CommonOutputs.TransformOutput();
    result.Model = new TransformModelImpl(env, transform, input.Data);
    result.OutputData = transform;
    return result;
}

[TlcModule.EntryPoint(Name = "Transforms.ConditionalNormalizer", Desc = "Normalize the columns only if needed", UserName = "Normalize If Needed")]
public static CommonOutputs.MacroOutput<CommonOutputs.TransformOutput> IfNeeded(
IHostEnvironment env,
Expand Down
1 change: 1 addition & 0 deletions test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ Transforms.PcaCalculator PCA is a dimensionality-reduction transform which compu
Transforms.PermutationFeatureImportance Permutation Feature Importance (PFI) Microsoft.ML.Transforms.PermutationFeatureImportanceEntryPoints PermutationFeatureImportance Microsoft.ML.Transforms.PermutationFeatureImportanceArguments Microsoft.ML.Transforms.PermutationFeatureImportanceOutput
Transforms.PredictedLabelColumnOriginalValueConverter Transforms a predicted label column to its original values, unless it is of type bool. Microsoft.ML.EntryPoints.FeatureCombiner ConvertPredictedLabel Microsoft.ML.EntryPoints.FeatureCombiner+PredictedLabelInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.RandomNumberGenerator Adds a column with a generated number sequence. Microsoft.ML.Transforms.RandomNumberGenerator Generate Microsoft.ML.Transforms.GenerateNumberTransform+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.RobustScalingNormalizer Optionally centers the data and scales based on the range of data and the quantile min and max values provided. This method is more robust to outliers. Microsoft.ML.Data.Normalize RobustScaling Microsoft.ML.Transforms.NormalizeTransform+RobustScalingArguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.RowRangeFilter Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values. Microsoft.ML.EntryPoints.SelectRows FilterByRange Microsoft.ML.Transforms.RangeFilter+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.RowSkipAndTakeFilter Allows limiting input to a subset of rows at an optional offset. Can be used to implement data paging. Microsoft.ML.EntryPoints.SelectRows SkipAndTakeFilter Microsoft.ML.Transforms.SkipTakeFilter+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.RowSkipFilter Allows limiting input to a subset of rows by skipping a number of rows. Microsoft.ML.EntryPoints.SelectRows SkipFilter Microsoft.ML.Transforms.SkipTakeFilter+SkipOptions Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Expand Down
160 changes: 160 additions & 0 deletions test/BaselineOutput/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -22947,6 +22947,166 @@
"ITransformOutput"
]
},
{
"Name": "Transforms.RobustScalingNormalizer",
"Desc": "Optionally centers the data and scales based on the range of data and the quantile min and max values provided. This method is more robust to outliers.",
"FriendlyName": "Robust Scaling Normalizer",
"ShortName": "RobScal",
"Inputs": [
{
"Name": "CenterData",
"Type": "Bool",
"Desc": "Should the data be centered around 0",
"Aliases": [
"center"
],
"Required": false,
"SortOrder": 1.0,
"IsNullable": false,
"Default": true
},
{
"Name": "Column",
"Type": {
"Kind": "Array",
"ItemType": {
"Kind": "Struct",
"Fields": [
{
"Name": "FixZero",
"Type": "Bool",
"Desc": "Whether to map zero to zero, preserving sparsity",
"Aliases": [
"zero"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
},
{
"Name": "MaxTrainingExamples",
"Type": "Int",
"Desc": "Max number of examples used to train the normalizer",
"Aliases": [
"maxtrain"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
},
{
"Name": "Name",
"Type": "String",
"Desc": "Name of the new column",
"Aliases": [
"name"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": null
},
{
"Name": "Source",
"Type": "String",
"Desc": "Name of the source column",
"Aliases": [
"src"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": null
}
]
}
},
"Desc": "New column definition(s) (optional form: name:src)",
"Aliases": [
"col"
],
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
"Name": "Data",
"Type": "DataView",
"Desc": "Input dataset",
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
"Name": "QuantileMin",
"Type": "UInt",
"Desc": "Minimum quantile value. Defaults to 25",
"Aliases": [
"qmin"
],
"Required": false,
"SortOrder": 2.0,
"IsNullable": false,
"Default": 25
},
{
"Name": "QuantileMax",
"Type": "UInt",
"Desc": "Maximum quantile value. Defaults to 75",
"Aliases": [
"qmax"
],
"Required": false,
"SortOrder": 3.0,
"IsNullable": false,
"Default": 75
},
{
"Name": "FixZero",
"Type": "Bool",
"Desc": "Whether to map zero to zero, preserving sparsity",
"Aliases": [
"zero"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": true
},
{
"Name": "MaxTrainingExamples",
"Type": "Int",
"Desc": "Max number of examples used to train the normalizer",
"Aliases": [
"maxtrain"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": 1000000000
}
],
"Outputs": [
{
"Name": "OutputData",
"Type": "DataView",
"Desc": "Transformed dataset"
},
{
"Name": "Model",
"Type": "TransformModel",
"Desc": "Transform model"
}
],
"InputKind": [
"ITransformInput"
],
"OutputKind": [
"ITransformOutput"
]
},
{
"Name": "Transforms.RowRangeFilter",
"Desc": "Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values.",
Expand Down
55 changes: 55 additions & 0 deletions test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2329,6 +2329,56 @@ public void EntryPointParseColumns()
cmd.Run();
}

/// <summary>
/// Runs an entry-point graph that loads the breast cancer training file with
/// Data.CustomTextLoader and pipes it through Transforms.RobustScalingNormalizer on the
/// Features column. The test passes when the graph executes without throwing.
/// </summary>
[Fact]
public void RobustScalerNormalizerEntryPoint()
{
    var trainFile = GetDataPath(TestDatasets.breastCancer.trainFilename);
    var idvFile = DeleteOutputPath("data.idv");

    // Doubled braces are string.Format escapes; {0} is the input file and {1} the output file.
    string graphJson = string.Format(@"
{{
'Nodes': [
{{
'Name': 'Data.CustomTextLoader',
'Inputs': {{
'InputFile': '$file1'
}},
'Outputs': {{
'Data': '$data1'
}}
}},
{{
'Name': 'Transforms.RobustScalingNormalizer',
'Inputs': {{
'Data': '$data1',
'Column': [
{{
'Name': 'Features',
'Source': 'Features'
}}
]
}},
'Outputs': {{
'OutputData': '$data2'
}}
}}
],
'Inputs' : {{
'file1' : '{0}'
}},
'Outputs' : {{
'data2' : '{1}'
}}
}}", EscapePath(trainFile), EscapePath(idvFile));

    var graphFile = DeleteOutputPath("graph.json");
    File.WriteAllLines(graphFile, new[] { graphJson });

    // Execute the graph from disk, exactly as the command-line tool would.
    var command = new ExecuteGraphCommand(Env, new ExecuteGraphCommand.Arguments() { GraphPath = graphFile });
    command.Run();
}

[Fact]
public void EntryPointCountFeatures()
{
Expand Down Expand Up @@ -2935,6 +2985,11 @@ public void EntryPointConvert()
'Name': 'Label2',
'Source': 'LB'
},
{
'Name': 'Label3',
'Source': 'LB',
'Type': 'TX'
},
{
'Name': 'Feat',
'Source': 'FT',
Expand Down