Skip to content

Commit

Permalink
Auto.ML: Fix issue when parsing float string fails on pl-PL culture set using Regression Experiment (#5163)
Browse files Browse the repository at this point in the history

* Fix issue when parsing float string fails on pl-PL culture set

* Added InvariantCulture float parsing as per CodeReview request

* Update src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs

Co-authored-by: Justin Ormont <justinormont@users.noreply.github.com>

* Update Parameters.cs

* Added PL test

* Added multiple cultures

* debugging CI failure

* Debug runSpecific

* Revert "Debug runSpecific"

This reverts commit 95b7280.

* Removed LightGBM and addressed comments

* Increased time

* Increase time

* Increased time

Co-authored-by: Justin Ormont <justinormont@users.noreply.github.com>
Co-authored-by: Antonio Velazquez <anvelazq@microsoft.com>
  • Loading branch information
3 people authored Oct 30, 2020
1 parent a9ab7fc commit 6ccf479
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 18 deletions.
2 changes: 1 addition & 1 deletion src/Microsoft.ML.AutoML/Sweepers/Parameters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ public LongParameterValue(string name, long value)
{
_name = name;
_value = value;
_valueText = _value.ToString("D");
_valueText = _value.ToString("D", CultureInfo.InvariantCulture);
}

public bool Equals(IParameterValue other)
Expand Down
11 changes: 9 additions & 2 deletions src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System;
using System.Collections.Generic;
using System.Globalization;
using Microsoft.ML.Internal.CpuMath;

namespace Microsoft.ML.AutoML
Expand Down Expand Up @@ -98,13 +99,15 @@ public static float[] ParameterSetAsFloatArray(IValueGenerator[] sweepParams, Pa
}
else if (sweepParam is LongValueGenerator lvg)
{
var longValue = GetIfIParameterValueOfT<long>(pset) ?? long.Parse(pset.ValueText, CultureInfo.InvariantCulture);
// Normalizing all numeric parameters to [0,1] range.
result.Add(lvg.NormalizeValue(new LongParameterValue(pset.Name, long.Parse(pset.ValueText))));
result.Add(lvg.NormalizeValue(new LongParameterValue(pset.Name, longValue)));
}
else if (sweepParam is FloatValueGenerator fvg)
{
var floatValue = GetIfIParameterValueOfT<float>(pset) ?? float.Parse(pset.ValueText, CultureInfo.InvariantCulture);
// Normalizing all numeric parameters to [0,1] range.
result.Add(fvg.NormalizeValue(new FloatParameterValue(pset.Name, float.Parse(pset.ValueText))));
result.Add(fvg.NormalizeValue(new FloatParameterValue(pset.Name, floatValue)));
}
else
{
Expand All @@ -115,6 +118,10 @@ public static float[] ParameterSetAsFloatArray(IValueGenerator[] sweepParams, Pa
return result.ToArray();
}

/// <summary>
/// Extracts the strongly typed value from <paramref name="parameterValue"/> when it
/// implements <see cref="IParameterValue{T}"/>.
/// </summary>
/// <typeparam name="T">The value type to look for.</typeparam>
/// <param name="parameterValue">The parameter value to inspect.</param>
/// <returns>
/// The typed value when <paramref name="parameterValue"/> is an
/// <see cref="IParameterValue{T}"/>; otherwise <c>null</c>.
/// </returns>
private static T? GetIfIParameterValueOfT<T>(IParameterValue parameterValue)
    where T : struct
{
    // Guard: only a matching generic parameter value can supply a typed value.
    if (parameterValue is IParameterValue<T> typed)
    {
        return typed.Value;
    }

    // No typed value available; a null result signals this to the caller.
    return null;
}

public static ParameterSet FloatArrayAsParameterSet(IValueGenerator[] sweepParams, float[] array, bool expandedCategoricals = true)
{
Runtime.Contracts.Assert(array.Length == sweepParams.Length);
Expand Down
66 changes: 51 additions & 15 deletions test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Globalization;
using System.Linq;
using System.Threading;
using Microsoft.ML.Data;
using Microsoft.ML.TestFramework;
using Microsoft.ML.TestFramework.Attributes;
Expand Down Expand Up @@ -102,22 +105,55 @@ private void Context_Log(object sender, LoggingEventArgs e)
//throw new NotImplementedException();
}

[Fact]
public void AutoFitRegressionTest()
[Theory]
[InlineData("en-US")]
[InlineData("ar-SA")]
[InlineData("pl-PL")]
public void AutoFitRegressionTest(string culture)
{
var context = new MLContext(1);
var dataPath = DatasetUtil.GetMlNetGeneratedRegressionDataset();
var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.MlNetGeneratedRegressionLabel);
var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
var trainData = textLoader.Load(dataPath);
var validationData = context.Data.TakeRows(trainData, 20);
trainData = context.Data.SkipRows(trainData, 20);
var result = context.Auto()
.CreateRegressionExperiment(0)
.Execute(trainData, validationData,
new ColumnInformation() { LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel });
var originalCulture = Thread.CurrentThread.CurrentCulture;
try
{
Thread.CurrentThread.CurrentCulture = new CultureInfo(culture);

// If users run AutoML with a different locale, sometimes
// the sweeper encounters problems when parsing some strings.
// So testing in another culture is necessary.
// Furthermore, these issues might only occur after ~70
// iterations, so more experiment time is needed for this to
// occur.
uint experimentTime = (uint) (culture == "en-US" ? 0 : 180);

var experimentSettings = new RegressionExperimentSettings { MaxExperimentTimeInSeconds = experimentTime};
if (!Environment.Is64BitProcess)
{
// LightGBM isn't available on x86 machines
experimentSettings.Trainers.Remove(RegressionTrainer.LightGbm);
}

var context = new MLContext(1);
var dataPath = DatasetUtil.GetMlNetGeneratedRegressionDataset();
var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.MlNetGeneratedRegressionLabel);
var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
var trainData = textLoader.Load(dataPath);
var validationData = context.Data.TakeRows(trainData, 20);
trainData = context.Data.SkipRows(trainData, 20);
var result = context.Auto()
.CreateRegressionExperiment(experimentSettings)
.Execute(trainData, validationData,
new ColumnInformation() { LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel });

Assert.True(result.RunDetails.Max(i => i.ValidationMetrics.RSquared > 0.9));

// Ensure experimentTime allows enough iterations to fully test the internationalization code
// If the below assertion fails, increase the experiment time so the number of iterations is met
Assert.True(culture == "en-US" || result.RunDetails.Count() >= 75, $"RunDetails.Count() = {result.RunDetails.Count()}, below 75");

Assert.True(result.RunDetails.Max(i => i.ValidationMetrics.RSquared > 0.9));
}
finally
{
Thread.CurrentThread.CurrentCulture = originalCulture;
}
}

[LightGBMFact]
Expand Down Expand Up @@ -351,4 +387,4 @@ private TextLoader.Options GetLoaderArgsRank(string labelColumnName, string grou
};
}
}
}
}

0 comments on commit 6ccf479

Please sign in to comment.