Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addresses #4226. Fixes a problem when loading NormalizerTransformer from disk. #4321

Merged
merged 22 commits into from
Oct 28, 2019
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
4d7609e
Fixed problem when saving and loading normalizer transformers that wo…
antoniovs1029 Oct 8, 2019
89a54ef
Added test of normalizer loaded from disk for a multidimensional vector
antoniovs1029 Oct 9, 2019
7217cbf
Update test case
antoniovs1029 Oct 9, 2019
50baf55
Updated test
antoniovs1029 Oct 9, 2019
0406afd
Cleaning up
antoniovs1029 Oct 9, 2019
0699296
Added test for backward compatibility with normalizer transformer wit…
antoniovs1029 Oct 10, 2019
0982fb4
Added TODO comment to test
antoniovs1029 Oct 11, 2019
7f53c82
Changed the order of saving and loading ItemKind byte
antoniovs1029 Oct 11, 2019
d5639e7
Updating dependency on models repo to use one model in a backwardcomp…
antoniovs1029 Oct 11, 2019
da59714
Fixed test that checks backward compatibility of normalizer transformer
antoniovs1029 Oct 12, 2019
bc52016
Moved assert statement and removed redundant cast
antoniovs1029 Oct 12, 2019
5d61c22
Avoid loading column 'float0' which was actually not used by the tran…
antoniovs1029 Oct 14, 2019
662bb9d
Fixed indentation
antoniovs1029 Oct 14, 2019
0628770
Added test reproducing the original scenario
antoniovs1029 Oct 14, 2019
8975755
Changed visibility of classes made for test
antoniovs1029 Oct 14, 2019
58916b8
Minor fixes in the test case
antoniovs1029 Oct 15, 2019
87364c2
Changed "Fact" to "OnnxFact" on test from DnnImageFeaturizerTest.cs
antoniovs1029 Oct 17, 2019
2ac5b8e
Merge branch 'master' of https://github.com/dotnet/machinelearning in…
antoniovs1029 Oct 24, 2019
db96aaa
Merge remote-tracking branch 'upstream/master' into is04normalizer
antoniovs1029 Oct 25, 2019
740f267
Use WriteIntArray and ReadIntArray extension methods to make code mor…
antoniovs1029 Oct 28, 2019
499bf91
Merge remote-tracking branch 'upstream/master' into is04normalizer
antoniovs1029 Oct 28, 2019
6614c5f
Merge remote-tracking branch 'upstream/master' into is04normalizer
antoniovs1029 Oct 28, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build/Dependencies.props
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
<BenchmarkDotNetVersion>0.11.3</BenchmarkDotNetVersion>
<MicrosoftCodeAnalysisTestingVersion>1.0.0-beta1-63812-02</MicrosoftCodeAnalysisTestingVersion>
<MicrosoftMLTestDatabasesPackageVersion>0.0.5-test</MicrosoftMLTestDatabasesPackageVersion>
<MicrosoftMLTestModelsPackageVersion>0.0.5-test</MicrosoftMLTestModelsPackageVersion>
<MicrosoftMLTestModelsPackageVersion>0.0.6-test</MicrosoftMLTestModelsPackageVersion>
<MicrosoftMLTensorFlowTestModelsVersion>0.0.11-test</MicrosoftMLTensorFlowTestModelsVersion>
<MicrosoftMLOnnxTestModelsVersion>0.0.5-test</MicrosoftMLOnnxTestModelsVersion>
<SystemDataSqlClientVersion>4.6.1</SystemDataSqlClientVersion>
Expand Down
60 changes: 48 additions & 12 deletions src/Microsoft.ML.Data/Transforms/Normalizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,8 @@ private static VersionInfo GetVersionInfo()
{
return new VersionInfo(
modelSignature: "NORMALZR",
verWrittenCur: 0x00010001, // Initial
// verWrittenCur: 0x00010001 // Initial
verWrittenCur: 0x00010002, // Support for multidimensional vectors
verReadableCur: 0x00010001,
verWeCanReadBack: 0x00010001,
loaderSignature: LoaderSignature,
Expand Down Expand Up @@ -385,39 +386,74 @@ internal ColumnOptions(string name, string inputColumnName, DataViewType inputTy
/// <summary>
/// Reads the input column type of a normalizer column from the model stream.
/// </summary>
/// <remarks>
/// Models written before version 0x00010002 store a single flat vector size.
/// Later models store the item kind followed, for vectors, by the list of dimensions.
/// </remarks>
/// <param name="ctx">The load context to read the serialized type from.</param>
/// <returns>
/// The primitive item type (only R4 and R8 are accepted), or a
/// <see cref="VectorDataViewType"/> of that item type when the column is a vector.
/// </returns>
internal static DataViewType LoadType(ModelLoadContext ctx)
{
    Contracts.AssertValue(ctx);

    if (ctx.Header.ModelVerWritten < 0x00010002)
    {
        // *** Previous Binary format ***
        // - bool: is vector
        // - int: vector size
        // - byte: ItemKind of input column (only R4 and R8 are valid)
        bool isVectorOld = ctx.Reader.ReadBoolean();
        int vectorSize = ctx.Reader.ReadInt32();
        Contracts.CheckDecode(vectorSize >= 0);
        Contracts.CheckDecode(vectorSize > 0 || !isVectorOld);
        InternalDataKind itemKindOld = (InternalDataKind)ctx.Reader.ReadByte();
        Contracts.CheckDecode(itemKindOld == InternalDataKind.R4 || itemKindOld == InternalDataKind.R8);
        var itemTypeOld = ColumnTypeExtensions.PrimitiveTypeFromKind(itemKindOld);
        return isVectorOld ? (DataViewType)(new VectorDataViewType(itemTypeOld, vectorSize)) : itemTypeOld;
    }

    // *** Binary format ***
    // - bool: is vector
    // - byte: ItemKind of input column (only R4 and R8 are valid)
    // If it is a vector:
    // - int: number of dimensions
    // - ints: as many as dimensions, each one represents the size of one dimension

    bool isVector = ctx.Reader.ReadBoolean();
    InternalDataKind itemKind = (InternalDataKind)ctx.Reader.ReadByte();
    Contracts.CheckDecode(itemKind == InternalDataKind.R4 || itemKind == InternalDataKind.R8);

    var itemType = ColumnTypeExtensions.PrimitiveTypeFromKind(itemKind);
    if (!isVector)
    {
        return itemType;
    }

    int ndimensions = ctx.Reader.ReadInt32();
    Contracts.CheckDecode(ndimensions > 0);

    // NOTE(review): ctx.Reader.ReadIntArray() was suggested in review as a replacement for
    // this loop; confirm how it treats an empty array before switching.
    var dimensions = new int[ndimensions];
    for (int i = 0; i < ndimensions; i++)
    {
        dimensions[i] = ctx.Reader.ReadInt32();
        // Mirrors the legacy path, which rejects vectors whose size is not positive.
        Contracts.CheckDecode(dimensions[i] > 0);
    }

    return new VectorDataViewType(itemType, dimensions);
}

/// <summary>
/// Writes the input column type of a normalizer column to the model stream.
/// </summary>
/// <param name="ctx">The save context to write to.</param>
/// <param name="type">
/// The column type: either a primitive type or a <see cref="VectorDataViewType"/>.
/// The item kind is asserted to be R4 or R8, and a vector is asserted to have a known size.
/// </param>
internal static void SaveType(ModelSaveContext ctx, DataViewType type)
{
    Contracts.AssertValue(ctx);
    // *** Binary format ***
    // - bool: is vector
    // - byte: ItemKind of input column (only R4 and R8 are valid)
    // If it is a vector:
    // - int: number of dimensions of the vector
    // - ints: as many as dimensions, each one represents the size of one dimension

    VectorDataViewType vectorType = type as VectorDataViewType;
    ctx.Writer.Write(vectorType != null);

    // For a non-vector column the type itself is the item type.
    DataViewType itemType = vectorType?.ItemType ?? type;
    itemType.RawType.TryGetDataKind(out InternalDataKind itemKind);
    Contracts.Assert(itemKind == InternalDataKind.R4 || itemKind == InternalDataKind.R8);
    ctx.Writer.Write((byte)itemKind);

    Contracts.Assert(vectorType == null || vectorType.IsKnownSize);
    if (vectorType != null)
    {
        // The flat vector size is intentionally not written: the current format stores
        // the per-dimension sizes instead.
        var dims = vectorType.Dimensions;
        ctx.Writer.Write(dims.Length);
        for (int i = 0; i < dims.Length; i++)
        {
            ctx.Writer.Write(dims[i]);
        }
    }
}
}

Expand Down
89 changes: 89 additions & 0 deletions test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
using Microsoft.ML.TestFramework.Attributes;
using Microsoft.ML.Tools;
using Microsoft.ML.Transforms;
using System.Linq;
using Xunit;
using Xunit.Abstractions;
using static Microsoft.ML.Transforms.NormalizingTransformer;
Expand Down Expand Up @@ -886,5 +887,93 @@ void TestNormalizeLogMeanVarianceFixZeroVec()
Assert.Equal(0f, transformedDataArray[2].Features[1]);
Assert.Equal(0f, transformedDataArray[2].Features[4]);
}

[Fact]
public void TestNormalizeBackCompatibility2()
{
    // Checks that a normalizing transformer saved with
    // "verWrittenCur: 0x00010001" can still be loaded and applied.

    string irisPath = GetDataPath(TestDatasets.iris.trainFilename);

    var columns = new[]
    {
        new TextLoader.Column("float1", DataKind.Single, 1),
        new TextLoader.Column("float4", DataKind.Single, new[]{new TextLoader.Range(1, 4) }),
        new TextLoader.Column("double1", DataKind.Double, 1),
        new TextLoader.Column("double4", DataKind.Double, new[]{new TextLoader.Range(1, 4) }),
        new TextLoader.Column("int1", DataKind.Int32, 0),
        new TextLoader.Column("float0", DataKind.Single, new[]{ new TextLoader.Range { Min = 1, VariableEnd = true } }),
    };
    var loaderOptions = new TextLoader.Options
    {
        Columns = columns,
        HasHeader = true
    };
    var textLoader = new TextLoader(Env, loaderOptions, new MultiFileSource(irisPath));
    var input = textLoader.Load(irisPath);

    // Load the previously serialized normalizer; the schema it returns is not needed here.
    var savedModelPath = Path.Combine("TestModels", "normalizer_verwrit-00010001.zip");
    var normalizer = ML.Model.Load(savedModelPath, out _);

    var resultPath = GetOutputPath("NormalizerEstimator", "normalized2.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        using (var stream = File.Create(resultPath))
        {
            var normalized = normalizer.Transform(input);

            // "float0" is dropped before the output is written.
            var dropFloat0 = ML.Transforms.DropColumns(new[] { "float0" });
            var trimmed = dropFloat0.Fit(normalized).Transform(normalized);
            DataSaverUtils.SaveDataView(channel, textSaver, trimmed, stream, keepHidden: true);
        }
    }

    CheckEquality("NormalizerEstimator", "normalized2.tsv", "normalized.tsv");

    Done();
}

/// <summary>
/// Sample row type with a single float vector column declared with three dimensions.
/// </summary>
public class TensorData
{
    private const int Dim1 = 2;
    private const int Dim2 = 3;
    private const int Dim3 = 4;

    // Total number of elements in the flattened vector.
    private const int Size = Dim1 * Dim2 * Dim3;

    [VectorType(Dim1, Dim2, Dim3)]
    public float[] input { get; set; }

    /// <summary>
    /// Builds two sample rows: one holding 0..Size-1 and one holding the same values offset by 10000.
    /// </summary>
    public static TensorData[] GetTensorData()
    {
        var ascending = new float[Size];
        var shifted = new float[Size];
        for (int i = 0; i < Size; i++)
        {
            ascending[i] = i;
            shifted[i] = i + 10000;
        }

        return new[]
        {
            new TensorData { input = ascending },
            new TensorData { input = shifted },
        };
    }
}

[Fact]
void TestSavingNormalizerWithMultidimensionalVectorInput()
{
    // Fits a min-max normalizer on a multidimensional vector column, saves it and
    // loads it back, then checks that the output column keeps the same dimensions.
    var samples = TensorData.GetTensorData();
    var data = ML.Data.LoadFromEnumerable(samples);
    var model = ML.Transforms.NormalizeMinMax("output", "input").Fit(data);
    var transformedData = model.Transform(data);

    var modelAndSchemaPath = GetOutputPath("TestSavingNormalizerWithMultidimensionalVectorInput.zip");
    ML.Model.Save(model, data.Schema, modelAndSchemaPath);
    var loadedModel = ML.Model.Load(modelAndSchemaPath, out _);
    var transformedData2 = loadedModel.Transform(data);

    // Assert the column types instead of using an unchecked 'as' cast, so a wrong
    // type is reported as a test failure rather than a NullReferenceException.
    var outputType1 = Assert.IsAssignableFrom<VectorDataViewType>(transformedData.Schema["output"].Type);
    var outputType2 = Assert.IsAssignableFrom<VectorDataViewType>(transformedData2.Schema["output"].Type);

    Assert.True(outputType1.Dimensions.SequenceEqual(outputType2.Dimensions));
}
}
}