Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update #3

Merged
merged 4 commits into from
May 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build/Dependencies.props
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
<SystemDrawingCommonPackageVersion>4.5.0</SystemDrawingCommonPackageVersion>
<SystemIOFileSystemAccessControl>4.5.0</SystemIOFileSystemAccessControl>
<SystemSecurityPrincipalWindows>4.5.0</SystemSecurityPrincipalWindows>
<TensorFlowVersion>1.12.0</TensorFlowVersion>
<TensorFlowVersion>1.13.1</TensorFlowVersion>
</PropertyGroup>

<!-- Code Analyzer Dependencies -->
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace Samples.Dynamic
{
/// <summary>
/// Sample showing how to create an <see cref="IDataView"/> from an in-memory
/// <see cref="IEnumerable{T}"/>, both when the vector size is declared on the type
/// and when it is only known at runtime.
/// </summary>
public static class LoadFromEnumerable
{
    // Creating IDataView from IEnumerable, and setting the size of the vector at runtime.
    // When the data model is defined through types, setting the size of the vector is done through the VectorType
    // annotation. When the size of the data is not known at compile time, the Schema can be directly modified at runtime
    // and the size of the vector set there.
    // This is important, because most of the ML.NET trainers require the Features vector to be of known size.
    public static void Example()
    {
        // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
        // as a catalog of available operations and as the source of randomness.
        var mlContext = new MLContext();

        // Get a small dataset as an IEnumerable.
        // Every row carries exactly five values so that the data agrees with the
        // [VectorType(5)] annotation on DataPointVector.Features.
        IEnumerable<DataPointVector> enumerableKnownSize = new DataPointVector[]
        {
            new DataPointVector { Features = new float[] { 1.2f, 3.4f, 4.5f, 3.2f, 7.5f } },
            new DataPointVector { Features = new float[] { 4.2f, 3.4f, 14.65f, 3.2f, 3.5f } },
            new DataPointVector { Features = new float[] { 1.6f, 3.5f, 4.5f, 6.2f, 3.5f } },
        };

        // Load dataset into an IDataView.
        IDataView data = mlContext.Data.LoadFromEnumerable(enumerableKnownSize);

        // Inspecting the schema. A direct cast is used instead of 'as' so that an unexpected
        // column type surfaces as an InvalidCastException rather than a NullReferenceException.
        var featureColumn = (VectorDataViewType)data.Schema["Features"].Type;
        Console.WriteLine($"Is the size of the Features column known: {featureColumn.IsKnownSize}.\nSize: {featureColumn.Size}");

        // Preview
        //
        // Is the size of the Features column known: True.
        // Size: 5

        // If the size of the vector is unknown at compile time, it can be set at runtime.
        IEnumerable<DataPoint> enumerableUnknownSize = new DataPoint[]
        {
            new DataPoint { Features = new float[] { 1.2f, 3.4f, 4.5f } },
            new DataPoint { Features = new float[] { 4.2f, 3.4f, 1.6f } },
            new DataPoint { Features = new float[] { 1.6f, 3.5f, 4.5f } },
        };

        // The feature dimension (typically this will be the Count of the array of the features vector
        // known at runtime).
        int featureDimension = 3;
        var definedSchema = SchemaDefinition.Create(typeof(DataPoint));

        // Look the column up by name once, rather than relying on its position in the schema.
        var featuresDefinition = definedSchema["Features"];
        featureColumn = (VectorDataViewType)featuresDefinition.ColumnType;
        Console.WriteLine($"Is the size of the Features column known: {featureColumn.IsKnownSize}.\nSize: {featureColumn.Size}");

        // Preview
        //
        // Is the size of the Features column known: False.
        // Size: 0

        // Set the column type to be a known-size vector, keeping the original item type.
        featuresDefinition.ColumnType = new VectorDataViewType(featureColumn.ItemType, featureDimension);

        // Read the data into an IDataView, supplying the modified schema definition.
        IDataView data2 = mlContext.Data.LoadFromEnumerable(enumerableUnknownSize, definedSchema);

        // Inspecting the schema
        featureColumn = (VectorDataViewType)data2.Schema["Features"].Type;
        Console.WriteLine($"Is the size of the Features column known: {featureColumn.IsKnownSize}.\nSize: {featureColumn.Size}");

        // Preview
        //
        // Is the size of the Features column known: True.
        // Size: 3
    }
}

/// <summary>
/// Input row whose <see cref="Features"/> array carries no VectorType annotation,
/// so no vector length is declared on the type itself.
/// </summary>
public class DataPoint
{
/// <summary>The feature values of one example.</summary>
public float[] Features { get; set; }
}

/// <summary>
/// Input row whose <see cref="Features"/> array is annotated with a vector length of 5
/// through the VectorType attribute.
/// </summary>
public class DataPointVector
{
/// <summary>The feature values of one example; the attribute below declares 5 elements.</summary>
[VectorType(5)]
public float[] Features { get; set; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
using System;
using System.Collections.Generic;
using System.IO;
using Microsoft.ML;

namespace Samples.Dynamic
{
/// <summary>
/// Sample that writes an <see cref="IDataView"/> to a binary IDV file and reads it back.
/// </summary>
public static class SaveAndLoadFromBinary
{
    /// <summary>
    /// Builds a tiny in-memory data set, saves it to "data.idv", reloads it and prints every row.
    /// </summary>
    public static void Example()
    {
        // The MLContext is the entry point for ML.NET operations (logging, catalogs, randomness).
        // A fixed seed keeps the sample output deterministic.
        MLContext mlContext = new MLContext(seed: 0);

        // Training examples held in memory.
        List<DataPoint> samples = new List<DataPoint>
        {
            new DataPoint { Label = 0, Features = 4 },
            new DataPoint { Label = 0, Features = 5 },
            new DataPoint { Label = 0, Features = 6 },
            new DataPoint { Label = 1, Features = 8 },
            new DataPoint { Label = 1, Features = 9 },
        };

        // Wrap the list in an IDataView so the ML.NET API can consume it.
        var view = mlContext.Data.LoadFromEnumerable(samples);

        // Persist the view as a binary IDV file; the stream is closed when the using block ends.
        using (var output = new FileStream("data.idv", FileMode.Create))
        {
            mlContext.Data.SaveAsBinary(view, output);
        }

        // Read the binary IDV file back into a new IDataView.
        var restored = mlContext.Data.LoadFromBinary("data.idv");

        // Materialize the reloaded rows as DataPoint objects and print them.
        IEnumerable<DataPoint> rows = mlContext.Data.CreateEnumerable<DataPoint>(restored, reuseRowObject: false);
        foreach (var row in rows)
        {
            Console.WriteLine($"{row.Label}, {row.Features}");
        }

        // Preview of the loaded data.
        // 0, 4
        // 0, 5
        // 0, 6
        // 1, 8
        // 1, 9
    }

    // One example: a label together with a single feature value.
    // A data set is a collection of such examples.
    private class DataPoint
    {
        public float Label { get; set; }

        public float Features { get; set; }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
using System;
using System.Collections.Generic;
using System.IO;
using Microsoft.ML;

namespace Samples.Dynamic
{
/// <summary>
/// Sample that writes an <see cref="IDataView"/> to a text file and reads it back.
/// </summary>
public static class SaveAndLoadFromText
{
    /// <summary>
    /// Builds a tiny in-memory data set, saves it to "data.tsv", reloads it and prints every row.
    /// </summary>
    public static void Example()
    {
        // The MLContext is the entry point for ML.NET operations (logging, catalogs, randomness).
        // A fixed seed keeps the sample output deterministic.
        MLContext mlContext = new MLContext(seed: 0);

        // Training examples held in memory.
        List<DataPoint> samples = new List<DataPoint>
        {
            new DataPoint { Label = 0, Features = 4 },
            new DataPoint { Label = 0, Features = 5 },
            new DataPoint { Label = 0, Features = 6 },
            new DataPoint { Label = 1, Features = 8 },
            new DataPoint { Label = 1, Features = 9 },
        };

        // Wrap the list in an IDataView so the ML.NET API can consume it.
        var view = mlContext.Data.LoadFromEnumerable(samples);

        // Persist the view as a text file; the stream is closed when the using block ends.
        using (var output = new FileStream("data.tsv", FileMode.Create))
        {
            mlContext.Data.SaveAsText(view, output);
        }

        // Read the text file back into a new IDataView.
        // NOTE(review): no loader options are passed, so this presumably relies on the
        // schema header that SaveAsText writes into the file - confirm.
        var restored = mlContext.Data.LoadFromTextFile("data.tsv");

        // Materialize the reloaded rows as DataPoint objects and print them.
        IEnumerable<DataPoint> rows = mlContext.Data.CreateEnumerable<DataPoint>(restored, reuseRowObject: false);
        foreach (var row in rows)
        {
            Console.WriteLine($"{row.Label}, {row.Features}");
        }

        // Preview of the loaded data.
        // 0, 4
        // 0, 5
        // 0, 6
        // 1, 8
        // 1, 9
    }

    // One example: a label together with a single feature value.
    // A data set is a collection of such examples.
    private class DataPoint
    {
        public float Label { get; set; }

        public float Features { get; set; }
    }
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ public static IDataView LoadFromBinary(this DataOperationsCatalog catalog, IMult
/// </summary>
/// <param name="catalog">The catalog.</param>
/// <param name="path">The path to the file to load from.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[LoadFromBinary](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs)]
/// ]]>
/// </format>
/// </example>
public static IDataView LoadFromBinary(this DataOperationsCatalog catalog, string path)
{
Contracts.CheckNonEmpty(path, nameof(path));
Expand All @@ -54,6 +61,13 @@ public static IDataView LoadFromBinary(this DataOperationsCatalog catalog, strin
/// <param name="data">The data view to save.</param>
/// <param name="stream">The stream to write to.</param>
/// <param name="keepHidden">Whether to keep hidden columns in the dataset.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[SaveAsBinary](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs)]
/// ]]>
/// </format>
/// </example>
public static void SaveAsBinary(this DataOperationsCatalog catalog, IDataView data, Stream stream,
bool keepHidden = false)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ internal DataOperationsCatalog(IHostEnvironment env)
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[LoadFromEnumerable](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/DataViewEnumerable.cs)]
/// [!code-csharp[LoadFromEnumerable](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/LoadFromEnumerable.cs)]
/// ]]>
/// </format>
/// </example>
Expand Down
14 changes: 14 additions & 0 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,13 @@ public static IDataView LoadFromTextFile<TInput>(this DataOperationsCatalog cata
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="path">Specifies a file from which to load.</param>
/// <param name="options">Defines the settings of the load operation.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[LoadFromTextFile](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs)]
/// ]]>
/// </format>
/// </example>
public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path,
TextLoader.Options options = null)
{
Expand All @@ -186,6 +193,13 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, str
/// <param name="schema">Whether to write the header comment with the schema.</param>
/// <param name="keepHidden">Whether to keep hidden columns in the dataset.</param>
/// <param name="forceDense">Whether to save columns in dense format even if they are sparse vectors.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[SaveAsText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs)]
/// ]]>
/// </format>
/// </example>
public static void SaveAsText(this DataOperationsCatalog catalog,
IDataView data,
Stream stream,
Expand Down
8 changes: 5 additions & 3 deletions src/Microsoft.ML.Data/Transforms/KeyToVector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,7 @@ private ValueGetter<VBuffer<float>> MakeGetterInd(DataViewRow input, int iinfo)
int lenDst = checked(size * lenSrc);
var values = src.GetValues();
int cntSrc = values.Length;
var editor = VBufferEditor.Create(ref dst, lenDst, cntSrc);
var editor = VBufferEditor.Create(ref dst, lenDst, cntSrc, keepOldOnResize: false, requireIndicesOnDense: true);

int count = 0;
if (src.IsDense)
Expand Down Expand Up @@ -814,14 +814,16 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema)

var metadata = new List<SchemaShape.Column>();
if (col.Annotations.TryFindColumn(AnnotationUtils.Kinds.KeyValues, out var keyMeta))
if (col.Kind != SchemaShape.Column.VectorKind.VariableVector && keyMeta.ItemType is TextDataViewType)
if (((colInfo.OutputCountVector && col.IsKey) || col.Kind != SchemaShape.Column.VectorKind.VariableVector) && keyMeta.ItemType is TextDataViewType)
metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, keyMeta.ItemType, false));
if (!colInfo.OutputCountVector && (col.Kind == SchemaShape.Column.VectorKind.Scalar || col.Kind == SchemaShape.Column.VectorKind.Vector))
metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.CategoricalSlotRanges, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Int32, false));
if (!colInfo.OutputCountVector || (col.Kind == SchemaShape.Column.VectorKind.Scalar))
metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BooleanDataViewType.Instance, false));

result[colInfo.Name] = new SchemaShape.Column(colInfo.Name, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false, new SchemaShape(metadata));
result[colInfo.Name] = new SchemaShape.Column(colInfo.Name,
col.Kind == SchemaShape.Column.VectorKind.VariableVector && !colInfo.OutputCountVector ? SchemaShape.Column.VectorKind.VariableVector : SchemaShape.Column.VectorKind.Vector,
NumberDataViewType.Single, false, new SchemaShape(metadata));
}

return new SchemaShape(result.Values);
Expand Down
4 changes: 3 additions & 1 deletion src/Microsoft.ML.Transforms/KeyToVectorMapping.cs
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,9 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema)
metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, keyMeta.ItemType, false));
if (col.Kind == SchemaShape.Column.VectorKind.Scalar)
metadata.Add(new SchemaShape.Column(AnnotationUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BooleanDataViewType.Instance, false));
result[colInfo.outputColumnName] = new SchemaShape.Column(colInfo.outputColumnName, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false, new SchemaShape(metadata));
result[colInfo.outputColumnName] = new SchemaShape.Column(colInfo.outputColumnName,
col.Kind == SchemaShape.Column.VectorKind.VariableVector ? SchemaShape.Column.VectorKind.VariableVector : SchemaShape.Column.VectorKind.Vector,
NumberDataViewType.Single, false, new SchemaShape(metadata));
}

return new SchemaShape(result.Values);
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
9D748CF9FA3A18BC9456CF3F4E44DE519403FB542A85D1916BB9B1E3AFD90139258936C780E78488721D0872A365BE07CB97A53A6C851BEE5362D5221AE17BF3

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
A09EA6EF85BAEF567AE33538D0FEF648317AC4357A6C4C2AF4890E2C60E16A73014645118FFE3A5A56E03E0C941B7770AB7342532EBF07066784A72443970AE7

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
301762731DFCFFABEC7C5ED95453CC27E56B10E06B3EB4CB83B1F552A5345D00E087F65B9FC99D74219016B4C53CC70A7FFC13C29C1D10FC8EEEFA6B18896144
31 changes: 31 additions & 0 deletions test/BaselineOutput/Common/Categorical/oneHot.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=A:I4:0
#@ col=B:I4:1-2
#@ col=C:I4:3-**
#@ col={name=CatA type=U4 src={ min=-1} key=2}
#@ col={name=CatA src={ min=-1 max=0 vector=+}}
#@ col={name=CatB type=U4 src={ min=-1} key=2}
#@ col={name=CatB src={ min=-1 max=1 vector=+}}
#@ col={name=CatC type=U4 src={ min=-1} key=2}
#@ col={name=CatC src={ min=-1 max=0 vector=+}}
#@ col={name=CatD type=U4 src={ min=-1} key=2}
#@ col={name=CatVA type=U4 src={ min=-1 max=0 vector=+} key=3}
#@ col={name=CatVA src={ min=-1 max=1 vector=+}}
#@ col={name=CatVB type=U4 src={ min=-1 max=0 vector=+} key=3}
#@ col={name=CatVB src={ min=-1 max=4 vector=+}}
#@ col={name=CatVC type=U4 src={ min=-1 max=0 vector=+} key=3}
#@ col={name=CatVC src={ min=-1 max=4 vector=+}}
#@ col={name=CatVD type=U4 src={ min=-1 max=0 vector=+} key=3}
#@ col={name=CatVVA type=U4 src={ min=-1 var=+} key=3}
#@ col={name=CatVVA src={ min=-1 max=1 vector=+}}
#@ col={name=CatVVB type=U4 src={ min=-1 var=+} key=3}
#@ col={name=CatVVB src={ min=-1 var=+}}
#@ col={name=CatVVC type=U4 src={ min=-1 var=+} key=3}
#@ col={name=CatVVC src={ min=-1 var=+}}
#@ col={name=CatVVD type=U4 src={ min=-1 var=+} key=3}
#@ }
A "" "" CatA 1 4 CatB Bit2 Bit1 Bit0 CatC 1 4 CatD "" "" 2 3 4 "" "" [0].Bit2 [0].Bit1 [0].Bit0 [1].Bit2 [1].Bit1 [1].Bit0 "" "" [0].2 [0].3 [0].4 [1].2 [1].3 [1].4 "" "" 3 4 2
1 2 3 3 4 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1
4 2 4 2 4 3 1 0 1 1 0 0 1 1 0 1 1 0 2 1 0 1 0 2 0 0 0 0 1 0 0 2 1 0 0 0 0 1 0 2 2 1 0 1 1 1 2 1 0 0 1 0 0 0 1 0 0 0 2 1 0 0 0 1 0 1 0 1 0 0 2 1 0
Loading