Skip to content

Cherry pick for samples {Image, Categorical, FeatureSelection} #3221

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 0 additions & 120 deletions docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using static Microsoft.ML.Transforms.OneHotEncodingEstimator;

namespace Microsoft.ML.Samples.Dynamic
namespace Samples.Dynamic
{
public static class OneHotEncoding
{
Expand All @@ -17,53 +17,39 @@ public static void Example()
// Get a small dataset as an IEnumerable.
var samples = new List<DataPoint>()
{
new DataPoint(){ Label = 0, Education = "0-5yrs" },
new DataPoint(){ Label = 1, Education = "0-5yrs" },
new DataPoint(){ Label = 45, Education = "6-11yrs" },
new DataPoint(){ Label = 50, Education = "6-11yrs" },
new DataPoint(){ Label = 50, Education = "11-15yrs" },
new DataPoint(){ Education = "0-5yrs" },
new DataPoint(){ Education = "0-5yrs" },
new DataPoint(){ Education = "6-11yrs" },
new DataPoint(){ Education = "6-11yrs" },
new DataPoint(){ Education = "11-15yrs" },
};

// Convert training data to IDataView.
var trainData = mlContext.Data.LoadFromEnumerable(samples);
var data = mlContext.Data.LoadFromEnumerable(samples);

// A pipeline for one hot encoding the Education column.
var bagPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Bag);
// Fit to data.
var bagTransformer = bagPipeline.Fit(trainData);
var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education");

// Get transformed data
var bagTransformedData = bagTransformer.Transform(trainData);
// Getting the data of the newly created column, so we can preview it.
var bagEncodedColumn = bagTransformedData.GetColumn<float[]>("EducationOneHotEncoded");
// Fit and transform the data.
var oneHotEncodedData = pipeline.Fit(data).Transform(data);

PrintDataColumn(oneHotEncodedData, "EducationOneHotEncoded");
// We have 3 slots, because there are three categories in the 'Education' column.
// 1 0 0
// 1 0 0
// 0 1 0
// 0 1 0
// 0 0 1

// A pipeline for one hot encoding the Education column (using keying).
var keyPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key);
// Fit to data.
var keyTransformer = keyPipeline.Fit(trainData);

// Get transformed data
var keyTransformedData = keyTransformer.Transform(trainData);
// Getting the data of the newly created column, so we can preview it.
var keyEncodedColumn = keyTransformedData.GetColumn<uint>("EducationOneHotEncoded");
// Fit and Transform data.
oneHotEncodedData = keyPipeline.Fit(data).Transform(data);

Console.WriteLine("One Hot Encoding based on the bagging strategy.");
foreach (var row in bagEncodedColumn)
{
for (var i = 0; i < row.Length; i++)
Console.Write($"{row[i]} ");
}

// data column obtained post-transformation.
// Since there are three categories in the Education column of the trainData, the output vector
// for one hot will have three slots.
//
// 0 0 0
// 0 0 0
// 0 0 1
// 0 0 1
// 0 1 0
var keyEncodedColumn = oneHotEncodedData.GetColumn<uint>("EducationOneHotEncoded");

Console.WriteLine("One Hot Encoding with key type output.");
Console.WriteLine("One Hot Encoding of single column 'Education', with key type output.");
foreach (var element in keyEncodedColumn)
Console.WriteLine(element);

Expand All @@ -72,13 +58,20 @@ public static void Example()
// 2
// 2
// 3

}
/// <summary>
/// Writes every row of a float-vector column to the console, one row per line,
/// with each value followed by a tab.
/// </summary>
/// <param name="transformedData">Data view that contains the column to print.</param>
/// <param name="columnName">Name of the column; each row is read as a <c>float[]</c>.</param>
private static void PrintDataColumn(IDataView transformedData, string columnName)
{
    // Resolve the column from the schema first, then pull its values out row by row.
    var column = transformedData.Schema[columnName];
    var vectors = transformedData.GetColumn<float[]>(column);

    foreach (var vector in vectors)
    {
        foreach (var slot in vector)
        {
            Console.Write($"{slot}\t");
        }

        // Terminate the current row.
        Console.WriteLine();
    }
}
/// <summary>
/// Input row type for this sample; a list of these is handed to LoadFromEnumerable.
/// </summary>
private class DataPoint
{
// NOTE(review): only some of the sample rows visible in this view assign Label — confirm whether this property is still needed.
public float Label { get; set; }

// Text category that the sample one hot encodes into the "EducationOneHotEncoded" column.
public string Education { get; set; }
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;

namespace Samples.Dynamic
{
    /// <summary>
    /// Sample that one hot encodes two text columns with a single estimator.
    /// </summary>
    public static class OneHotEncodingMultiColumn
    {
        /// <summary>
        /// Builds a five-row in-memory dataset, one hot encodes its 'Education' and
        /// 'ZipCode' columns, and prints the encoded vectors side by side.
        /// </summary>
        public static void Example()
        {
            // The ML context is the entry point for every ML.NET operation used below.
            var mlContext = new MLContext();

            // Small in-memory dataset: three distinct Education values, two distinct ZipCode values.
            var rows = new List<DataPoint>
            {
                new DataPoint { Education = "0-5yrs", ZipCode = "98005" },
                new DataPoint { Education = "0-5yrs", ZipCode = "98052" },
                new DataPoint { Education = "6-11yrs", ZipCode = "98005" },
                new DataPoint { Education = "6-11yrs", ZipCode = "98052" },
                new DataPoint { Education = "11-15yrs", ZipCode = "98005" },
            };

            // Wrap the list in an IDataView so the estimator can consume it.
            var data = mlContext.Data.LoadFromEnumerable(rows);

            // One estimator handles both columns; the encoded values are read back
            // below under the same column names (see TransformedData).
            var columnPairs = new[]
            {
                new InputOutputColumnPair("Education"),
                new InputOutputColumnPair("ZipCode"),
            };
            var estimator = mlContext.Transforms.Categorical.OneHotEncoding(columnPairs);

            // Fit on the data, then apply the fitted transformer to that same data.
            var model = estimator.Fit(data);
            var encoded = model.Transform(data);

            var encodedRows = mlContext.Data.CreateEnumerable<TransformedData>(encoded, true);

            Console.WriteLine("One Hot Encoding of two columns 'Education' and 'ZipCode'.");
            foreach (var encodedRow in encodedRows)
            {
                var educationSlots = string.Join(" ", encodedRow.Education);
                var zipCodeSlots = string.Join(" ", encodedRow.ZipCode);
                Console.WriteLine("{0}\t\t\t{1}", educationSlots, zipCodeSlots);
            }

            // Expected output:
            // 1 0 0                   1 0
            // 1 0 0                   0 1
            // 0 1 0                   1 0
            // 0 1 0                   0 1
            // 0 0 1                   1 0
        }

        /// <summary>
        /// Raw input row: both columns are plain text categories.
        /// </summary>
        private class DataPoint
        {
            public string Education { get; set; }

            public string ZipCode { get; set; }
        }

        /// <summary>
        /// Row shape used to read the transformed data back: each column is a float vector.
        /// </summary>
        private class TransformedData
        {
            public float[] Education { get; set; }

            public float[] ZipCode { get; set; }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

namespace Samples.Dynamic
{
    /// <summary>
    /// Sample that one hot hash encodes a text column, first with the default
    /// output kind and then with key type output.
    /// </summary>
    public static class OneHotHashEncoding
    {
        /// <summary>
        /// Builds a five-row in-memory dataset, hash encodes its 'Education' column
        /// with 3 bits in two different output kinds, and prints both results.
        /// </summary>
        public static void Example()
        {
            // The ML context is the entry point for every ML.NET operation used below.
            var mlContext = new MLContext();

            // Small in-memory dataset with three distinct Education values.
            var rows = new List<DataPoint>
            {
                new DataPoint { Education = "0-5yrs" },
                new DataPoint { Education = "0-5yrs" },
                new DataPoint { Education = "6-11yrs" },
                new DataPoint { Education = "6-11yrs" },
                new DataPoint { Education = "11-15yrs" },
            };

            // Wrap the list in an IDataView so the estimators can consume it.
            var data = mlContext.Data.LoadFromEnumerable(rows);

            // First pass: hash encode 'Education' using the default output kind.
            var vectorEstimator = mlContext.Transforms.Categorical.OneHotHashEncoding(
                "EducationOneHotHashEncoded",
                "Education",
                numberOfBits: 3);

            // Fit on the data, then apply the fitted transformer to that same data.
            var vectorModel = vectorEstimator.Fit(data);
            var hashEncodedData = vectorModel.Transform(data);

            PrintDataColumn(hashEncodedData, "EducationOneHotHashEncoded");

            // We have 8 slots, because we used numberOfBits = 3.
            //
            // 0 0 0 1 0 0 0 0
            // 0 0 0 1 0 0 0 0
            // 0 0 0 0 1 0 0 0
            // 0 0 0 0 1 0 0 0
            // 0 0 0 0 0 0 0 1

            // Second pass: same column and bit count, but with key type output.
            var keyEstimator = mlContext.Transforms.Categorical.OneHotHashEncoding(
                "EducationOneHotHashEncoded",
                "Education",
                outputKind: OneHotEncodingEstimator.OutputKind.Key,
                numberOfBits: 3);

            var keyModel = keyEstimator.Fit(data);
            var hashKeyEncodedData = keyModel.Transform(data);

            // Read the new column back as one uint per row so it can be printed.
            var keys = hashKeyEncodedData.GetColumn<uint>("EducationOneHotHashEncoded");

            Console.WriteLine("One Hot Hash Encoding of single column 'Education', with key type output.");
            foreach (var key in keys)
            {
                Console.WriteLine(key);
            }

            // Expected output:
            // 4
            // 4
            // 5
            // 5
            // 8
        }

        /// <summary>
        /// Writes every row of a float-vector column to the console, one row per line,
        /// with each value followed by a tab.
        /// </summary>
        /// <param name="transformedData">Data view that contains the column to print.</param>
        /// <param name="columnName">Name of the column; each row is read as a <c>float[]</c>.</param>
        private static void PrintDataColumn(IDataView transformedData, string columnName)
        {
            // Resolve the column from the schema first, then pull its values out row by row.
            var column = transformedData.Schema[columnName];
            var vectors = transformedData.GetColumn<float[]>(column);

            foreach (var vector in vectors)
            {
                foreach (var slot in vector)
                {
                    Console.Write($"{slot}\t");
                }

                // Terminate the current row.
                Console.WriteLine();
            }
        }

        /// <summary>
        /// Raw input row holding the single text category to encode.
        /// </summary>
        private class DataPoint
        {
            public string Education { get; set; }
        }
    }
}
Loading