
Cherry pick sample update (Concate, Select, Drop, Copy) #3300

Closed · wants to merge 2 commits
81 changes: 45 additions & 36 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Concatenate.cs
@@ -1,64 +1,73 @@
 using System;
+using System.Collections.Generic;
 using Microsoft.ML;
 using Microsoft.ML.Data;
 
-namespace Microsoft.ML.Samples.Dynamic
+namespace Samples.Dynamic
 {
-    public static class ConcatTransform
+    public static class Concatenate
     {
         public static void Example()
         {
             // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
             // as well as the source of randomness.
             var mlContext = new MLContext();
 
-            // Get a small dataset as an IEnumerable and them read it as ML.NET's data type.
-            var data = SamplesUtils.DatasetUtils.GetInfertData();
-            var trainData = mlContext.Data.LoadFromEnumerable(data);
+            // Create a small dataset as an IEnumerable.
+            var samples = new List<InputData>()
+            {
+                new InputData(){ Feature1 = 0.1f, Feature2 = new[]{ 1.1f, 2.1f, 3.1f}, Feature3 = 1 },
+                new InputData(){ Feature1 = 0.2f, Feature2 = new[]{ 1.2f, 2.2f, 3.2f}, Feature3 = 2 },
+                new InputData(){ Feature1 = 0.3f, Feature2 = new[]{ 1.3f, 2.3f, 3.3f}, Feature3 = 3 },
+                new InputData(){ Feature1 = 0.4f, Feature2 = new[]{ 1.4f, 2.4f, 3.4f}, Feature3 = 4 },
+                new InputData(){ Feature1 = 0.5f, Feature2 = new[]{ 1.5f, 2.5f, 3.5f}, Feature3 = 5 },
+                new InputData(){ Feature1 = 0.6f, Feature2 = new[]{ 1.6f, 2.6f, 3.6f}, Feature3 = 6 },
+            };
 
-            // Preview of the data.
-            //
-            // Age  Case  Education  induced  parity  pooled.stratum  row_num  ...
-            // 26.0 1.0   0-5yrs     1.0      6.0     3.0             1.0      ...
-            // 42.0 1.0   0-5yrs     1.0      1.0     1.0             2.0      ...
-            // 39.0 1.0   0-5yrs     2.0      6.0     4.0             3.0      ...
-            // 34.0 1.0   0-5yrs     2.0      4.0     2.0             4.0      ...
-            // 35.0 1.0   6-11yrs    1.0      3.0     32.0            5.0      ...
+            // Convert training data to IDataView.
+            var dataview = mlContext.Data.LoadFromEnumerable(samples);
 
-            // A pipeline for concatenating the Age, Parity and Induced columns together into a vector that will be the Features column.
-            // Concatenation is necessary because learners take **feature vectors** as inputs.
-            // e.g. var regressionTrainer = mlContext.Regression.Trainers.FastTree(labelColumn: "Label", featureColumn: "Features");
-            string outputColumnName = "Features";
-            var pipeline = mlContext.Transforms.Concatenate(outputColumnName, new[] { "Age", "Parity", "Induced" });
+            // A pipeline for concatenating the "Feature1", "Feature2" and "Feature3" columns together into a vector that will be the Features column.
+            // Concatenation is necessary because trainers take feature vectors as inputs.
+            //
+            // Please note that the "Feature3" column is converted from int32 to float using the ConvertType.
+            // The Concatenate requires all columns to be of same type.
+            var pipeline = mlContext.Transforms.Conversion.ConvertType("Feature3", outputKind: DataKind.Single)
+                .Append(mlContext.Transforms.Concatenate("Features", new[] { "Feature1", "Feature2", "Feature3" }));
 
             // The transformed data.
-            var transformedData = pipeline.Fit(trainData).Transform(trainData);
+            var transformedData = pipeline.Fit(dataview).Transform(dataview);
 
             // Now let's take a look at what this concatenation did.
-            // We can extract the newly created column as an IEnumerable of SampleInfertDataWithFeatures, the class we define above.
-            var featuresColumn = mlContext.Data.CreateEnumerable<SampleInfertDataWithFeatures>(transformedData, reuseRowObject: false);
+            // We can extract the newly created column as an IEnumerable of TransformedData.
+            var featuresColumn = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: false);
 
             // And we can write out a few rows
-            Console.WriteLine($"{outputColumnName} column obtained post-transformation.");
+            Console.WriteLine($"Features column obtained post-transformation.");
             foreach (var featureRow in featuresColumn)
-            {
-                foreach (var value in featureRow.Features.GetValues())
-                    Console.Write($"{value} ");
-                Console.WriteLine("");
-            }
+                Console.WriteLine(string.Join(" ", featureRow.Features));
 
             // Expected output:
-            // Features column obtained post-transformation.
-            //
-            // 26 6 1
-            // 42 1 1
-            // 39 6 2
-            // 34 4 2
-            // 35 3 1
+            // Features column obtained post-transformation.
+            // 0.1 1.1 2.1 3.1 1
+            // 0.2 1.2 2.2 3.2 2
+            // 0.3 1.3 2.3 3.3 3
+            // 0.4 1.4 2.4 3.4 4
+            // 0.5 1.5 2.5 3.5 5
+            // 0.6 1.6 2.6 3.6 6
         }
+
+        private class InputData
+        {
+            public float Feature1;
+            [VectorType(3)]
+            public float[] Feature2;
+            public int Feature3;
+        }
 
-        private class SampleInfertDataWithFeatures
+        private sealed class TransformedData
         {
-            public VBuffer<float> Features { get; set; }
+            public float[] Features { get; set; }
         }
     }
 }
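A note on the pattern above: Concatenate requires all input columns to share one item type, which is why the int32 "Feature3" is converted to Single before concatenation. The sketch below is not part of this PR; the class name is mine and the printed type string is an assumption. It shows one way to confirm what the transform produced without reading any rows, by inspecting the output schema, where "Features" should appear as a known-size vector of 5 Singles (1 + 3 + 1).

using System;
using Microsoft.ML;
using Microsoft.ML.Data;

public static class ConcatenateSchemaCheck
{
    public static void Example()
    {
        var mlContext = new MLContext();

        // Same shape as the sample's InputData: one scalar, one 3-element vector, one int.
        var samples = new[]
        {
            new InputData() { Feature1 = 0.1f, Feature2 = new[] { 1.1f, 2.1f, 3.1f }, Feature3 = 1 },
        };
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // Convert Feature3 to Single so all inputs share one item type, then concatenate.
        var pipeline = mlContext.Transforms.Conversion.ConvertType("Feature3", outputKind: DataKind.Single)
            .Append(mlContext.Transforms.Concatenate("Features", "Feature1", "Feature2", "Feature3"));

        var transformed = pipeline.Fit(dataview).Transform(dataview);

        // The output schema records the concatenated column's type; no rows are read here.
        var featuresColumn = transformed.Schema["Features"];
        Console.WriteLine($"{featuresColumn.Name}: {featuresColumn.Type}");
        // Expected (assumption): Features: Vector<Single, 5>
    }

    private class InputData
    {
        public float Feature1;
        [VectorType(3)]
        public float[] Feature2;
        public int Feature3;
    }
}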
79 changes: 33 additions & 46 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CopyColumns.cs
@@ -1,7 +1,8 @@
 using System;
 using System.Collections.Generic;
+using Microsoft.ML;
 
-namespace Microsoft.ML.Samples.Dynamic
+namespace Samples.Dynamic
 {
     public static class CopyColumns
     {
@@ -11,72 +12,58 @@ public static void Example()
             // as well as the source of randomness.
             var mlContext = new MLContext();
 
-            // Get a small dataset as an IEnumerable and them read it as ML.NET's data type.
-            IEnumerable<SamplesUtils.DatasetUtils.SampleInfertData> data = SamplesUtils.DatasetUtils.GetInfertData();
-            var trainData = mlContext.Data.LoadFromEnumerable(data);
+            // Create a small dataset as an IEnumerable.
+            var samples = new List<InputData>()
+            {
+                new InputData(){ ImageId = 1, Features = new [] { 1.0f, 1.0f, 1.0f} },
+                new InputData(){ ImageId = 2, Features = new [] { 2.0f, 2.0f, 2.0f} },
+                new InputData(){ ImageId = 3, Features = new [] { 3.0f, 3.0f, 3.0f} },
+                new InputData(){ ImageId = 4, Features = new [] { 4.0f, 4.0f, 4.0f} },
+                new InputData(){ ImageId = 5, Features = new [] { 5.0f, 5.0f, 5.0f} },
+                new InputData(){ ImageId = 6, Features = new [] { 6.0f, 6.0f, 6.0f} },
+            };
 
-            // Preview of the data.
-            //
-            // Age  Case  Education  induced  parity  pooled.stratum  row_num  ...
-            // 26.0 1.0   0-5yrs     1.0      6.0     3.0             1.0      ...
-            // 42.0 1.0   0-5yrs     1.0      1.0     1.0             2.0      ...
-            // 39.0 1.0   0-5yrs     2.0      6.0     4.0             3.0      ...
-            // 34.0 1.0   0-5yrs     2.0      4.0     2.0             4.0      ...
-            // 35.0 1.0   6-11yrs    1.0      3.0     32.0            5.0      ...
+            // Convert training data to IDataView.
+            var dataview = mlContext.Data.LoadFromEnumerable(samples);
 
             // CopyColumns is commonly used to rename columns.
-            // For example, if you want to train towards Age, and your learner expects a "Label" column, you can
-            // use CopyColumns to rename Age to Label. Technically, the Age columns still exists, but it won't be
+            // For example, if you want to train towards ImageId, and your trainer expects a "Label" column, you can
+            // use CopyColumns to rename ImageId to Label. Technically, the ImageId column still exists, but it won't be
             // materialized unless you actually need it somewhere (e.g. if you were to save the transformed data
             // without explicitly dropping the column). This is a general property of IDataView's lazy evaluation.
-            string labelColumnName = "Label";
-            var pipeline = mlContext.Transforms.CopyColumns(labelColumnName, "Age") as IEstimator<ITransformer>;
 
-            // You also may want to copy a column to perform some hand-featurization using built-in transforms or
-            // a CustomMapping transform. For example, we could make an indicator variable if a feature, such as Parity
-            // goes above some threshold. We simply copy the Parity column to a new column, then pass it through a custom function.
-            Action<InputRow, OutputRow> mapping = (input, output) => output.CustomValue = input.CustomValue > 4 ? 1 : 0;
-            pipeline = pipeline.Append(mlContext.Transforms.CopyColumns("CustomValue", "Parity"))
-                .Append(mlContext.Transforms.CustomMapping(mapping, null));
+            var pipeline = mlContext.Transforms.CopyColumns("Label", "ImageId");
 
             // Now we can transform the data and look at the output to confirm the behavior of CopyColumns.
             // Don't forget that this operation doesn't actually evaluate data until we read the data below.
-            var transformedData = pipeline.Fit(trainData).Transform(trainData);
+            var transformedData = pipeline.Fit(dataview).Transform(dataview);
 
-            // We can extract the newly created column as an IEnumerable of SampleInfertDataTransformed, the class we define below.
-            var rowEnumerable = mlContext.Data.CreateEnumerable<SampleInfertDataTransformed>(transformedData, reuseRowObject: false);
+            var rowEnumerable = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: false);
 
             // And finally, we can write out the rows of the dataset, looking at the columns of interest.
-            Console.WriteLine($"Label, Parity, and CustomValue columns obtained post-transformation.");
+            Console.WriteLine($"Label and ImageId columns obtained post-transformation.");
             foreach (var row in rowEnumerable)
-            {
-                Console.WriteLine($"Label: {row.Label} Parity: {row.Parity} CustomValue: {row.CustomValue}");
-            }
+                Console.WriteLine($"Label: {row.Label} ImageId: {row.ImageId}");
 
             // Expected output:
-            // Label, Parity, and CustomValue columns obtained post-transformation.
-            // Label: 26 Parity: 6 CustomValue: 1
-            // Label: 42 Parity: 1 CustomValue: 0
-            // Label: 39 Parity: 6 CustomValue: 1
-            // Label: 34 Parity: 4 CustomValue: 0
-            // Label: 35 Parity: 3 CustomValue: 0
-        }
-
-        private class SampleInfertDataTransformed
-        {
-            public float Label { get; set; }
-            public float Parity { get; set; }
-            public float CustomValue { get; set; }
+            // ImageId and Label columns obtained post-transformation.
+            // Label: 1 ImageId: 1
+            // Label: 2 ImageId: 2
+            // Label: 3 ImageId: 3
+            // Label: 4 ImageId: 4
+            // Label: 5 ImageId: 5
+            // Label: 6 ImageId: 6
         }
 
-        private class OutputRow
+        private class InputData
         {
-            public float CustomValue { get; set; }
+            public int ImageId { get; set; }
+            public float[] Features { get; set; }
         }
 
-        private class InputRow
+        private class TransformedData : InputData
         {
-            public float CustomValue { get; set; }
+            public int Label { get; set; }
         }
     }
 }
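Since the updated sample stresses that CopyColumns leaves the source column in place under IDataView's lazy evaluation, a quick way to see that without reading any rows is to enumerate the output schema. A minimal sketch follows; it is not from this PR, and the class name and printed type strings are assumptions.

using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;

public static class CopyColumnsSchemaCheck
{
    public static void Example()
    {
        var mlContext = new MLContext();
        var samples = new List<InputData>()
        {
            new InputData() { ImageId = 1, Features = new[] { 1.0f, 1.0f, 1.0f } },
        };
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // Copy (effectively rename) ImageId to Label.
        var transformed = mlContext.Transforms.CopyColumns("Label", "ImageId")
            .Fit(dataview)
            .Transform(dataview);

        // Both the source column and its copy appear in the schema; neither is
        // materialized until the view is actually read.
        foreach (var column in transformed.Schema)
            Console.WriteLine($"{column.Name}: {column.Type}");
        // Expected (assumption):
        // ImageId: Int32
        // Features: Vector<Single, 3>
        // Label: Int32
    }

    private class InputData
    {
        public int ImageId;
        [VectorType(3)]
        public float[] Features;
    }
}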
79 changes: 41 additions & 38 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/DropColumns.cs
@@ -1,7 +1,8 @@
 using System;
 using System.Collections.Generic;
+using Microsoft.ML;
 
-namespace Microsoft.ML.Samples.Dynamic
+namespace Samples.Dynamic
 {
     public static class DropColumns
     {
@@ -11,71 +12,73 @@ public static void Example()
             // as well as the source of randomness.
             var mlContext = new MLContext();
 
-            // Get a small dataset as an IEnumerable and them read it as ML.NET's data type.
-            IEnumerable<SamplesUtils.DatasetUtils.SampleInfertData> data = SamplesUtils.DatasetUtils.GetInfertData();
-            var trainData = mlContext.Data.LoadFromEnumerable(data);
+            // Create a small dataset as an IEnumerable.
+            var samples = new List<InputData>()
+            {
+                new InputData(){ Age = 21, Gender = "Male", Education = "BS", ExtraColumn = 1 },
+                new InputData(){ Age = 23, Gender = "Female", Education = "MBA", ExtraColumn = 2 },
+                new InputData(){ Age = 28, Gender = "Male", Education = "PhD", ExtraColumn = 3 },
+                new InputData(){ Age = 22, Gender = "Male", Education = "BS", ExtraColumn = 4 },
+                new InputData(){ Age = 23, Gender = "Female", Education = "MS", ExtraColumn = 5 },
+                new InputData(){ Age = 27, Gender = "Female", Education = "PhD", ExtraColumn = 6 },
+            };
 
-            // Preview of the data.
-            //
-            // Age  Case  Education  Induced  Parity  Pooled.stratum  Row_num  ...
-            // 26.0 1.0   0-5yrs     1.0      6.0     3.0             1.0      ...
-            // 42.0 1.0   0-5yrs     1.0      1.0     1.0             2.0      ...
-            // 39.0 1.0   0-5yrs     2.0      6.0     4.0             3.0      ...
-            // 34.0 1.0   0-5yrs     2.0      4.0     2.0             4.0      ...
-            // 35.0 1.0   6-11yrs    1.0      3.0     32.0            5.0      ...
+            // Convert training data to IDataView.
+            var dataview = mlContext.Data.LoadFromEnumerable(samples);
 
-            // Drop the Age and Education columns from the dataset.
-            var pipeline = mlContext.Transforms.DropColumns("Age", "Education");
+            // Drop the ExtraColumn from the dataset.
+            var pipeline = mlContext.Transforms.DropColumns("ExtraColumn");
 
             // Now we can transform the data and look at the output.
             // Don't forget that this operation doesn't actually operate on data until we perform an action that requires
             // the data to be materialized.
-            var transformedData = pipeline.Fit(trainData).Transform(trainData);
+            var transformedData = pipeline.Fit(dataview).Transform(dataview);
 
             // Now let's take a look at what the DropColumns operations did.
-            // We can extract the transformed data as an IEnumerable of SampleInfertDataNonExistentColumns, the class we define below.
-            // When we try to pull out the Age and Education columns, ML.NET will raise an exception on the first non-existent column
-            // that it tries to access.
+            // We can extract the transformed data as an IEnumerable of InputData, the class we define below.
+            // When we try to pull out the Age, Gender, Education and ExtraColumn columns, ML.NET will raise an exception on the ExtraColumn
             try
             {
-                var failingRowEnumerable = mlContext.Data.CreateEnumerable<SampleInfertDataNonExistentColumns>(transformedData, reuseRowObject: false);
-            } catch(ArgumentOutOfRangeException exception)
+                var failingRowEnumerable = mlContext.Data.CreateEnumerable<InputData>(transformedData, reuseRowObject: false);
+            }
+            catch (ArgumentOutOfRangeException exception)
             {
-                Console.WriteLine($"Age and Education were not available, so an exception was thrown: {exception.Message}.");
+                Console.WriteLine($"ExtraColumn is not available, so an exception is thrown: {exception.Message}.");
             }
 
             // Expected output:
-            // Age and Education were not available, so an exception was thrown: Could not find column 'Age'.
+            // ExtraColumn is not available, so an exception is thrown: Could not find column 'ExtraColumn'.
             // Parameter name: Schema
 
             // And we can write a few columns out to see that the rest of the data is still available.
-            var rowEnumerable = mlContext.Data.CreateEnumerable<SampleInfertDataTransformed>(transformedData, reuseRowObject: false);
+            var rowEnumerable = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: false);
             Console.WriteLine($"The columns we didn't drop are still available.");
             foreach (var row in rowEnumerable)
-            {
-                Console.WriteLine($"Case: {row.Case} Induced: {row.Induced} Parity: {row.Parity}");
-            }
+                Console.WriteLine($"Age: {row.Age} Gender: {row.Gender} Education: {row.Education}");
 
             // Expected output:
             // The columns we didn't drop are still available.
-            // Case: 1 Induced: 1 Parity: 6
-            // Case: 1 Induced: 1 Parity: 1
-            // Case: 1 Induced: 2 Parity: 6
-            // Case: 1 Induced: 2 Parity: 4
-            // Case: 1 Induced: 1 Parity: 3
+            // Age: 21 Gender: Male Education: BS
+            // Age: 23 Gender: Female Education: MBA
+            // Age: 28 Gender: Male Education: PhD
+            // Age: 22 Gender: Male Education: BS
+            // Age: 23 Gender: Female Education: MS
+            // Age: 27 Gender: Female Education: PhD
         }
 
-        private class SampleInfertDataNonExistentColumns
+        private class InputData
         {
-            public float Age { get; set; }
-            public float Education { get; set; }
+            public int Age { get; set; }
+            public string Gender { get; set; }
+            public string Education { get; set; }
+            public float ExtraColumn { get; set; }
         }
 
-        private class SampleInfertDataTransformed
+        private class TransformedData
         {
-            public float Case { get; set; }
-            public float Induced { get; set; }
-            public float Parity { get; set; }
+            public int Age { get; set; }
+            public string Gender { get; set; }
+            public string Education { get; set; }
        }
    }
}
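The try/catch in the updated sample demonstrates the failure mode when reading a dropped column. If you only want to test whether a column survived the drop, the output schema can be queried directly instead. A minimal sketch, not from this PR (the class name is mine), assuming DataViewSchema.GetColumnOrNull returns null for absent columns:

using System;
using System.Collections.Generic;
using Microsoft.ML;

public static class DropColumnsSchemaCheck
{
    public static void Example()
    {
        var mlContext = new MLContext();
        var samples = new List<InputData>()
        {
            new InputData() { Age = 21, Gender = "Male", Education = "BS", ExtraColumn = 1 },
        };
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        var transformed = mlContext.Transforms.DropColumns("ExtraColumn")
            .Fit(dataview)
            .Transform(dataview);

        // GetColumnOrNull returns null when the column is absent from the schema,
        // so no exception is needed to detect the dropped column.
        Console.WriteLine($"ExtraColumn present: {transformed.Schema.GetColumnOrNull("ExtraColumn") != null}");
        Console.WriteLine($"Age present: {transformed.Schema.GetColumnOrNull("Age") != null}");
        // Expected (assumption):
        // ExtraColumn present: False
        // Age present: True
    }

    private class InputData
    {
        public int Age;
        public string Gender;
        public string Education;
        public float ExtraColumn;
    }
}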