Skip to content

Bring ensembles into codebase #379

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Jun 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Microsoft.ML.sln
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Sweeper.Tests"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.LightGBM", "src\Microsoft.ML.LightGBM\Microsoft.ML.LightGBM.csproj", "{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Ensemble", "src\Microsoft.ML.Ensemble\Microsoft.ML.Ensemble.csproj", "{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -228,6 +230,10 @@ Global
{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25}.Debug|Any CPU.Build.0 = Debug|Any CPU
{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25}.Release|Any CPU.ActiveCfg = Release|Any CPU
{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25}.Release|Any CPU.Build.0 = Release|Any CPU
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27}.Debug|Any CPU.Build.0 = Debug|Any CPU
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -267,6 +273,7 @@ Global
{9252A8EB-ABFB-440C-AB4D-1D562753CE0F} = {487213C9-E8A9-4F94-85D7-28A05DBBFE3A}
{3DEB504D-7A07-48CE-91A2-8047461CB3D4} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
Expand Down
22 changes: 22 additions & 0 deletions src/Microsoft.ML.Ensemble/Batch.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime.Data;

namespace Microsoft.ML.Runtime.Ensemble
{
/// <summary>
/// Pairs one set of training data with one set of test data. Both references are
/// stored in read-only fields and are fixed once the constructor has run.
/// </summary>
public sealed class Batch
{
    /// <summary>
    /// The data supplied to the constructor as <c>trainData</c>.
    /// </summary>
    public readonly RoleMappedData TrainInstances;

    /// <summary>
    /// The data supplied to the constructor as <c>testData</c>.
    /// </summary>
    public readonly RoleMappedData TestInstances;

    /// <summary>
    /// Creates a batch from the given training and test data. Each argument is
    /// validated with <c>Contracts.CheckValue</c> before it is stored.
    /// </summary>
    /// <param name="trainData">The data to expose as <see cref="TrainInstances"/>.</param>
    /// <param name="testData">The data to expose as <see cref="TestInstances"/>.</param>
    public Batch(RoleMappedData trainData, RoleMappedData testData)
    {
        Contracts.CheckValue(trainData, nameof(trainData));
        TrainInstances = trainData;

        Contracts.CheckValue(testData, nameof(testData));
        TestInstances = testData;
    }
}
}
114 changes: 114 additions & 0 deletions src/Microsoft.ML.Ensemble/EnsembleUtils.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Internal.Utilities;

namespace Microsoft.ML.Runtime.Ensemble
{
/// <summary>
/// Helpers for restricting a features column to a subset of its slots, where the
/// subset is described by a <see cref="BitArray"/> with one bit per slot.
/// </summary>
internal static class EnsembleUtils
{
    /// <summary>
    /// Returns a dataset whose features column keeps only the slots flagged in
    /// <paramref name="features"/>; every other slot is replaced by the default value.
    /// </summary>
    /// <param name="host">Host handed to the column mapper that performs the selection.</param>
    /// <param name="data">Source data; its schema must have a feature column.</param>
    /// <param name="features">One bit per feature slot; a set bit means the slot is kept.</param>
    /// <returns>
    /// <paramref name="data"/> itself when every slot is selected; otherwise a new
    /// <see cref="RoleMappedData"/> built over the mapped view with the same column role names.
    /// </returns>
    public static RoleMappedData SelectFeatures(IHost host, RoleMappedData data, BitArray features)
    {
        Contracts.AssertValue(host);
        Contracts.AssertValue(data);
        Contracts.Assert(data.Schema.Feature != null);
        Contracts.AssertValue(features);

        var featureColumn = data.Schema.Feature;
        var featureType = featureColumn.Type;
        Contracts.Assert(features.Length == featureType.VectorSize);

        // Nothing to mask out when every slot is selected, so hand back the input untouched.
        int selectedCount = Utils.GetCardinality(features);
        if (selectedCount == featureType.VectorSize)
            return data;

        // REVIEW: This doesn't preserve metadata on the features column. Should it?
        // The mapper reads and writes the same column name, so the mapped column
        // takes the place of the original features column in the resulting view.
        var columnName = featureColumn.Name;
        var maskedView = LambdaColumnMapper.Create(
            host, "FeatureSelector", data.Data, columnName, columnName, featureType, featureType,
            (ref VBuffer<Single> input, ref VBuffer<Single> output) =>
                SelectFeatures(ref input, features, selectedCount, ref output));

        return RoleMappedData.Create(maskedView, data.Schema.GetColumnRoleNames());
    }

    /// <summary>
    /// Copies into <paramref name="dst"/> the entries of <paramref name="src"/> whose slot index
    /// is set in <paramref name="includedIndices"/>; all other slots end up as default(T).
    /// The resulting <paramref name="dst"/> has the same length as <paramref name="src"/>.
    /// The value and index arrays already held by <paramref name="dst"/> are reused when
    /// they are large enough.
    /// </summary>
    /// <param name="src">The buffer to read from; may be dense or sparse.</param>
    /// <param name="includedIndices">One bit per slot of <paramref name="src"/>.</param>
    /// <param name="cardinality">
    /// The number of set bits in <paramref name="includedIndices"/>; asserted to be
    /// strictly less than the length of <paramref name="src"/>.
    /// </param>
    /// <param name="dst">Receives the filtered buffer.</param>
    public static void SelectFeatures<T>(ref VBuffer<T> src, BitArray includedIndices, int cardinality, ref VBuffer<T> dst)
    {
        Contracts.Assert(Utils.Size(includedIndices) == src.Length);
        Contracts.Assert(cardinality == Utils.GetCardinality(includedIndices));
        Contracts.Assert(cardinality < src.Length);

        var outValues = dst.Values;
        var outIndices = dst.Indices;
        int length = src.Length;

        if (!src.IsDense)
        {
            // Sparse input: at most min(src.Count, cardinality) entries can survive, so the
            // existing buffers are replaced only when they are too small for both bounds.
            int valuesCapacity = Utils.Size(outValues);
            int indicesCapacity = Utils.Size(outIndices);
            bool smallerThanSource = valuesCapacity < src.Count || indicesCapacity < src.Count;
            if (smallerThanSource)
            {
                if (valuesCapacity < cardinality)
                    outValues = new T[cardinality];
                if (indicesCapacity < cardinality)
                    outIndices = new int[cardinality];
            }

            int keptSparse = 0;
            for (int i = 0; i < src.Count; i++)
            {
                int slot = src.Indices[i];
                if (!includedIndices[slot])
                    continue;

                outValues[keptSparse] = src.Values[i];
                outIndices[keptSparse] = slot;
                keptSparse++;
            }

            dst = new VBuffer<T>(length, keptSparse, outValues, outIndices);
            return;
        }

        if (cardinality >= length / 2)
        {
            // Dense input with at least half of the slots kept: emit a dense buffer
            // and blank out the slots that are not selected.
            if (Utils.Size(outValues) < length)
                outValues = new T[length];

            for (int i = 0; i < length; i++)
            {
                if (includedIndices[i])
                    outValues[i] = src.Values[i];
                else
                    outValues[i] = default(T);
            }

            dst = new VBuffer<T>(length, outValues, outIndices);
            return;
        }

        // Dense input with fewer than half of the slots kept: emit a sparse buffer
        // holding exactly the selected slots.
        if (Utils.Size(outValues) < cardinality)
            outValues = new T[cardinality];
        if (Utils.Size(outIndices) < cardinality)
            outIndices = new int[cardinality];

        int keptDense = 0;
        for (int i = 0; i < length; i++)
        {
            if (!includedIndices[i])
                continue;

            Contracts.Assert(keptDense < cardinality);
            outValues[keptDense] = src.Values[i];
            outIndices[keptDense] = i;
            keptDense++;
        }

        Contracts.Assert(keptDense == cardinality);
        dst = new VBuffer<T>(length, keptDense, outValues, outIndices);
    }
}
}
Loading