Skip to content

XML docs for trainers and minor infrastructure changes #455

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jul 2, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/Microsoft.ML.Core/EntryPoints/ModuleArgs.cs
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,11 @@ public sealed class EntryPointAttribute : Attribute
/// Short name of the Entry Point
/// </summary>
public string ShortName { get; set; }

/// <summary>
/// Remarks on the Entry Point, for more extensive XML documentation on the C# API
/// </summary>
public string Remarks { get; set; }
}

/// <summary>
Expand Down
2 changes: 2 additions & 0 deletions src/Microsoft.ML.Core/EntryPoints/ModuleCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ public sealed class EntryPointInfo
public readonly string Description;
public readonly string ShortName;
public readonly string FriendlyName;
public readonly string Remarks;
public readonly MethodInfo Method;
public readonly Type InputType;
public readonly Type OutputType;
Expand All @@ -63,6 +64,7 @@ internal EntryPointInfo(IExceptionContext ectx, MethodInfo method,
Method = method;
ShortName = attribute.ShortName;
FriendlyName = attribute.UserName;
Remarks = attribute.Remarks;
ObsoleteAttribute = obsoleteAttribute;

// There are supposed to be 2 parameters, env and input for non-macro nodes.
Expand Down
25 changes: 25 additions & 0 deletions src/Microsoft.ML.FastTree/FastTree.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,31 @@ public abstract class FastTreeTrainerBase<TArgs, TPredictor> :

protected string InnerArgs => CmdParser.GetSettings(Host, Args, new TArgs());

/// <summary>
/// XML remarks fragment describing the MART gradient boosting algorithm behind the FastTree trainers.
/// Because the text is an XML fragment, a literal '&lt;' or '&amp;' in prose or in a URL is written as
/// an entity; otherwise the fragment is not well-formed once it is embedded in XML documentation.
/// </summary>
internal const string Remarks = @"<remarks>
<para>FastTrees is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm.
Gradient boosting is a machine learning technique for regression problems.
It builds each regression tree in a step-wise fashion, using a predefined loss function to measure the error for each step and corrects for it in the next.
So this prediction model is actually an ensemble of weaker prediction models. In regression problems, boosting builds a series of such trees in a step-wise fashion and then selects the optimal tree using an arbitrary differentiable loss function.
</para>
<para>
MART learns an ensemble of regression trees, which is a decision tree with scalar values in its leaves.
A decision (or regression) tree is a binary tree-like flow chart, where at each interior node one decides which of the two child nodes to continue to based on one of the feature values from the input.
At each leaf node, a value is returned. In the interior nodes, the decision is based on the test 'x &lt;= v' where x is the value of the feature in the input sample and v is one of the possible values of this feature.
The functions that can be produced by a regression tree are all the piece-wise constant functions.
</para>
<para>
The ensemble of trees is produced by computing, in each step, a regression tree that approximates the gradient of the loss function, and adding it to the previous tree with coefficients that minimize the loss of the new tree.
The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
</para>
<list type='bullet'>
<item>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</item>
<item>In case of a regression problem, the output is the predicted value of the function.</item>
<item>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</item>
</list>
<a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>.
<a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine.</a>.
</remarks>";

public override bool NeedNormalization => false;

public override bool WantCaching => false;
Expand Down
6 changes: 5 additions & 1 deletion src/Microsoft.ML.FastTree/FastTreeClassification.cs
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,11 @@ public void AdjustTreeOutputs(IChannel ch, RegressionTree tree,

public static partial class FastTree
{
[TlcModule.EntryPoint(Name = "Trainers.FastTreeBinaryClassifier", Desc = FastTreeBinaryClassificationTrainer.Summary, UserName = FastTreeBinaryClassificationTrainer.UserNameValue, ShortName = FastTreeBinaryClassificationTrainer.ShortName)]
[TlcModule.EntryPoint(Name = "Trainers.FastTreeBinaryClassifier",
Desc = FastTreeBinaryClassificationTrainer.Summary,
Remarks = FastTreeBinaryClassificationTrainer.Remarks,
UserName = FastTreeBinaryClassificationTrainer.UserNameValue,
ShortName = FastTreeBinaryClassificationTrainer.ShortName)]
public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastTreeBinaryClassificationTrainer.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
6 changes: 5 additions & 1 deletion src/Microsoft.ML.FastTree/FastTreeRanking.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1096,7 +1096,11 @@ public static FastTreeRankingPredictor Create(IHostEnvironment env, ModelLoadCon

public static partial class FastTree
{
[TlcModule.EntryPoint(Name = "Trainers.FastTreeRanker", Desc = FastTreeRankingTrainer.Summary, UserName = FastTreeRankingTrainer.UserNameValue, ShortName = FastTreeRankingTrainer.ShortName)]
[TlcModule.EntryPoint(Name = "Trainers.FastTreeRanker",
Desc = FastTreeRankingTrainer.Summary,
Remarks = FastTreeRankingTrainer.Remarks,
UserName = FastTreeRankingTrainer.UserNameValue,
ShortName = FastTreeRankingTrainer.ShortName)]
public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, FastTreeRankingTrainer.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
6 changes: 5 additions & 1 deletion src/Microsoft.ML.FastTree/FastTreeRegression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,11 @@ public static FastTreeRegressionPredictor Create(IHostEnvironment env, ModelLoad

public static partial class FastTree
{
[TlcModule.EntryPoint(Name = "Trainers.FastTreeRegressor", Desc = FastTreeRegressionTrainer.Summary, UserName = FastTreeRegressionTrainer.UserNameValue, ShortName = FastTreeRegressionTrainer.ShortName)]
[TlcModule.EntryPoint(Name = "Trainers.FastTreeRegressor",
Desc = FastTreeRegressionTrainer.Summary,
Remarks = FastTreeRegressionTrainer.Remarks,
UserName = FastTreeRegressionTrainer.UserNameValue,
ShortName = FastTreeRegressionTrainer.ShortName)]
public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastTreeRegressionTrainer.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
12 changes: 9 additions & 3 deletions src/Microsoft.ML.FastTree/FastTreeTweedie.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@ public sealed partial class FastTreeTweedieTrainer : BoostingFastTreeTrainerBase
{
public const string LoadNameValue = "FastTreeTweedieRegression";
public const string UserNameValue = "FastTree (Boosted Trees) Tweedie Regression";
public const string Summary = "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner " +
"is a generalization of Poisson, compound Poisson, and gamma regression.";
public const string Summary = "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression.";
/// <summary>
/// XML remarks fragment for the Tweedie trainer, containing reference links only.
/// Declared with 'new', so it hides an inherited member of the same name.
/// The '&amp;' separators in the URL query string are written as entities so the fragment stays well-formed XML.
/// </summary>
// NOTE(review): the "Trainers.FastTreeTweedieRegressor" entry point attribute shown for this file
// does not set Remarks, so this constant may not reach the generated documentation - confirm.
new public const string Remarks = @"<remarks>
<a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>
<a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine</a>
</remarks>";

public const string ShortName = "fttweedie";

Expand Down Expand Up @@ -460,7 +463,10 @@ protected override void Map(ref VBuffer<float> src, ref float dst)

public static partial class FastTree
{
[TlcModule.EntryPoint(Name = "Trainers.FastTreeTweedieRegressor", Desc = FastTreeTweedieTrainer.Summary, UserName = FastTreeTweedieTrainer.UserNameValue, ShortName = FastTreeTweedieTrainer.ShortName)]
[TlcModule.EntryPoint(Name = "Trainers.FastTreeTweedieRegressor",
Desc = FastTreeTweedieTrainer.Summary,
UserName = FastTreeTweedieTrainer.UserNameValue,
ShortName = FastTreeTweedieTrainer.ShortName)]
public static CommonOutputs.RegressionOutput TrainTweedieRegression(IHostEnvironment env, FastTreeTweedieTrainer.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
22 changes: 22 additions & 0 deletions src/Microsoft.ML.FastTree/RandomForest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,28 @@ public abstract class RandomForestTrainerBase<TArgs, TPredictor> : FastTreeTrain
where TArgs : FastForestArgumentsBase, new()
where TPredictor : IPredictorProducing<Float>
{
/// <summary>
/// XML remarks fragment describing decision trees in general and the Fast forest (random forest)
/// implementation, followed by reference links.
/// Declared with 'new', so it hides an inherited member of the same name.
/// Presumably this is the value the FastForest entry points pass as their Remarks - confirm against the callers.
/// </summary>
new internal const string Remarks = @"<remarks>
Decision trees are non-parametric models that perform a sequence of simple tests on inputs.
This decision procedure maps them to outputs found in the training dataset whose inputs were similar to the instance being processed.
A decision is made at each node of the binary tree data structure based on a measure of similarity that maps each instance recursively through the branches of the tree until the appropriate leaf node is reached and the output decision returned.
<para>Decision trees have several advantages:</para>
<list type='bullet'>
<item>They are efficient in both computation and memory usage during training and prediction. </item>
<item>They can represent non-linear decision boundaries.</item>
<item>They perform integrated feature selection and classification. </item>
<item>They are resilient in the presence of noisy features.</item>
</list>
Fast forest is a random forest implementation.
The model consists of an ensemble of decision trees. Each tree in a decision forest outputs a Gaussian distribution by way of prediction.
An aggregation is performed over the ensemble of trees to find a Gaussian distribution closest to the combined distribution for all trees in the model.
This decision forest classifier consists of an ensemble of decision trees.
Generally, ensemble models provide better coverage and accuracy than single decision trees.
Each tree in a decision forest outputs a Gaussian distribution.
<a href='http://en.wikipedia.org/wiki/Random_forest'>Wikipedia: Random forest</a>
<a href='http://jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf'>Quantile regression forest</a>
<a href='https://blogs.technet.microsoft.com/machinelearning/2014/09/10/from-stumps-to-trees-to-forests/'>From Stumps to Trees to Forests</a>
</remarks>";

private readonly bool _quantileEnabled;

protected RandomForestTrainerBase(IHostEnvironment env, TArgs args, bool quantileEnabled = false)
Expand Down
6 changes: 5 additions & 1 deletion src/Microsoft.ML.FastTree/RandomForestClassification.cs
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,11 @@ protected override void GetGradientInOneQuery(int query, int threadIndex)

public static partial class FastForest
{
[TlcModule.EntryPoint(Name = "Trainers.FastForestBinaryClassifier", Desc = FastForestClassification.Summary, UserName = FastForestClassification.UserNameValue, ShortName = FastForestClassification.ShortName)]
[TlcModule.EntryPoint(Name = "Trainers.FastForestBinaryClassifier",
Desc = FastForestClassification.Summary,
Remarks = FastForestClassification.Remarks,
UserName = FastForestClassification.UserNameValue,
ShortName = FastForestClassification.ShortName)]
public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastForestClassification.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
6 changes: 5 additions & 1 deletion src/Microsoft.ML.FastTree/RandomForestRegression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,11 @@ public BasicImpl(Dataset trainData, Arguments args)

public static partial class FastForest
{
[TlcModule.EntryPoint(Name = "Trainers.FastForestRegressor", Desc = FastForestRegression.Summary, UserName = FastForestRegression.LoadNameValue, ShortName = FastForestRegression.ShortName)]
[TlcModule.EntryPoint(Name = "Trainers.FastForestRegressor",
Desc = FastForestRegression.Summary,
Remarks = FastForestRegression.Remarks,
UserName = FastForestRegression.LoadNameValue,
ShortName = FastForestRegression.ShortName)]
public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastForestRegression.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
14 changes: 13 additions & 1 deletion src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ public class KMeansPlusPlusTrainer : TrainerBase<RoleMappedData, KMeansPredictor
internal const string Summary = "K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified "
+ "number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better "
+ "method for choosing the initial cluster centers.";
/// <summary>
/// XML remarks fragment for the K-means++ trainer: a short description of the Yinyang K-Means
/// acceleration followed by reference links.
/// Presumably this is the value the "Trainers.KMeansPlusPlusClusterer" entry point passes as its Remarks - confirm.
/// </summary>
internal const string Remarks = @"<remarks>
K-means++ improves upon K-means by using the <a href='http://research.microsoft.com/apps/pubs/default.aspx?id=252149'>Yinyang K-Means</a> method for choosing the initial cluster centers.
YYK-Means accelerates K-Means up to an order of magnitude while producing exactly the same clustering results (modulo floating point precision issues).
YYK-Means observes that there is a lot of redundancy across iterations in the KMeans algorithms and most points do not change their clusters during an iteration.
It uses various bounding techniques to identify this redundancy and eliminate many distance computations and optimize centroid computations.
<a href='https://en.wikipedia.org/wiki/K-means_clustering'>K-means</a>.
<a href='https://en.wikipedia.org/wiki/K-means%2b%2b'>K-means++</a>
</remarks>";

public enum InitAlgorithm
{
Expand Down Expand Up @@ -225,7 +233,11 @@ private static int ComputeNumThreads(IHost host, int? argNumThreads)
return Math.Max(1, maxThreads);
}

[TlcModule.EntryPoint(Name = "Trainers.KMeansPlusPlusClusterer", Desc = KMeansPlusPlusTrainer.Summary, UserName = UserNameValue, ShortName = ShortName)]
[TlcModule.EntryPoint(Name = "Trainers.KMeansPlusPlusClusterer",
Desc = Summary,
Remarks = Remarks,
UserName = UserNameValue,
ShortName = ShortName)]
public static CommonOutputs.ClusteringOutput TrainKMeans(IHostEnvironment env, Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
17 changes: 10 additions & 7 deletions src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

[assembly: LoadableClass(LightGbmBinaryTrainer.Summary, typeof(LightGbmBinaryTrainer), typeof(LightGbmArguments),
new[] { typeof(SignatureBinaryClassifierTrainer), typeof(SignatureTrainer), typeof(SignatureTreeEnsembleTrainer) },
"LightGBM Binary Classification", LightGbmBinaryTrainer.LoadNameValue, LightGbmBinaryTrainer.ShortName, DocName = "trainer/LightGBM.md")]
LightGbmBinaryTrainer.UserName, LightGbmBinaryTrainer.LoadNameValue, LightGbmBinaryTrainer.ShortName, DocName = "trainer/LightGBM.md")]

[assembly: LoadableClass(typeof(IPredictorProducing<float>), typeof(LightGbmBinaryPredictor), null, typeof(SignatureLoadModel),
"LightGBM Binary Executor",
Expand All @@ -27,6 +27,7 @@ public sealed class LightGbmBinaryPredictor : FastTreePredictionWrapper
{
public const string LoaderSignature = "LightGBMBinaryExec";
public const string RegistrationName = "LightGBMBinaryPredictor";

private static VersionInfo GetVersionInfo()
{
// REVIEW: can we decouple the version from FastTree predictor version ?
Expand Down Expand Up @@ -82,9 +83,10 @@ public static IPredictorProducing<float> Create(IHostEnvironment env, ModelLoadC

public sealed class LightGbmBinaryTrainer : LightGbmTrainerBase<float, IPredictorWithFeatureWeights<float>>
{
public const string Summary = "LightGBM Binary Classifier";
public const string LoadNameValue = "LightGBMBinary";
public const string ShortName = "LightGBM";
internal const string UserName = "LightGBM Binary Classifier";
internal const string LoadNameValue = "LightGBMBinary";
internal const string ShortName = "LightGBM";
internal const string Summary = "Train a LightGBM binary classification model.";

public LightGbmBinaryTrainer(IHostEnvironment env, LightGbmArguments args)
: base(env, args, PredictionKind.BinaryClassification, "LGBBINCL")
Expand Down Expand Up @@ -122,14 +124,15 @@ protected override void CheckAndUpdateParametersBeforeTraining(IChannel ch, Role
}

/// <summary>
/// A component to train an LightGBM model.
/// A component to train a LightGBM model.
/// </summary>
public static partial class LightGbm
{
[TlcModule.EntryPoint(
Name = "Trainers.LightGbmBinaryClassifier",
Desc = "Train a LightGBM binary class model.",
UserName = LightGbmBinaryTrainer.Summary,
Desc = LightGbmBinaryTrainer.Summary,
Remarks = LightGbmBinaryTrainer.Remarks,
UserName = LightGbmBinaryTrainer.UserName,
ShortName = LightGbmBinaryTrainer.ShortName)]
public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LightGbmArguments input)
{
Expand Down
3 changes: 2 additions & 1 deletion src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,14 @@ protected override void CheckAndUpdateParametersBeforeTraining(IChannel ch, Role
}

/// <summary>
/// A component to train an LightGBM model.
/// A component to train a LightGBM model.
/// </summary>
public static partial class LightGbm
{
[TlcModule.EntryPoint(
Name = "Trainers.LightGbmClassifier",
Desc = "Train a LightGBM multi class model.",
Remarks = LightGbmMulticlassTrainer.Remarks,
UserName = LightGbmMulticlassTrainer.Summary,
ShortName = LightGbmMulticlassTrainer.ShortName)]
public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, LightGbmArguments input)
Expand Down
12 changes: 6 additions & 6 deletions src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
using Microsoft.ML.Runtime.LightGBM;
using Microsoft.ML.Runtime.Model;

[assembly: LoadableClass(LightGbmRankingTrainer.Summary, typeof(LightGbmRankingTrainer), typeof(LightGbmArguments),
[assembly: LoadableClass(LightGbmRankingTrainer.UserName, typeof(LightGbmRankingTrainer), typeof(LightGbmArguments),
new[] { typeof(SignatureRankerTrainer), typeof(SignatureTrainer), typeof(SignatureTreeEnsembleTrainer) },
"LightGBM Ranking", LightGbmRankingTrainer.LoadNameValue, LightGbmRankingTrainer.ShortName, DocName = "trainer/LightGBM.md")]

Expand Down Expand Up @@ -73,7 +73,7 @@ public static LightGbmRankingPredictor Create(IHostEnvironment env, ModelLoadCon

public sealed class LightGbmRankingTrainer : LightGbmTrainerBase<float, LightGbmRankingPredictor>
{
public const string Summary = "LightGBM Ranking";
public const string UserName = "LightGBM Ranking";
public const string LoadNameValue = "LightGBMRanking";
public const string ShortName = "LightGBMRank";

Expand Down Expand Up @@ -123,14 +123,14 @@ protected override void CheckAndUpdateParametersBeforeTraining(IChannel ch, Role
}

/// <summary>
/// A component to train an LightGBM model.
/// A component to train a LightGBM model.
/// </summary>
public static partial class LightGbm
{
[TlcModule.EntryPoint(
Name = "Trainers.LightGbmRanker",
[TlcModule.EntryPoint(Name = "Trainers.LightGbmRanker",
Remarks = LightGbmMulticlassTrainer.Remarks,
Desc = "Train a LightGBM ranking model.",
UserName = LightGbmRankingTrainer.Summary,
UserName = LightGbmRankingTrainer.UserName,
ShortName = LightGbmRankingTrainer.ShortName)]
public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, LightGbmArguments input)
{
Expand Down
Loading