Skip to content

Commit d23d88e

Browse files
authored
Misc doc/sample fixes, improvements, and typos (#4899)
* Fix 2655 * xref typo in DataKind - related 4119 * Fix 3627 * Fix 3927 * Fix SvmLight sample * Fix 3841 * Partially addresses 3891 * Further clarify TextLoader documentation, fix 4898 * PR feedback * More PR feedback * Fix 3407 - clarify metrics * Feedback * Bad XML
1 parent e5a19af commit d23d88e

20 files changed

+411
-139
lines changed

docs/api-reference/time-series-pvalue.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ The lower its value, the more likely it is a spike. The p-value score is always
77

88
This score is the p-value of the current computed raw score according to a distribution of raw scores.
99
Here, the distribution is estimated based on the most recent raw score values up to certain depth back in the history.
10-
More specifically, this distribution is estimated using [kernel density estimation (https://en.wikipedia.org/wiki/Kernel_density_estimation) with the Gaussian [kernels](https://en.wikipedia.org/wiki/Kernel_(statistics)#In_non-parametric_statistics) of adaptive bandwidth.
10+
More specifically, this distribution is estimated using [kernel density estimation](https://en.wikipedia.org/wiki/Kernel_density_estimation) with the Gaussian [kernels](https://en.wikipedia.org/wiki/Kernel_(statistics)#In_non-parametric_statistics) of adaptive bandwidth.
1111

1212
If the p-value score exceeds $1 - \frac{\text{confidence}}{100}$, the associated timestamp may get a non-zero alert value in spike detection, which means a spike point is detected.
1313
Note that $\text{confidence}$ is defined in the signatures of [DetectIidSpike](xref:Microsoft.ML.TimeSeriesCatalog.DetectIidSpike(Microsoft.ML.TransformsCatalog,System.String,System.String,System.Int32,System.Int32,Microsoft.ML.Transforms.TimeSeries.AnomalySide))

docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/LoadingSvmLight.cs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
11
using System;
2-
using System.Collections.Generic;
32
using System.IO;
43
using System.Text;
54
using Microsoft.ML;
65
using Microsoft.ML.Data;
7-
using Microsoft.ML.Transforms;
8-
using Microsoft.VisualBasic.CompilerServices;
9-
using Tensorflow;
106

117
namespace Samples.Dynamic.DataOperations
128
{
@@ -31,9 +27,9 @@ public static void Example()
3127
else
3228
sb.Append("-1 ");
3329
if (line % 2 == 0)
34-
sb.Append("cost:1");
30+
sb.Append("cost:1 ");
3531
else
36-
sb.Append("cost:2");
32+
sb.Append("cost:2 ");
3733
for (int i = 1; i <= 10; i++)
3834
{
3935
if (random.NextDouble() > 0.5)

docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/LoadingText.cs

Lines changed: 132 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,26 +14,37 @@ public static void Example()
1414
{
1515
// Create 5 data files to illustrate different loading methods.
1616
var dataFiles = new List<string>();
17-
var random = new Random();
17+
var random = new Random(1);
1818
var dataDirectoryName = "DataDir";
1919
Directory.CreateDirectory(dataDirectoryName);
2020
for (int i = 0; i < 5; i++)
2121
{
2222
var fileName = Path.Combine(dataDirectoryName, $"Data_{i}.csv");
2323
dataFiles.Add(fileName);
2424
using (var fs = File.CreateText(fileName))
25-
// Write random lines without header
25+
{
26+
// Write without header with 10 random columns, forcing
27+
// approximately 80% of values to be 0.
2628
for (int line = 0; line < 10; line++)
27-
fs.WriteLine(random.NextDouble().ToString());
29+
{
30+
var sb = new StringBuilder();
31+
for (int pos = 0; pos < 10; pos++)
32+
{
33+
var value = random.NextDouble();
34+
sb.Append((value < 0.8 ? 0 : value).ToString() + '\t');
35+
}
36+
fs.WriteLine(sb.ToString(0, sb.Length - 1));
37+
}
38+
}
2839
}
2940

3041
// Create a TextLoader.
3142
var mlContext = new MLContext();
3243
var loader = mlContext.Data.CreateTextLoader(
3344
columns: new[]
34-
{
35-
new TextLoader.Column("RandomFeature", DataKind.Single, 0)
36-
},
45+
{
46+
new TextLoader.Column("Features", DataKind.Single, 0, 9)
47+
},
3748
hasHeader: false
3849
);
3950

@@ -55,11 +66,119 @@ public static void Example()
5566

5667
// Load all files using path wildcard.
5768
var multipleFilesWildcardData =
58-
loader.Load(Path.Combine(dataDirectoryName, "*"));
69+
loader.Load(Path.Combine(dataDirectoryName, "Data_*.csv"));
5970
PrintRowCount(multipleFilesWildcardData);
6071

6172
// Expected Output:
6273
// 50
74+
75+
76+
// Create a TextLoader with user defined type.
77+
var loaderWithCustomType =
78+
mlContext.Data.CreateTextLoader<Data>(hasHeader: false);
79+
80+
// Load a single file from path.
81+
var singleFileCustomTypeData = loaderWithCustomType.Load(dataFiles[0]);
82+
PrintRowCount(singleFileCustomTypeData);
83+
84+
// Expected Output:
85+
// 10
86+
87+
88+
// Create a TextLoader with unknown column length to illustrate
89+
// how a data sample may be used to infer column size.
90+
var dataSample = new MultiFileSource(dataFiles[0]);
91+
var loaderWithUnknownLength = mlContext.Data.CreateTextLoader(
92+
columns: new[]
93+
{
94+
new TextLoader.Column("Features",
95+
DataKind.Single,
96+
new[] { new TextLoader.Range(0, null) })
97+
},
98+
dataSample: dataSample
99+
);
100+
101+
var dataWithInferredLength = loaderWithUnknownLength.Load(dataFiles[0]);
102+
var featuresColumn = dataWithInferredLength.Schema.GetColumnOrNull("Features");
103+
if (featuresColumn.HasValue)
104+
Console.WriteLine(featuresColumn.Value.ToString());
105+
106+
// Expected Output:
107+
// Features: Vector<Single, 10>
108+
//
109+
// ML.NET infers the correct length of 10 for the Features column,
110+
// which is of type Vector<Single>.
111+
112+
PrintRowCount(dataWithInferredLength);
113+
114+
// Expected Output:
115+
// 10
116+
117+
118+
// Save the data with 10 rows to a text file to illustrate the use of
119+
// sparse format.
120+
var sparseDataFileName = Path.Combine(dataDirectoryName, "saved_data.tsv");
121+
using (FileStream stream = new FileStream(sparseDataFileName, FileMode.Create))
122+
mlContext.Data.SaveAsText(singleFileData, stream);
123+
124+
// Since there are many zeroes in the data, it will be saved in a sparse
125+
// representation to save disk space. The data may be forced to be saved
126+
// in a dense representation by setting forceDense to true. The sparse
127+
// data will look like the following:
128+
//
129+
// 10 7:0.943862259
130+
// 10 3:0.989767134
131+
// 10 0:0.949778438 8:0.823028445 9:0.886469543
132+
//
133+
// The sparse representation of the first row indicates that there are
134+
// 10 columns, the column 7 (8-th column) has value 0.943862259, and other
135+
// omitted columns have value 0.
136+
137+
// Create a TextLoader that allows sparse input.
138+
var sparseLoader = mlContext.Data.CreateTextLoader(
139+
columns: new[]
140+
{
141+
new TextLoader.Column("Features", DataKind.Single, 0, 9)
142+
},
143+
allowSparse: true
144+
);
145+
146+
// Load the saved sparse data.
147+
var sparseData = sparseLoader.Load(sparseDataFileName);
148+
PrintRowCount(sparseData);
149+
150+
// Expected Output:
151+
// 10
152+
153+
154+
// Create a TextLoader without any column schema using TextLoader.Options.
155+
// Since the sparse data file was saved with ML.NET, it has the schema
156+
// encoded in its header that the loader can understand:
157+
//
158+
// #@ TextLoader{
159+
// #@ sep=tab
160+
// #@ col=Features:R4:0-9
161+
// #@ }
162+
//
163+
// The schema syntax is unimportant since it is only used internally. In
164+
// short, it tells the loader that the values are separated by tabs, and
165+
// that columns 0-9 in the text file are to be read into one column named
166+
// "Features" of type Single (internal type R4).
167+
168+
var options = new TextLoader.Options()
169+
{
170+
AllowSparse = true,
171+
};
172+
var dataSampleWithSchema = new MultiFileSource(sparseDataFileName);
173+
var sparseLoaderWithSchema =
174+
mlContext.Data.CreateTextLoader(options, dataSample: dataSampleWithSchema);
175+
176+
// Load the saved sparse data.
177+
var sparseDataWithSchema = sparseLoaderWithSchema.Load(sparseDataFileName);
178+
PrintRowCount(sparseDataWithSchema);
179+
180+
// Expected Output:
181+
// 10
63182
}
64183

65184
private static void PrintRowCount(IDataView idv)
@@ -73,5 +192,11 @@ private static void PrintRowCount(IDataView idv)
73192

74193
Console.WriteLine(rowCount);
75194
}
195+
196+
private class Data
197+
{
198+
[LoadColumn(0, 9)]
199+
public float[] Features { get; set; }
200+
}
76201
}
77202
}

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ namespace Samples.Dynamic
66
{
77
public static class CustomMapping
88
{
9+
// This example shows how to define and apply a custom mapping of input
10+
// columns to output columns without defining a contract. Since a contract
11+
// is not defined, the pipeline containing this mapping cannot be saved and
12+
// loaded back.
913
public static void Example()
1014
{
1115
// Create a new ML context, for ML.NET operations. It can be used for

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ namespace Samples.Dynamic
77
{
88
public static class CustomMappingSaveAndLoad
99
{
10+
// This example shows how to define and apply a custom mapping of input
11+
// columns to output columns with a contract name. The contract name is
12+
// used in the CustomMappingFactoryAttribute that decorates the custom
13+
// mapping action. The pipeline containing the custom mapping can then be
14+
// saved to disk, and it can be loaded back after the assembly containing
15+
// the custom mapping action is registered.
1016
public static void Example()
1117
{
1218
// Create a new ML context, for ML.NET operations. It can be used for
@@ -24,7 +30,11 @@ public static void Example()
2430
var data = mlContext.Data.LoadFromEnumerable(samples);
2531

2632
// Custom transformations can be used to transform data directly, or as
27-
// part of a pipeline of estimators.
33+
// part of a pipeline of estimators. The contractName must be provided
34+
// in order for a pipeline containing a CustomMapping estimator to be
35+
// saved and loaded back. The contractName must be the same as in the
36+
// CustomMappingFactoryAttribute used to decorate the custom action
37+
// defined by the user.
2838
var pipeline = mlContext.Transforms.CustomMapping(new
2939
IsUnderThirtyCustomAction().GetMapping(), contractName:
3040
"IsUnderThirty");

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingWithInMemoryCustomType.cs

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,16 @@ namespace Samples.Dynamic
88
{
99
class CustomMappingWithInMemoryCustomType
1010
{
11+
// This example shows how custom mapping actions can be performed on custom data
12+
// types that ML.NET doesn't know yet. The example tells a story of how two alien
13+
// bodies are merged to form a super alien with a single body.
14+
//
15+
// Here, the type AlienHero represents a single alien entity with a member "Name"
16+
// of type string and members "One" and "Two" of type AlienBody. It defines a custom
17+
// mapping action AlienFusionProcess that takes an AlienHero and "fuses" its two
18+
// AlienBody members to produce a SuperAlienHero entity with a "Name" member of type
19+
// string and a single "Merged" member of type AlienBody, where the merger is just
20+
// the addition of the various members of AlienBody.
1121
static public void Example()
1222
{
1323
var mlContext = new MLContext();
@@ -33,7 +43,7 @@ static public void Example()
3343
+ firstAlien.Merged.HandCount + " hands.");
3444

3545
// Expected output:
36-
// We got a super alien with name Super Unknown, age 4002, height 6000, weight 8000, and 10000 hands.
46+
// We got a super alien with name Super ML.NET, age 4002, height 6000, weight 8000, and 10000 hands.
3747

3848
// Create a prediction engine and print out its prediction.
3949
var engine = mlContext.Model.CreatePredictionEngine<AlienHero,
@@ -47,11 +57,14 @@ static public void Example()
4757
", and " + superAlien.Merged.HandCount + " hands.");
4858

4959
// Expected output:
50-
// We got a super alien with name Super Unknown, age 6, height 8, weight 10, and 12 hands.
60+
// We got a super alien with name Super TEN.LM, age 6, height 8, weight 10, and 12 hands.
5161
}
5262

5363
// A custom type which ML.NET doesn't know yet. Its value will be loaded as
54-
// a DataView column in this test.
64+
// a DataView column in this example.
65+
//
66+
// The type members represent the characteristics of an alien body that will
67+
// be merged in the AlienFusionProcess.
5568
private class AlienBody
5669
{
5770
public int Age { get; set; }
@@ -68,7 +81,11 @@ public AlienBody(int age, float height, float weight, int handCount)
6881
}
6982
}
7083

71-
// DataViewTypeAttribute applied to class AlienBody members.
84+
// DataViewTypeAttribute applied to class AlienBody members. This attribute
85+
// defines how class AlienBody is registered in ML.NET's type system. In this
86+
// case, AlienBody is registered as DataViewAlienBodyType in ML.NET. The RaceId
87+
// property allows different members of type AlienBody to be registered with
88+
// different types in ML.NET (see usage in class AlienHero).
7289
private sealed class AlienTypeAttributeAttribute : DataViewTypeAttribute
7390
{
7491
public int RaceId { get; }
@@ -98,15 +115,18 @@ public override bool Equals(DataViewTypeAttribute other)
98115
}
99116

100117
// A custom class with a type which ML.NET doesn't know yet. Its value will
101-
// be loaded as a DataView row in this test. It will be the input of
118+
// be loaded as a DataView row in this example. It will be the input of
102119
// AlienFusionProcess.MergeBody(AlienHero, SuperAlienHero).
103120
//
104-
// The members One> and Two" would be mapped to different types inside
121+
// The members One and Two would be mapped to different types inside
105122
// ML.NET type system because they have different
106123
// AlienTypeAttributeAttribute's. For example, the column type of One would
107-
// be DataViewAlienBodyType
108-
// with RaceId=100.
109-
// </summary>
124+
// be DataViewAlienBodyType with RaceId=100.
125+
//
126+
// This type represents a "Hero" Alien that is a single entity with two bodies.
127+
// The "Hero" undergoes a fusion process defined in AlienFusionProcess to
128+
// become a SuperAlienHero with a single body that is a merger of the two
129+
// bodies.
110130
private class AlienHero
111131
{
112132
public string Name { get; set; }
@@ -129,14 +149,16 @@ public AlienHero(string name,
129149
int anotherAge, float anotherHeight, float anotherWeight, int
130150
anotherHandCount)
131151
{
132-
Name = "Unknown";
152+
Name = name;
133153
One = new AlienBody(age, height, weight, handCount);
134154
Two = new AlienBody(anotherAge, anotherHeight, anotherWeight,
135155
anotherHandCount);
136156
}
137157
}
138158

139-
// Type of AlienBody in ML.NET's type system.
159+
// Type of AlienBody in ML.NET's type system. This is the data view type that
160+
// will represent AlienBody in ML.NET's type system when it is registered as
161+
// such in AlienTypeAttributeAttribute.
140162
// It usually shows up as DataViewSchema.Column.Type among IDataView.Schema.
141163
private class DataViewAlienBodyType : StructuredDataViewType
142164
{
@@ -162,6 +184,8 @@ public override int GetHashCode()
162184

163185
// The output type of processing AlienHero using AlienFusionProcess
164186
// .MergeBody(AlienHero, SuperAlienHero).
187+
// This is a "fused" alien whose body is a merger of the two bodies
188+
// of AlienHero.
165189
private class SuperAlienHero
166190
{
167191
public string Name { get; set; }
@@ -194,6 +218,5 @@ public static Action<AlienHero, SuperAlienHero> GetMapping()
194218
return MergeBody;
195219
}
196220
}
197-
198221
}
199222
}

src/Microsoft.ML.Core/Data/DataKind.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ namespace Microsoft.ML.Data
1616
///
1717
/// | Type | Default Value | IsDefault Indicator |
1818
/// | -- | -- | -- |
19-
/// | <xref:Microsoft.ML.Data.DataKind.String> or [text](xref:Microsoft.ML.Data.TextDataViewType) | Empty or `null` string (both result in empty `System.ReadOnlyMemory<char>` | <xref:"System.ReadOnlyMemory`1.IsEmpty*> |
19+
/// | <xref:Microsoft.ML.Data.DataKind.String> or [text](xref:Microsoft.ML.Data.TextDataViewType) | Empty or `null` string (both result in empty `System.ReadOnlyMemory<char>`) | <xref:System.ReadOnlyMemory`1.IsEmpty*> |
2020
/// | [Key](xref:Microsoft.ML.Data.KeyDataViewType) type (supported by the unsigned integer types in `DataKind`) | Not defined | Always `false` |
2121
/// | All other types | Default value of the corresponding system type as defined by .NET standard. In C#, default value expression `default(T)` provides that value. | Equality test with the default value |
2222
///

src/Microsoft.ML.Data/DataLoadSave/EstimatorChain.cs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,12 @@ public EstimatorChain<TNewTrans> Append<TNewTrans>(IEstimator<TNewTrans> estimat
9393

9494
/// <summary>
9595
/// Append a 'caching checkpoint' to the estimator chain. This will ensure that the downstream estimators will be trained against
96-
/// cached data. It is helpful to have a caching checkpoint before trainers that take multiple data passes.
96+
/// cached data. It is helpful to have a caching checkpoint before trainers or feature engineering that take multiple data passes.
97+
/// It is also helpful to have one after a slow operation, for example after dataset loading from a slow source or after feature
98+
/// engineering that is slow on its apply phase, if downstream estimators will do multiple passes over the output of this operation.
99+
/// Adding a cache checkpoint at the end of an <see cref="EstimatorChain{TLastTransformer}"/> is meaningless and should be avoided.
100+
/// Cache checkpoints should be removed if disk thrashing or OutOfMemory exceptions are seen, which can occur when the featurized
101+
/// dataset immediately prior to the checkpoint is larger than available RAM.
97102
/// </summary>
98103
/// <param name="env">The host environment to use for caching.</param>
99104
public EstimatorChain<TLastTransformer> AppendCacheCheckpoint(IHostEnvironment env)

0 commit comments

Comments
 (0)