dotnet
diff --git a/‎docs/api-reference/time-series-pvalue.md
Lines changed: 1 addition & 1 deletion b/‎docs/api-reference/time-series-pvalue.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/LoadingSvmLight.cs
Lines changed: 2 additions & 6 deletions b/‎docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/LoadingSvmLight.cs
Lines changed: 2 additions & 6 deletions
diff --git a/‎docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/LoadingText.cs
Lines changed: 132 additions & 7 deletions b/‎docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/LoadingText.cs
Lines changed: 132 additions & 7 deletions
diff --git a/‎docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs
Lines changed: 4 additions & 0 deletions b/‎docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs
Lines changed: 4 additions & 0 deletions
diff --git a/‎docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs
Lines changed: 11 additions & 1 deletion b/‎docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs
Lines changed: 11 additions & 1 deletion
diff --git a/‎docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingWithInMemoryCustomType.cs
Lines changed: 35 additions & 12 deletions b/‎docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingWithInMemoryCustomType.cs
Lines changed: 35 additions & 12 deletions
diff --git a/‎src/Microsoft.ML.Core/Data/DataKind.cs
Lines changed: 1 addition & 1 deletion b/‎src/Microsoft.ML.Core/Data/DataKind.cs
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/Microsoft.ML.Data/DataLoadSave/EstimatorChain.cs
Lines changed: 6 additions & 1 deletion b/‎src/Microsoft.ML.Data/DataLoadSave/EstimatorChain.cs
Lines changed: 6 additions & 1 deletion
@@ -7,7 +7,7 @@ The lower its value, the more likely it is a spike. The p-value score is always
 
 This score is the p-value of the current computed raw score according to a distribution of raw scores.
 Here, the distribution is estimated based on the most recent raw score values up to certain depth back in the history.
-More specifically, this distribution is estimated using [kernel density estimation (https://en.wikipedia.org/wiki/Kernel_density_estimation) with the Gaussian [kernels](https://en.wikipedia.org/wiki/Kernel_(statistics)#In_non-parametric_statistics) of adaptive bandwidth.
+More specifically, this distribution is estimated using [kernel density estimation](https://en.wikipedia.org/wiki/Kernel_density_estimation) with the Gaussian [kernels](https://en.wikipedia.org/wiki/Kernel_(statistics)#In_non-parametric_statistics) of adaptive bandwidth.
 
 If the p-value score exceeds $1 - \frac{\text{confidence}}{100}$, the associated timestamp may get a non-zero alert value in spike detection, which means a spike point is detected.
 Note that $\text{confidence}$ is defined in the signatures of [DetectIidSpike](xref:Microsoft.ML.TimeSeriesCatalog.DetectIidSpike(Microsoft.ML.TransformsCatalog,System.String,System.String,System.Int32,System.Int32,Microsoft.ML.Transforms.TimeSeries.AnomalySide))
 
@@ -1,12 +1,8 @@
 using System;
-using System.Collections.Generic;
 using System.IO;
 using System.Text;
 using Microsoft.ML;
 using Microsoft.ML.Data;
-using Microsoft.ML.Transforms;
-using Microsoft.VisualBasic.CompilerServices;
-using Tensorflow;
 
 namespace Samples.Dynamic.DataOperations
 {
@@ -31,9 +27,9 @@ public static void Example()
                     else
                         sb.Append("-1 ");
                     if (line % 2 == 0)
-                        sb.Append("cost:1");
+                        sb.Append("cost:1 ");
                     else
-                        sb.Append("cost:2");
+                        sb.Append("cost:2 ");
                     for (int i = 1; i <= 10; i++)
                     {
                         if (random.NextDouble() > 0.5)
 
@@ -14,26 +14,37 @@ public static void Example()
         {
             // Create 5 data files to illustrate different loading methods.
             var dataFiles = new List<string>();
-            var random = new Random();
+            var random = new Random(1);
             var dataDirectoryName = "DataDir";
             Directory.CreateDirectory(dataDirectoryName);
             for (int i = 0; i < 5; i++)
             {
                 var fileName = Path.Combine(dataDirectoryName, $"Data_{i}.csv");
                 dataFiles.Add(fileName);
                 using (var fs = File.CreateText(fileName))
-                    // Write random lines without header
+                {
+                    // Write without header with 10 random columns, forcing
+                    // approximately 80% of values to be 0.
                     for (int line = 0; line < 10; line++)
-                        fs.WriteLine(random.NextDouble().ToString());
+                    {
+                        var sb = new StringBuilder();
+                        for (int pos = 0; pos < 10; pos++)
+                        {
+                            var value = random.NextDouble();
+                            sb.Append((value < 0.8 ? 0 : value).ToString() + '\t');
+                        }
+                        fs.WriteLine(sb.ToString(0, sb.Length - 1));
+                    }
+                }
             }
 
             // Create a TextLoader.
             var mlContext = new MLContext();
             var loader = mlContext.Data.CreateTextLoader(
                 columns: new[]
-                    {
-                        new TextLoader.Column("RandomFeature", DataKind.Single, 0)
-                    },
+                {
+                    new TextLoader.Column("Features", DataKind.Single, 0, 9)
+                },
                 hasHeader: false
             );
 
@@ -55,11 +66,119 @@ public static void Example()
 
             // Load all files using path wildcard.
             var multipleFilesWildcardData = 
-                loader.Load(Path.Combine(dataDirectoryName, "*"));
+                loader.Load(Path.Combine(dataDirectoryName, "Data_*.csv"));
             PrintRowCount(multipleFilesWildcardData);
 
             // Expected Output:
             //   50
+
+
+            // Create a TextLoader with user defined type.
+            var loaderWithCustomType =
+                mlContext.Data.CreateTextLoader<Data>(hasHeader: false);
+
+            // Load a single file from path.
+            var singleFileCustomTypeData = loaderWithCustomType.Load(dataFiles[0]);
+            PrintRowCount(singleFileCustomTypeData);
+
+            // Expected Output:
+            //   10
+
+
+            // Create a TextLoader with unknown column length to illustrate
+            // how a data sample may be used to infer column size.
+            var dataSample = new MultiFileSource(dataFiles[0]);
+            var loaderWithUnknownLength = mlContext.Data.CreateTextLoader(
+                columns: new[]
+                {
+                    new TextLoader.Column("Features",
+                                          DataKind.Single,
+                                          new[] { new TextLoader.Range(0, null) })
+                },
+                dataSample: dataSample
+            );
+
+            var dataWithInferredLength = loaderWithUnknownLength.Load(dataFiles[0]);
+            var featuresColumn = dataWithInferredLength.Schema.GetColumnOrNull("Features");
+            if (featuresColumn.HasValue)
+                Console.WriteLine(featuresColumn.Value.ToString());
+
+            // Expected Output:
+            //   Features: Vector<Single, 10>
+            //
+            // ML.NET infers the correct length of 10 for the Features column,
+            // which is of type Vector<Single>.
+
+            PrintRowCount(dataWithInferredLength);
+            
+            // Expected Output:
+            //   10
+
+
+            // Save the data with 10 rows to a text file to illustrate the use of
+            // sparse format.
+            var sparseDataFileName = Path.Combine(dataDirectoryName, "saved_data.tsv");
+            using (FileStream stream = new FileStream(sparseDataFileName, FileMode.Create))
+                mlContext.Data.SaveAsText(singleFileData, stream);
+
+            // Since there are many zeroes in the data, it will be saved in a sparse
+            // representation to save disk space. The data may be forced to be saved
+            // in a dense representation by setting forceDense to true. The sparse
+            // data will look like the following:
+            //
+            //   10 7:0.943862259
+            //   10 3:0.989767134
+            //   10 0:0.949778438   8:0.823028445   9:0.886469543
+            //
+            // The sparse representation of the first row indicates that there are
+            // 10 columns, the column 7 (8-th column) has value 0.943862259, and other
+            // omitted columns have value 0.
+
+            // Create a TextLoader that allows sparse input.
+            var sparseLoader = mlContext.Data.CreateTextLoader(
+                columns: new[]
+                {
+                    new TextLoader.Column("Features", DataKind.Single, 0, 9)
+                },
+                allowSparse: true
+            );
+
+            // Load the saved sparse data.
+            var sparseData = sparseLoader.Load(sparseDataFileName);
+            PrintRowCount(sparseData);
+
+            // Expected Output:
+            //   10
+
+
+            // Create a TextLoader without any column schema using TextLoader.Options.
+            // Since the sparse data file was saved with ML.NET, it has the schema
+            // encoded in its header that the loader can understand:
+            //
+            // #@ TextLoader{
+            // #@   sep=tab
+            // #@   col=Features:R4:0-9
+            // #@ }
+            //
+            // The schema syntax is unimportant since it is only used internally. In
+            // short, it tells the loader that the values are separated by tabs, and
+            // that columns 0-9 in the text file are to be read into one column named
+            // "Features" of type Single (internal type R4).
+
+            var options = new TextLoader.Options()
+            {
+                AllowSparse = true,
+            };
+            var dataSampleWithSchema = new MultiFileSource(sparseDataFileName);
+            var sparseLoaderWithSchema =
+                mlContext.Data.CreateTextLoader(options, dataSample: dataSampleWithSchema);
+
+            // Load the saved sparse data.
+            var sparseDataWithSchema = sparseLoaderWithSchema.Load(sparseDataFileName);
+            PrintRowCount(sparseDataWithSchema);
+
+            // Expected Output:
+            //   10
         }
 
         private static void PrintRowCount(IDataView idv)
@@ -73,5 +192,11 @@ private static void PrintRowCount(IDataView idv)
 
             Console.WriteLine(rowCount);
         }
+
+        private class Data
+        {
+            [LoadColumn(0, 9)]
+            public float[] Features { get; set; }
+        }
     }
 }
@@ -6,6 +6,10 @@ namespace Samples.Dynamic
 {
     public static class CustomMapping
     {
+        // This example shows how to define and apply a custom mapping of input
+        // columns to output columns without defining a contract. Since a contract
+        // is not defined, the pipeline containing this mapping cannot be saved and
+        // loaded back.
         public static void Example()
         {
             // Create a new ML context, for ML.NET operations. It can be used for
 
@@ -7,6 +7,12 @@ namespace Samples.Dynamic
 {
     public static class CustomMappingSaveAndLoad
     {
+        // This example shows how to define and apply a custom mapping of input
+        // columns to output columns with a contract name. The contract name is
+        // used in the CustomMappingFactoryAttribute that decorates the custom
+        // mapping action. The pipeline containing the custom mapping can then be
+        // saved to disk, and it can be loaded back after the assembly containing
+        // the custom mapping action is registered.
         public static void Example()
         {
             // Create a new ML context, for ML.NET operations. It can be used for
@@ -24,7 +30,11 @@ public static void Example()
             var data = mlContext.Data.LoadFromEnumerable(samples);
 
             // Custom transformations can be used to transform data directly, or as
-            // part of a pipeline of estimators.
+            // part of a pipeline of estimators. The contractName must be provided
+            // in order for a pipeline containing a CustomMapping estimator to be
+            // saved and loaded back. The contractName must be the same as in the
+            // CustomMappingFactoryAttribute used to decorate the custom action
+            // defined by the user.
             var pipeline = mlContext.Transforms.CustomMapping(new
                 IsUnderThirtyCustomAction().GetMapping(), contractName:
                 "IsUnderThirty");
 
@@ -8,6 +8,16 @@ namespace Samples.Dynamic
 {
     class CustomMappingWithInMemoryCustomType
     {
+        // This example shows how custom mapping actions can be performed on custom data
+        // types that ML.NET doesn't know yet. The example tells a story of how two alien
+        // bodies are merged to form a super alien with a single body.
+        //
+        // Here, the type AlienHero represents a single alien entity with a member "Name"
+        // of type string and members "One" and "Two" of type AlienBody. It defines a custom
+        // mapping action AlienFusionProcess that takes an AlienHero and "fuses" its two
+        // AlienBody members to produce a SuperAlienHero entity with a "Name" member of type
+        // string and a single "Merged" member of type AlienBody, where the merger is just
+        // the addition of the various members of AlienBody.
         static public void Example()
         {
             var mlContext = new MLContext();
@@ -33,7 +43,7 @@ static public void Example()
                 + firstAlien.Merged.HandCount + " hands.");
 
             // Expected output:
-            //   We got a super alien with name Super Unknown, age 4002, height 6000, weight 8000, and 10000 hands.
+            //   We got a super alien with name Super ML.NET, age 4002, height 6000, weight 8000, and 10000 hands.
 
             // Create a prediction engine and print out its prediction.
             var engine = mlContext.Model.CreatePredictionEngine<AlienHero,
@@ -47,11 +57,14 @@ static public void Example()
                 ", and " + superAlien.Merged.HandCount + " hands.");
 
             // Expected output:
-            //   We got a super alien with name Super Unknown, age 6, height 8, weight 10, and 12 hands.
+            //   We got a super alien with name Super TEN.LM, age 6, height 8, weight 10, and 12 hands.
         }
 
         // A custom type which ML.NET doesn't know yet. Its value will be loaded as
-        // a DataView column in this test.
+        // a DataView column in this example.
+        //
+        // The type members represent the characteristics of an alien body that will
+        // be merged in the AlienFusionProcess.
         private class AlienBody
         {
             public int Age { get; set; }
@@ -68,7 +81,11 @@ public AlienBody(int age, float height, float weight, int handCount)
             }
         }
 
-        // DataViewTypeAttribute applied to class AlienBody members.
+        // DataViewTypeAttribute applied to class AlienBody members. This attribute
+        // defines how class AlienBody is registered in ML.NET's type system. In this
+        // case, AlienBody is registered as DataViewAlienBodyType in ML.NET. The RaceId
+        // property allows different members of type AlienBody to be registered with
+        // different types in ML.NET (see usage in class AlienHero).
         private sealed class AlienTypeAttributeAttribute : DataViewTypeAttribute
         {
             public int RaceId { get; }
@@ -98,15 +115,18 @@ public override bool Equals(DataViewTypeAttribute other)
         }
 
         // A custom class with a type which ML.NET doesn't know yet. Its value will
-        // be loaded as a DataView row in this test. It will be the input of
+        // be loaded as a DataView row in this example. It will be the input of
         // AlienFusionProcess.MergeBody(AlienHero, SuperAlienHero).
         //
-        // The members One> and Two" would be mapped to different types inside
+        // The members One and Two would be mapped to different types inside
         // ML.NET type system because they have different 
         // AlienTypeAttributeAttribute's. For example, the column type of One would
-        // be DataViewAlienBodyType
-        // with RaceId=100.
-        // </summary>
+        // be DataViewAlienBodyType with RaceId=100.
+        //
+        // This type represents a "Hero" Alien that is a single entity with two bodies.
+        // The "Hero" undergoes a fusion process defined in AlienFusionProcess to
+        // become a SuperAlienHero with a single body that is a merger of the two
+        // bodies.
         private class AlienHero
         {
             public string Name { get; set; }
@@ -129,14 +149,16 @@ public AlienHero(string name,
                 int anotherAge, float anotherHeight, float anotherWeight, int
                     anotherHandCount)
             {
-                Name = "Unknown";
+                Name = name;
                 One = new AlienBody(age, height, weight, handCount);
                 Two = new AlienBody(anotherAge, anotherHeight, anotherWeight,
                     anotherHandCount);
             }
         }
 
-        // Type of AlienBody in ML.NET's type system.
+        // Type of AlienBody in ML.NET's type system. This is the data view type that
+        // will represent AlienBody in ML.NET's type system when it is registered as
+        // such in AlienTypeAttributeAttribute.
         // It usually shows up as DataViewSchema.Column.Type among IDataView.Schema.
         private class DataViewAlienBodyType : StructuredDataViewType
         {
@@ -162,6 +184,8 @@ public override int GetHashCode()
 
         // The output type of processing AlienHero using AlienFusionProcess
         // .MergeBody(AlienHero, SuperAlienHero).
+        // This is a "fused" alien whose body is a merger of the two bodies
+        // of AlienHero.
         private class SuperAlienHero
         {
             public string Name { get; set; }
@@ -194,6 +218,5 @@ public static Action<AlienHero, SuperAlienHero> GetMapping()
                 return MergeBody;
             }
         }
-
     }
 }
@@ -16,7 +16,7 @@ namespace Microsoft.ML.Data
     ///
     /// | Type | Default Value | IsDefault Indicator |
     /// | -- | -- | -- |
-    /// | <xref:Microsoft.ML.Data.DataKind.String> or [text](xref:Microsoft.ML.Data.TextDataViewType) | Empty or `null` string (both result in empty `System.ReadOnlyMemory<char>` | <xref:"System.ReadOnlyMemory`1.IsEmpty*> |
+    /// | <xref:Microsoft.ML.Data.DataKind.String> or [text](xref:Microsoft.ML.Data.TextDataViewType) | Empty or `null` string (both result in empty `System.ReadOnlyMemory<char>`) | <xref:System.ReadOnlyMemory`1.IsEmpty*> |
     /// | [Key](xref:Microsoft.ML.Data.KeyDataViewType) type (supported by the unsigned integer types in `DataKind`) | Not defined | Always `false` |
     /// | All other types | Default value of the corresponding system type as defined by .NET standard. In C#, default value expression `default(T)` provides that value. | Equality test with the default value |
     ///
 
@@ -93,7 +93,12 @@ public EstimatorChain<TNewTrans> Append<TNewTrans>(IEstimator<TNewTrans> estimat
 
         /// <summary>
         /// Append a 'caching checkpoint' to the estimator chain. This will ensure that the downstream estimators will be trained against
-        /// cached data. It is helpful to have a caching checkpoint before trainers that take multiple data passes.
+        /// cached data. It is helpful to have a caching checkpoint before trainers or feature engineering that take multiple data passes.
+        /// It is also helpful to have one after a slow operation, for example after dataset loading from a slow source or after feature
+        /// engineering that is slow on its apply phase, if downstream estimators will do multiple passes over the output of this operation.
+        /// Adding a cache checkpoint at the end of an <see cref="EstimatorChain{TLastTransformer}"/> is meaningless and should be avoided.
+        /// Cache checkpoints should be removed if disk thrashing or OutOfMemory exceptions are seen, which can occur when the featured
+        /// dataset immediately prior to the checkpoint is larger than available RAM.
         /// </summary>
         /// <param name="env">The host environment to use for caching.</param>
         public EstimatorChain<TLastTransformer> AppendCacheCheckpoint(IHostEnvironment env)
Original file line number	Diff line number	Diff line change
`@@ -6,6 +6,10 @@ namespace Samples.Dynamic`
`6`	`6`	`{`
`7`	`7`	`public static class CustomMapping`
`8`	`8`	`{`
	`9`	`+ // This example shows how to define and apply a custom mapping of input`
	`10`	`+ // columns to output columns without defining a contract. Since a contract`
	`11`	`+ // is not defined, the pipeline containing this mapping cannot be saved and`
	`12`	`+ // loaded back.`
`9`	`13`	`public static void Example()`
`10`	`14`	`{`
`11`	`15`	`// Create a new ML context, for ML.NET operations. It can be used for`