
Extension on IDataReader<IMultiStreamSource> and DataReader<IMultiStreamSource, TShape> to read from one or several file paths, rather than requiring construction of an IMultiStreamSource #1281

Merged (7 commits) on Oct 19, 2018
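
In short, the change adds `Read(params string[] path)` extension overloads for `IDataReader<IMultiStreamSource>` and `DataReader<IMultiStreamSource, TShape>`, so callers can pass file paths directly instead of constructing a `MultiFileSource`. A minimal before/after sketch (reader shape, column indices, and file names are illustrative; usings are omitted, as in the cookbook snippets below):

```csharp
// Create an environment and a reader, as in the cookbook examples below.
var env = new LocalEnvironment();
var reader = TextLoader.CreateReader(env, ctx => (
    Label: ctx.LoadBool(0),
    Text: ctx.LoadText(1)),
    hasHeader: true);

// Before this change: wrap the path(s) in an IMultiStreamSource explicitly.
var dataBefore = reader.Read(new MultiFileSource("adult.train"));

// After this change: pass one or several file paths directly.
var dataAfter = reader.Read("adult.train");
var dataMulti = reader.Read("adult.train", "adult.test");
```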
62 changes: 48 additions & 14 deletions docs/code/MlNetCookBook.md
@@ -103,7 +103,7 @@ var reader = TextLoader.CreateReader(mlContext, ctx => (
hasHeader: true);

// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var data = reader.Read(new MultiFileSource(dataPath));
var data = reader.Read(dataPath);
```

If the schema of the data is not known at compile time, or too cumbersome, you can revert to the dynamically-typed API:
@@ -128,9 +128,43 @@ var reader = new TextLoader(mlContext, new TextLoader.Arguments
});

// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var data = reader.Read(new MultiFileSource(dataPath));
var data = reader.Read(dataPath);
```

## How do I load data from multiple files?

You can again use the `TextLoader` and pass an array of file paths to its `Read` method.
The files need to have the same schema (the same number and types of columns).

[Example file1](../../test/data/adult.train):
[Example file2](../../test/data/adult.test):
```
Label Workclass education marital-status
0 Private 11th Never-married
0 Private HS-grad Married-civ-spouse
1 Local-gov Assoc-acdm Married-civ-spouse
1 Private Some-college Married-civ-spouse
```

This is how you can read this data:
```csharp
// Create a new environment for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var env = new LocalEnvironment();

// Create the reader: define the data columns and where to find them in the text file.
var reader = TextLoader.CreateReader(env, ctx => (
// A boolean column depicting the 'target label'.
IsOver50K: ctx.LoadBool(14),
// Three text columns.
Workclass: ctx.LoadText(1),
Education: ctx.LoadText(3),
MaritalStatus: ctx.LoadText(5)),
hasHeader: true);

// Now read the files (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var data = reader.Read(exampleFile1, exampleFile2);
```

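Presumably the same `Read` overloads also accept wildcards, '+'-concatenated paths, and the 'FolderPath/...' syntax, since they construct a `MultiFileSource` under the hood (see the `MultiFileSource.cs` change below). A small sketch reusing the `reader` defined above; the paths and the use of `System.IO.Path` are illustrative:

```csharp
// All files matching a wildcard pattern.
var wildcardData = reader.Read("adult.*");

// Two paths concatenated with '+' in a single string.
var concatenatedData = reader.Read("adult.train+adult.test");

// All files directly under a folder, using the 'FolderPath/...' syntax
// (the separator is OS-relevant, hence Path.Combine).
var folderData = reader.Read(Path.Combine("adultDataFolder", "..."));
```
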
## How do I load data with many columns from a CSV?
`TextLoader` is used to load data from text files. You will need to specify what the data columns are, what their types are, and where to find them in the text file.

@@ -162,7 +196,7 @@ var reader = TextLoader.CreateReader(mlContext, ctx => (


// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var data = reader.Read(new MultiFileSource(dataPath));
var data = reader.Read(dataPath);
```


@@ -183,7 +217,7 @@ var reader = mlContext.Data.TextReader(new[] {
s => s.Separator = ",");

// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var data = reader.Read(new MultiFileSource(dataPath));
var data = reader.Read(dataPath);
```

## How do I look at the intermediate data?
@@ -231,7 +265,7 @@ var dataPipeline = reader.MakeNewEstimator()

// Let's verify that the data has been read correctly.
// First, we read the data file.
var data = reader.Read(new MultiFileSource(dataPath));
var data = reader.Read(dataPath);

// Fit our data pipeline and transform data with it.
var transformedData = dataPipeline.Fit(data).Transform(data);
@@ -305,7 +339,7 @@ var reader = TextLoader.CreateReader(mlContext, ctx => (


// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var trainData = reader.Read(new MultiFileSource(trainDataPath));
var trainData = reader.Read(trainDataPath);

// Step two: define the learning pipeline.

@@ -334,7 +368,7 @@ You can use the corresponding 'context' of the task to evaluate the model.
Assuming the example above was used to train the model, here's how you calculate the metrics.
```csharp
// Read the test dataset.
var testData = reader.Read(new MultiFileSource(testDataPath));
var testData = reader.Read(testDataPath);
// Calculate metrics of the model on the test data.
var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: r => r.Target, score: r => r.Prediction);
```
@@ -390,7 +424,7 @@ var reader = TextLoader.CreateReader(mlContext, ctx => (
separator: ',');

// Retrieve the training data.
var trainData = reader.Read(new MultiFileSource(irisDataPath));
var trainData = reader.Read(irisDataPath);

// Build the training pipeline.
var learningPipeline = reader.MakeNewEstimator()
@@ -557,7 +591,7 @@ var reader = TextLoader.CreateReader(mlContext, ctx => (
separator: ',');

// Retrieve the training data.
var trainData = reader.Read(new MultiFileSource(dataPath));
var trainData = reader.Read(dataPath);

// This is the predictor ('weights collection') that we will train.
MulticlassLogisticRegressionPredictor predictor = null;
@@ -648,7 +682,7 @@ var reader = TextLoader.CreateReader(mlContext, ctx => (
separator: ',');

// Read the training data.
var trainData = reader.Read(new MultiFileSource(dataPath));
var trainData = reader.Read(dataPath);

// Apply all kinds of standard ML.NET normalization to the raw features.
var pipeline = reader.MakeNewEstimator()
@@ -707,7 +741,7 @@ var reader = TextLoader.CreateReader(mlContext, ctx => (
), hasHeader: true);

// Read the data.
var data = reader.Read(new MultiFileSource(dataPath));
var data = reader.Read(dataPath);

// Inspect the categorical columns to check that they are correctly read.
var catColumns = data.GetColumn(r => r.CategoricalFeatures).Take(10).ToArray();
@@ -784,7 +818,7 @@ var reader = TextLoader.CreateReader(mlContext, ctx => (
), hasHeader: true);

// Read the data.
var data = reader.Read(new MultiFileSource(dataPath));
var data = reader.Read(dataPath);

// Inspect the message texts that are read from the file.
var messageTexts = data.GetColumn(x => x.Message).Take(20).ToArray();
@@ -849,7 +883,7 @@ var reader = TextLoader.CreateReader(mlContext, ctx => (
separator: ',');

// Read the data.
var data = reader.Read(new MultiFileSource(dataPath));
var data = reader.Read(dataPath);

// Build the training pipeline.
var learningPipeline = reader.MakeNewEstimator()
@@ -910,7 +944,7 @@ var reader = TextLoader.CreateReader(mlContext, ctx => (
separator: ',');

// Read the data.
var data = reader.Read(new MultiFileSource(dataPath));
var data = reader.Read(dataPath);

// Build the pre-processing pipeline.
var learningPipeline = reader.MakeNewEstimator()
2 changes: 1 addition & 1 deletion docs/samples/Microsoft.ML.Samples/Trainers.cs
@@ -37,7 +37,7 @@ public static void SdcaRegression()
separator: '\t', hasHeader: true);

// Read the data, and leave 10% out, so we can use them for testing
var data = reader.Read(new MultiFileSource(dataFile));
var data = reader.Read(dataFile);
var (trainData, testData) = regressionContext.TrainTestSplit(data, testFraction: 0.1);

// The predictor that gets produced out of training
36 changes: 28 additions & 8 deletions src/Microsoft.ML.Data/DataLoadSave/MultiFileSource.cs
@@ -9,26 +9,46 @@
namespace Microsoft.ML.Runtime.Data
{
/// <summary>
/// Wraps a potentially compound path as an IMultiStreamSource. Expands wild cards and supports
/// multiple paths separated by +.
/// Wraps a potentially compound path as an IMultiStreamSource.
/// </summary>
/// <remarks>Expands wild cards and supports multiple paths separated by +, or loads all the files of a subfolder,
/// if the syntax for the path is 'FolderPath/...' (separator would be OS relevant).
/// </remarks>
public sealed class MultiFileSource : IMultiStreamSource
{
private readonly string[] _paths;

public MultiFileSource(string path)
/// <summary>
/// Initializes a new instance of <see cref="MultiFileSource"/>.
/// In case of usage from Maml, the paths would be wildcard concatenated in the first string of <paramref name="paths"/>.
/// </summary>
/// <param name="paths">The paths of the files to load.</param>
public MultiFileSource(params string[] paths)
{
Contracts.CheckValueOrNull(path);
Contracts.CheckValueOrNull(paths);

if (string.IsNullOrEmpty(path))
// Calling the ctor with a single null argument creates an array with one null element.
// The types using MultiFileSource know how to account for an empty paths array,
// so that behavior is kept here.
if (paths == null || (paths.Length == 1 && paths[0] == null))
{
_paths = new string[0];
return;
}

_paths = StreamUtils.ExpandWildCards(path);
if (_paths.Length == 0)
throw Contracts.ExceptIO("Could not find file '{0}'", path);
// in case of usage from Maml, the paths would be wildcard concatenated in the
// first string of paths.
string[] concatenated = paths[0] != null ? StreamUtils.ExpandWildCards(paths[0]) : null;

if (concatenated != null && concatenated.Length > 1)
{
if (paths.Length > 1)
throw Contracts.Except($"Pass a single string to the {nameof(MultiFileSource)} constructor, if you are using wildcards.");

@eerhardt (Member), Oct 18, 2018: It would be nice to put this in xml doc comments. #Resolved

_paths = concatenated;
}
else
_paths = paths;
}

public int Count
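
To make the accepted path forms concrete, here is an illustrative sketch based on the remarks and the constructor logic above (file and folder names are hypothetical; `System.IO.Path` is assumed):

```csharp
// Several explicit paths.
var explicitPaths = new MultiFileSource("adult.train", "adult.test");

// A single string with '+'-separated paths or wildcards; both are expanded.
var concatenated = new MultiFileSource("adult.train+adult.test");
var wildcarded = new MultiFileSource("adult.*");

// 'FolderPath/...' loads all the files directly under the folder.
var folder = new MultiFileSource(Path.Combine("dataFolder", "..."));

// Combining a wildcard (or '+'-concatenated) first path with additional paths throws:
// new MultiFileSource("adult.*", "adult.test"); // -> InvalidOperationException
```
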
14 changes: 13 additions & 1 deletion src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderStatic.cs
@@ -256,5 +256,17 @@ public Column Create()
}
}
}
}

public static class LocalPathReader

@Zruty0 (Contributor), Oct 17, 2018, on `LocalPathReader`: I don't think it belongs to TextLoaderStatic. #Resolved

(Member Author, in reply): so... where does it belong :)

(Contributor, in reply): Actually, I just added an instance method to TextLoader :)

(Contributor, in reply): See my MLContext PR

{
public static IDataView Read(this IDataReader<IMultiStreamSource> reader, params string[] path)
{
return reader.Read(new MultiFileSource(path));
}

public static DataView<TShape> Read<TShape>(this DataReader<IMultiStreamSource, TShape> reader, params string[] path)
{
return reader.Read(new MultiFileSource(path));
}
}
}
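
For the statically-typed reader produced by `TextLoader.CreateReader`, the generic overload applies, so the result keeps its static shape. A short sketch under the same assumptions as the cookbook examples above (column indices and paths are illustrative):

```csharp
var env = new LocalEnvironment();
var reader = TextLoader.CreateReader(env, ctx => (
    IsOver50K: ctx.LoadBool(14),
    Workclass: ctx.LoadText(1)),
    hasHeader: true);

// DataReader<IMultiStreamSource, TShape> extension: returns a DataView<TShape>.
var typedData = reader.Read("adult.train", "adult.test");

// Columns remain statically typed.
var workclasses = typedData.GetColumn(r => r.Workclass).Take(5).ToArray();
```
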
6 changes: 3 additions & 3 deletions test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs
@@ -51,7 +51,7 @@ public void SetupIrisPipeline()
}
});

IDataView data = reader.Read(new MultiFileSource(_irisDataPath));
IDataView data = reader.Read(_irisDataPath);

var pipeline = new ConcatEstimator(env, "Features", new[] { "SepalLength", "SepalWidth", "PetalLength", "PetalWidth" })
.Append(new SdcaMultiClassTrainer(env, "Features", "Label", advancedSettings: (s) => { s.NumThreads = 1; s.ConvergenceTolerance = 1e-2f; }));
@@ -86,7 +86,7 @@ public void SetupSentimentPipeline()
}
});

IDataView data = reader.Read(new MultiFileSource(_sentimentDataPath));
IDataView data = reader.Read(_sentimentDataPath);

var pipeline = new TextTransform(env, "SentimentText", "Features")
.Append(new LinearClassificationTrainer(env, "Features", "Label", advancedSettings: (s) => { s.NumThreads = 1; s.ConvergenceTolerance = 1e-2f; }));
@@ -121,7 +121,7 @@ public void SetupBreastCancerPipeline()
}
});

IDataView data = reader.Read(new MultiFileSource(_breastCancerDataPath));
IDataView data = reader.Read(_breastCancerDataPath);

var pipeline = new LinearClassificationTrainer(env, "Features", "Label", advancedSettings: (s) => { s.NumThreads = 1; s.ConvergenceTolerance = 1e-2f; });

42 changes: 42 additions & 0 deletions test/Microsoft.ML.Core.Tests/UnitTests/FileSource.cs
@@ -0,0 +1,42 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime.Data;
using System;
using System.IO;
using Xunit;

namespace Microsoft.ML.Runtime.RunTests
{
public sealed class FileSource
{

[Fact]
public void MultiFileSourceUnitTest()
{
var fileSource = new MultiFileSource("adult.txt");
Assert.True(fileSource.Count == 1);

fileSource = new MultiFileSource("adult.train", "adult.test");
Assert.True(fileSource.Count == 2, $"Error passing multiple paths to {nameof(MultiFileSource)}");

//creating a directory with three files for the tests
var dirName = Directory.CreateDirectory("MultiFileSourceUnitTest").FullName;

var file1 = Path.Combine(dirName, "a.txt");
var file2 = Path.Combine(dirName, "b.txt");

File.WriteAllText(file1, "Unit Test");
File.WriteAllText(file2, "Unit Test");

fileSource = new MultiFileSource($"{file1}+{file2}");
Assert.True(fileSource.Count == 2, $"Error passing concatenated paths to {nameof(MultiFileSource)}");

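// The 'FolderPath/...' syntax loads every file directly under the folder (here: a.txt and b.txt).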
fileSource = new MultiFileSource(Path.Combine(dirName, "..."));
Assert.True(fileSource.Count == 2, $"Error passing a folder path ('...') to {nameof(MultiFileSource)}");

Assert.Throws<InvalidOperationException>(() => new MultiFileSource($"{file1}+{file2}", "adult.test"));
}
}
}
2 changes: 1 addition & 1 deletion test/Microsoft.ML.OnnxTransformTest/OnnxTransformTests.cs
@@ -176,7 +176,7 @@ public void OnnxStatic()
var data = TextLoader.CreateReader(env, ctx => (
imagePath: ctx.LoadText(0),
name: ctx.LoadText(1)))
.Read(new MultiFileSource(dataFile));
.Read(dataFile);

// Note that CamelCase column names are there to match the TF graph node names.
var pipe = data.MakeNewEstimator()
2 changes: 1 addition & 1 deletion test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
@@ -187,7 +187,7 @@ void Helper(ISchema thisSchema, string name, ColumnType expected)
Helper(schema, "how.Donut.friend.Biz", TextType.Instance);
Helper(schema, "how.Donut.friend.Blam", new VectorType(NumberType.R8, 10));

var textData = text.Read(new MultiFileSource(null));
var textData = text.Read(null);

var est = text.MakeNewEstimator().Append(r => r.how.Donut.friend.Blam.ConcatWith(r.dawg.Blam));
var outData = est.Fit(textData).Transform(textData);