dotnet · antoniovs1029 · Jun 9, 2020 · May 28, 2020 · May 28, 2020 · May 28, 2020
diff --git a/docs/code/IDataViewTypeSystem.md b/docs/code/IDataViewTypeSystem.md
@@ -540,7 +540,10 @@ is first processed entirely as `TX` values, then parsed, or processed directly
 into numeric values, that is, parsing as the row is processed. In the latter
 case, it is simple to map implicit items (suppressed due to sparsity) to zero.
 In the former case, these items are first mapped to the empty text value. To
-get the same result, we need empty text to map to zero.
+get the same result, we need empty text to map to zero. An exception to this
+rule is permitted in the TextLoader, which has an option to load empty `TX`
+values as `NaN` in `R4` and `R8` columns, instead of applying the default
+conversion of empty `TX` to the numeric default `0`.
 
 ### To Text
 

diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs
@@ -23,6 +23,11 @@ internal enum OptionFlags : uint
             // a number and its decimal part). If this isn't set, then
             // default behavior is to use "." as decimal marker.
             UseCommaAsDecimalMarker = 0x01,
+
+            // If this flag is set, then empty spans (or those with only white-space)
+            // will be parsed as NaN. If it isn't set, then default behavior
+            // is to return them as 0.
+            EmptyAsNaN = 0x02,
         }
 
         private const ulong TopBit = 0x8000000000000000UL;
@@ -81,22 +86,22 @@ public enum Result
         }
 
         /// <summary>
-        /// This produces zero for an empty string.
+        /// This produces zero for an empty string, or NaN when the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> flag is set.
         /// </summary>
         public static bool TryParse(ReadOnlySpan<char> span, out Single value, OptionFlags flags = OptionFlags.Default)
         {
             var res = Parse(span, out value, flags);
-            Contracts.Assert(res != Result.Empty || value == 0);
+            Contracts.Assert(res != Result.Empty || ((flags & OptionFlags.EmptyAsNaN) == 0 && value == 0) || Single.IsNaN(value));
             return res <= Result.Empty;
         }
 
         /// <summary>
-        /// This produces zero for an empty string.
+        /// This produces zero for an empty string, or NaN when the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> flag is set.
         /// </summary>
         public static bool TryParse(ReadOnlySpan<char> span, out Double value, OptionFlags flags = OptionFlags.Default)
         {
             var res = Parse(span, out value, flags);
-            Contracts.Assert(res != Result.Empty || value == 0);
+            Contracts.Assert(res != Result.Empty || ((flags & OptionFlags.EmptyAsNaN) == 0 && value == 0) || Double.IsNaN(value));
             return res <= Result.Empty;
         }
 
@@ -107,7 +112,11 @@ public static Result Parse(ReadOnlySpan<char> span, out Single value, OptionFlag
             {
                 if (ich >= span.Length)
                 {
-                    value = 0;
+                    if ((flags & OptionFlags.EmptyAsNaN) == 0)
+                        value = 0;
+                    else
+                        value = Single.NaN;
+
                     return Result.Empty;
                 }
                 if (!char.IsWhiteSpace(span[ich]))
@@ -155,7 +164,11 @@ public static Result Parse(ReadOnlySpan<char> span, out Double value, OptionFlag
             {
                 if (ich >= span.Length)
                 {
-                    value = 0;
+                    if ((flags & OptionFlags.EmptyAsNaN) == 0)
+                        value = 0;
+                    else
+                        value = Double.NaN;
+
                     return Result.Empty;
                 }
                 if (!char.IsWhiteSpace(span[ich]))

diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs
@@ -1369,7 +1369,8 @@ private void TryParseSigned(long max, in TX text, out long? result)
         }
 
         /// <summary>
-        /// This produces zero for empty. It returns false if the text is not parsable.
+        /// This produces zero for empty text, or NaN when the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> flag is in use.
+        /// It returns false if the text is not parsable.
         /// On failure, it sets dst to the NA value.
         /// </summary>
         public bool TryParse(in TX src, out R4 dst)
@@ -1382,7 +1383,8 @@ public bool TryParse(in TX src, out R4 dst)
         }
 
         /// <summary>
-        /// This produces zero for empty. It returns false if the text is not parsable.
+        /// This produces zero for empty text, or NaN when the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> flag is in use.
+        /// It returns false if the text is not parsable.
         /// On failure, it sets dst to the NA value.
         /// </summary>
         public bool TryParse(in TX src, out R8 dst)
@@ -1394,6 +1396,9 @@ public bool TryParse(in TX src, out R8 dst)
             return IsStdMissing(ref span);
         }
 
+        /// <summary>
+        /// This produces the default value for empty text.
+        /// </summary>
         public bool TryParse(in TX src, out TS dst)
         {
             if (src.IsEmpty)
@@ -1408,6 +1413,9 @@ public bool TryParse(in TX src, out TS dst)
             return false;
         }
 
+        /// <summary>
+        /// This produces the default value for empty text.
+        /// </summary>
         public bool TryParse(in TX src, out DT dst)
         {
             if (src.IsEmpty)
@@ -1422,6 +1430,9 @@ public bool TryParse(in TX src, out DT dst)
             return false;
         }
 
+        /// <summary>
+        /// This produces the default value for empty text.
+        /// </summary>
         public bool TryParse(in TX src, out DZ dst)
         {
             if (src.IsEmpty)

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -433,10 +433,9 @@ public class Options
             /// </summary>
             [Argument(ArgumentType.AtMostOnce,
                 HelpText =
-                    "Whether the input may include quoted values, which can contain separator characters, colons," +
-                    " and distinguish empty values from missing values. When true, consecutive separators denote a" +
-                    " missing value and an empty value is denoted by \"\". When false, consecutive separators" +
-                    " denote an empty value.",
+                    "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input value" +
+                    "from actual separators. When true, separators within double quotes are treated as part of the input value. When false, all" +
+                    "separators, even those within quotes, are treated as delimiting a new column.",
                 ShortName = "quote")]
             public bool AllowQuoting = Defaults.AllowQuoting;
 
@@ -533,6 +532,12 @@ public class Options
             [Argument(ArgumentType.AtMostOnce, HelpText = "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", ShortName = "escapechar")]
             public char EscapeChar = Defaults.EscapeChar;
 
+            /// <summary>
+            /// If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.
+            /// </summary>
+            [Argument(ArgumentType.AtMostOnce, HelpText = "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.", ShortName = "imputefloat")]
+            public bool ImputeEmptyFloats = Defaults.ImputeEmptyFloats;
+
             /// <summary>
             /// Checks that all column specifications are valid (that is, ranges are disjoint and have min&lt;=max).
             /// </summary>
@@ -552,6 +557,7 @@ internal static class Defaults
             internal const bool TrimWhitespace = false;
             internal const bool ReadMultilines = false;
             internal const char EscapeChar = '"';
+            internal const bool ImputeEmptyFloats = false;
         }
 
         /// <summary>
@@ -1078,7 +1084,8 @@ private static VersionInfo GetVersionInfo()
                 //verWrittenCur: 0x0001000A, // Added ForceVector in Range
                 //verWrittenCur: 0x0001000B, // Header now retained if used and present
                 //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
-                verWrittenCur: 0x0001000D, // Added escapeChar option and decimal marker option to allow for ',' to be a decimal marker
+                //verWrittenCur: 0x0001000D, // Added escapeChar and decimalMarker chars
+                verWrittenCur: 0x0001000E, // Added imputeEmptyFloats flag
                 verReadableCur: 0x0001000A,
                 verWeCanReadBack: 0x00010009,
                 loaderSignature: LoaderSignature,
@@ -1097,7 +1104,8 @@ private enum OptionFlags : uint
             AllowQuoting = 0x04,
             AllowSparse = 0x08,
             ReadMultilines = 0x10,
-            All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines
+            ImputeEmptyFloats = 0x20,
+            All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines | ImputeEmptyFloats
         }
 
         // This is reserved to mean the range extends to the end (the segment is variable).
@@ -1179,6 +1187,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
                 _flags |= OptionFlags.AllowSparse;
             if (options.AllowQuoting && options.ReadMultilines)
                 _flags |= OptionFlags.ReadMultilines;
+            if (options.ImputeEmptyFloats)
+                _flags |= OptionFlags.ImputeEmptyFloats;
 
             // REVIEW: This should be persisted (if it should be maintained).
             _maxRows = options.MaxRows ?? long.MaxValue;
@@ -1407,7 +1417,25 @@ private TextLoader(IHost host, ModelLoadContext ctx)
             _maxRows = ctx.Reader.ReadInt64();
             host.CheckDecode(_maxRows > 0);
             _flags = (OptionFlags)ctx.Reader.ReadUInt32();
-            host.CheckDecode((_flags & ~OptionFlags.All) == 0);
+
+            // Flags introduced with the first ML.NET commit:
+            var acceptableFlags = OptionFlags.TrimWhitespace;
+            acceptableFlags |= OptionFlags.HasHeader;
+            acceptableFlags |= OptionFlags.AllowQuoting;
+            acceptableFlags |= OptionFlags.AllowSparse;
+
+            // Flags added in later versions of TextLoader:
+            if(ctx.Header.ModelVerWritten >= 0x0001000C)
+            {
+                acceptableFlags |= OptionFlags.ReadMultilines;
+            }
+            if(ctx.Header.ModelVerWritten >= 0x0001000E)
+            {
+                acceptableFlags |= OptionFlags.ImputeEmptyFloats;
+            }
+
+            host.CheckDecode((_flags & ~acceptableFlags) == 0);
+
             _inputSize = ctx.Reader.ReadInt32();
             host.CheckDecode(0 <= _inputSize && _inputSize < SrcLim);
 

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
@@ -671,6 +671,8 @@ public Parser(TextLoader parent)
                 var doubleParserOptionFlags = DoubleParser.OptionFlags.Default;
                 if (parent._decimalMarker == ',')
                     doubleParserOptionFlags |= DoubleParser.OptionFlags.UseCommaAsDecimalMarker;
+                if ((parent._flags & OptionFlags.ImputeEmptyFloats) != 0)
+                    doubleParserOptionFlags |= DoubleParser.OptionFlags.EmptyAsNaN;
 
                 if (doubleParserOptionFlags == DoubleParser.OptionFlags.Default)
                     cache = ValueCreatorCache.DefaultInstance;
@@ -900,6 +902,7 @@ private sealed class HelperImpl : Helper
                 private readonly int _srcNeeded;
                 private readonly bool _quoting;
                 private readonly bool _sparse;
+                private readonly bool _keepEmpty;
                 // This is a working buffer.
                 private readonly StringBuilder _sb;
 
@@ -930,6 +933,11 @@ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, char escapeC
                     _sb = new StringBuilder();
                     _blank = ReadOnlyMemory<char>.Empty;
                     Fields = new FieldSet();
+
+                    // If we want to impute empty float fields, then we must keep
+                    // the spans of all empty fields, as there's no way for Parser.HelperImpl
+                    // to know beforehand which fields belong to a float column.
+                    _keepEmpty = (flags & OptionFlags.ImputeEmptyFloats) != 0;
                 }
 
                 /// <summary>
@@ -978,6 +986,13 @@ public int GatherFields(ReadOnlyMemory<char> lineSpan, ReadOnlySpan<char> span,
                                 Fields.Spans[Fields.Count] = scan.Span;
                                 Fields.Indices[Fields.Count++] = src;
                             }
+                            else if(_keepEmpty)
+                            {
+                                Fields.EnsureSpace();
+                                Fields.Spans[Fields.Count] = _blank;
+                                Fields.Indices[Fields.Count++] = src;
+                            }
+
                             if (++src > _srcNeeded || !more)
                                 break;
                         }

diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -319,7 +319,7 @@
               {
                 "Name": "AllowQuoting",
                 "Type": "Bool",
-                "Desc": "Whether the input may include quoted values, which can contain separator characters, colons, and distinguish empty values from missing values. When true, consecutive separators denote a missing value and an empty value is denoted by \"\". When false, consecutive separators denote an empty value.",
+                "Desc": "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input valuefrom actual separators. When true, separators within double quotes are treated as part of the input value. When false, allseparators, even those within quotes, are treated as delimiting a new column.",
                 "Aliases": [
                   "quote"
                 ],
@@ -464,6 +464,18 @@
                 "SortOrder": 150.0,
                 "IsNullable": false,
                 "Default": "\""
+              },
+              {
+                "Name": "ImputeEmptyFloats",
+                "Type": "Bool",
+                "Desc": "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.",
+                "Aliases": [
+                  "imputefloat"
+                ],
+                "Required": false,
+                "SortOrder": 150.0,
+                "IsNullable": false,
+                "Default": false
               }
             ]
           },

diff --git a/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv b/test/BaselineOutput/Common/TextLoader/missing_fields-with-impute.csv
@@ -0,0 +1,18 @@
+int,description,num1,num2,date,num3,num4
+0,"this is a description",0.12,0.34,01/01/2001,0.56,0.78
+0,"this has an empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111
+0,"this has a quoted empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111
+1,"this has a quoted int and date", 1.1, 11.11,1/1/2001,111.111,1111.11111
+2,"this has an empty num1 and a space in num3",NaN,22.22,2/2/2002,NaN,2222.2222
+3,"this has an empty quoted num1 and a quoted space in num3",NaN,33.33,3/3/2003,NaN,3333.3333
+4,"this has a space in num2 and a space in num4",4.4,NaN,4/4/2004,444.444,NaN 
+5,"this has a quoted space num2 and quoted space in num4",5.5,NaN,5/5/2005,555.555,NaN
+// The next two rows map the missing columns as 0, as it was decided not to impute with NaN
+// in this case
+6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66,1/1/0001,0,0
+7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777,0
+// In the next case we do impute with NaN because the separator is there
+8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888,NaN
+9,,9.9,99.99,9/9/2009,999.999,NaN
+0,"",10.10,NaN,10/10/2010,101010.101010,NaN
+11,NaN,NaN,NaN,11/11/2011,NaN,Infinity
diff --git a/test/BaselineOutput/Common/TextLoader/missing_fields-without-impute.csv b/test/BaselineOutput/Common/TextLoader/missing_fields-without-impute.csv
@@ -0,0 +1,15 @@
+int,description,num1,num2,date,num3,num4
+0,"this is a description",0.12,0.34,01/01/2001,0.56,0.78
+0,"this has an empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111
+0,"this has a quoted empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111
+1,"this has a quoted int and date", 1.1, 11.11,1/1/2001,111.111,1111.11111
+2,"this has an empty num1 and a space in num3",0,22.22,2/2/2002,0,2222.2222
+3,"this has an empty quoted num1 and a quoted space in num3",0,33.33,3/3/2003,0,3333.3333
+4,"this has a space in num2 and a space in num4",4.4,0,4/4/2004,444.444,0
+5,"this has a quoted space num2 and quoted space in num4",5.5,0,5/5/2005,555.555,0
+6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66,1/1/0001,0,0
+7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777,0
+8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888,0
+9,,9.9,99.99,9/9/2009,999.999,NaN
+0,,10.10,NaN,10/10/2010,101010.101010,NaN
+11,NaN,NaN,NaN,11/11/2011,0,Infinity