Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow TextLoader to load empty float/double fields as NaN instead of 0 #5198

Merged
merged 22 commits into from
Jun 9, 2020
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/code/IDataViewTypeSystem.md
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,10 @@ is first processed entirely as `TX` values, then parsed, or processed directly
into numeric values, that is, parsing as the row is processed. In the latter
case, it is simple to map implicit items (suppressed due to sparsity) to zero.
In the former case, these items are first mapped to the empty text value. To
get the same result, we need empty text to map to zero.
get the same result, we need empty text to map to zero. An exception to this
rule has been permitted in the TextLoader, where there's an option to load
empty `TX` fields as `NaN` for `R4` and `R8` fields, instead of using the default
conversion of empty `TX` to the numeric default `0`.

### To Text

Expand Down
25 changes: 19 additions & 6 deletions src/Microsoft.ML.Core/Utilities/DoubleParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ internal enum OptionFlags : uint
// a number and its decimal part). If this isn't set, then
// default behavior is to use "." as decimal marker.
UseCommaAsDecimalMarker = 0x01,

// If this flag is set, then empty spans (or those with only white-space)
// will be parsed as NaN. If it isn't set, then default behavior
// is to return them as 0.
EmptyAsNaN = 0x02,
}

private const ulong TopBit = 0x8000000000000000UL;
Expand Down Expand Up @@ -81,22 +86,22 @@ public enum Result
}

/// <summary>
/// This produces zero for an empty string.
/// This produces zero for an empty string, or NaN depending on the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> used.
/// </summary>
public static bool TryParse(ReadOnlySpan<char> span, out Single value, OptionFlags flags = OptionFlags.Default)
{
var res = Parse(span, out value, flags);
Contracts.Assert(res != Result.Empty || value == 0);
Contracts.Assert(res != Result.Empty || ((flags & OptionFlags.EmptyAsNaN) == 0 && value == 0) || Single.IsNaN(value));
return res <= Result.Empty;
}

/// <summary>
/// This produces zero for an empty string.
/// This produces zero for an empty string, or NaN depending on the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> used.
/// </summary>
public static bool TryParse(ReadOnlySpan<char> span, out Double value, OptionFlags flags = OptionFlags.Default)
{
var res = Parse(span, out value, flags);
Contracts.Assert(res != Result.Empty || value == 0);
Contracts.Assert(res != Result.Empty || ((flags & OptionFlags.EmptyAsNaN) == 0 && value == 0) || Double.IsNaN(value));
return res <= Result.Empty;
}

Expand All @@ -107,7 +112,11 @@ public static Result Parse(ReadOnlySpan<char> span, out Single value, OptionFlag
{
if (ich >= span.Length)
{
value = 0;
if ((flags & OptionFlags.EmptyAsNaN) == 0)
value = 0;
else
value = Single.NaN;

return Result.Empty;
}
if (!char.IsWhiteSpace(span[ich]))
Expand Down Expand Up @@ -155,7 +164,11 @@ public static Result Parse(ReadOnlySpan<char> span, out Double value, OptionFlag
{
if (ich >= span.Length)
{
value = 0;
if ((flags & OptionFlags.EmptyAsNaN) == 0)
value = 0;
else
value = Double.NaN;

return Result.Empty;
}
if (!char.IsWhiteSpace(span[ich]))
Expand Down
15 changes: 13 additions & 2 deletions src/Microsoft.ML.Data/Data/Conversion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1369,7 +1369,8 @@ private void TryParseSigned(long max, in TX text, out long? result)
}

/// <summary>
/// This produces zero for empty. It returns false if the text is not parsable.
/// This produces zero for empty, or NaN depending on the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> used.
/// It returns false if the text is not parsable.
/// On failure, it sets dst to the NA value.
/// </summary>
public bool TryParse(in TX src, out R4 dst)
Expand All @@ -1382,7 +1383,8 @@ public bool TryParse(in TX src, out R4 dst)
}

/// <summary>
/// This produces zero for empty. It returns false if the text is not parsable.
/// This produces zero for empty, or NaN depending on the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> used.
/// It returns false if the text is not parsable.
/// On failure, it sets dst to the NA value.
/// </summary>
public bool TryParse(in TX src, out R8 dst)
Expand All @@ -1394,6 +1396,9 @@ public bool TryParse(in TX src, out R8 dst)
return IsStdMissing(ref span);
}

/// <summary>
/// This produces default for empty.
/// </summary>
public bool TryParse(in TX src, out TS dst)
{
if (src.IsEmpty)
Expand All @@ -1408,6 +1413,9 @@ public bool TryParse(in TX src, out TS dst)
return false;
}

/// <summary>
/// This produces default for empty.
/// </summary>
public bool TryParse(in TX src, out DT dst)
{
if (src.IsEmpty)
Expand All @@ -1422,6 +1430,9 @@ public bool TryParse(in TX src, out DT dst)
return false;
}

/// <summary>
/// This produces default for empty.
/// </summary>
public bool TryParse(in TX src, out DZ dst)
{
if (src.IsEmpty)
Expand Down
42 changes: 35 additions & 7 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -433,10 +433,9 @@ public class Options
/// </summary>
[Argument(ArgumentType.AtMostOnce,
HelpText =
"Whether the input may include quoted values, which can contain separator characters, colons," +
" and distinguish empty values from missing values. When true, consecutive separators denote a" +
" missing value and an empty value is denoted by \"\". When false, consecutive separators" +
" denote an empty value.",
antoniovs1029 marked this conversation as resolved.
Show resolved Hide resolved
"Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input value" +
"from actual separators. When true, separators within double quotes are treated as part of the input value. When false, all" +
"separators, even those within quotes, are treated as delimiting a new column.",
ShortName = "quote")]
public bool AllowQuoting = Defaults.AllowQuoting;

Expand Down Expand Up @@ -533,6 +532,12 @@ public class Options
[Argument(ArgumentType.AtMostOnce, HelpText = "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", ShortName = "escapechar")]
public char EscapeChar = Defaults.EscapeChar;

/// <summary>
/// If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.", ShortName = "imputefloat")]
public bool ImputeEmptyFloats = Defaults.ImputeEmptyFloats;
Copy link
Member Author

@antoniovs1029 antoniovs1029 Jun 3, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not happy with the name I chose for this option, and for text I wrote as its doc. I think the name "field" can mean many things in the C# world (but inside the code of the TextLoader a "field" is the value found on a given column on a given row). Also, this option is supposed to be related to both Single and Doubles, so "float" is misleading. And "empty" would actually mean empty fields or fields with only whitespace. So I'm not sure how to name this and what to say on the public docs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure 'Impute' fits the terminology here. A few options to consider: ParseEmptyFloatsAsNan, ParseEmptyRealsAsNan, EmptyFieldsToNan, EmptyValuesToNan, etc.


In reply to: 434309670 [](ancestors = 434309670)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used the word "impute" as that was the original name @justinormont suggested here #4132 (comment)
I wouldn't know if that's a common term in which our users could understand the feature by reading it.

As for the names you provided, perhaps EmptyRealsAsNan would fit well, as it avoids the word Field, and "Reals" could imply Floats or Doubles.

I will leave unaddressed right now, and see if we get any suggestion on this. It's only a variable name change, and I'll be able to make the change before merging the PR.


In reply to: 434886717 [](ancestors = 434886717,434309670)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've decided to go for MissingRealsAsNan , since it's not only the Empty fields that get mapped as NaNs, but also the ones that only have whitespace, and now also the ones were there are missing columns in a given row.


In reply to: 435163479 [](ancestors = 435163479,434886717,434309670)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In that case you should fix the documentation above to read "If true, empty or missing float and double fields will be loaded as NaN"


In reply to: 437077933 [](ancestors = 437077933,435163479,434886717,434309670)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the documentation reads:

            /// <summary>
            /// If true, missing real fields (i.e. double or single fields) will be loaded as NaN.
            /// If false, they'll be loaded as 0. Default is false.
            /// A field is considered "missing" if it's empty, if it only has whitespace, or if there are missing columns
            /// at the end of a given row.
            /// </summary>

In reply to: 437170658 [](ancestors = 437170658,437077933,435163479,434886717,434309670)


/// <summary>
/// Checks that all column specifications are valid (that is, ranges are disjoint and have min&lt;=max).
/// </summary>
Expand All @@ -552,6 +557,7 @@ internal static class Defaults
internal const bool TrimWhitespace = false;
internal const bool ReadMultilines = false;
internal const char EscapeChar = '"';
internal const bool ImputeEmptyFloats = false;
}

/// <summary>
Expand Down Expand Up @@ -1078,7 +1084,8 @@ private static VersionInfo GetVersionInfo()
//verWrittenCur: 0x0001000A, // Added ForceVector in Range
//verWrittenCur: 0x0001000B, // Header now retained if used and present
//verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
verWrittenCur: 0x0001000D, // Added escapeChar option and decimal marker option to allow for ',' to be a decimal marker
//verWrittenCur: 0x0001000D, // Added escapeChar and decimalMarker chars
verWrittenCur: 0x0001000E, // Added imputeEmptyFloats flag
verReadableCur: 0x0001000A,
verWeCanReadBack: 0x00010009,
loaderSignature: LoaderSignature,
Expand All @@ -1097,7 +1104,8 @@ private enum OptionFlags : uint
AllowQuoting = 0x04,
AllowSparse = 0x08,
ReadMultilines = 0x10,
All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines
ImputeEmptyFloats = 0x20,
All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines | ImputeEmptyFloats
}

// This is reserved to mean the range extends to the end (the segment is variable).
Expand Down Expand Up @@ -1179,6 +1187,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
_flags |= OptionFlags.AllowSparse;
if (options.AllowQuoting && options.ReadMultilines)
_flags |= OptionFlags.ReadMultilines;
if (options.ImputeEmptyFloats)
antoniovs1029 marked this conversation as resolved.
Show resolved Hide resolved
_flags |= OptionFlags.ImputeEmptyFloats;

// REVIEW: This should be persisted (if it should be maintained).
_maxRows = options.MaxRows ?? long.MaxValue;
Expand Down Expand Up @@ -1407,7 +1417,25 @@ private TextLoader(IHost host, ModelLoadContext ctx)
_maxRows = ctx.Reader.ReadInt64();
host.CheckDecode(_maxRows > 0);
_flags = (OptionFlags)ctx.Reader.ReadUInt32();
host.CheckDecode((_flags & ~OptionFlags.All) == 0);

// Flags introduced with the first ML.NET commit:
var acceptableFlags = OptionFlags.TrimWhitespace;
acceptableFlags |= OptionFlags.HasHeader;
acceptableFlags |= OptionFlags.AllowQuoting;
acceptableFlags |= OptionFlags.AllowSparse;

// Flags added on later versions of TextLoader:
if(ctx.Header.ModelVerWritten >= 0x0001000C)
{
acceptableFlags |= OptionFlags.ReadMultilines;
}
if(ctx.Header.ModelVerWritten >= 0x0001000E)
{
acceptableFlags |= OptionFlags.ImputeEmptyFloats;
}

host.CheckDecode((_flags & ~acceptableFlags) == 0);

_inputSize = ctx.Reader.ReadInt32();
host.CheckDecode(0 <= _inputSize && _inputSize < SrcLim);

Expand Down
15 changes: 15 additions & 0 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,8 @@ public Parser(TextLoader parent)
var doubleParserOptionFlags = DoubleParser.OptionFlags.Default;
if (parent._decimalMarker == ',')
doubleParserOptionFlags |= DoubleParser.OptionFlags.UseCommaAsDecimalMarker;
if ((parent._flags & OptionFlags.ImputeEmptyFloats) != 0)
doubleParserOptionFlags |= DoubleParser.OptionFlags.EmptyAsNaN;

if (doubleParserOptionFlags == DoubleParser.OptionFlags.Default)
cache = ValueCreatorCache.DefaultInstance;
Expand Down Expand Up @@ -900,6 +902,7 @@ private sealed class HelperImpl : Helper
private readonly int _srcNeeded;
private readonly bool _quoting;
private readonly bool _sparse;
private readonly bool _keepEmpty;
// This is a working buffer.
private readonly StringBuilder _sb;

Expand Down Expand Up @@ -930,6 +933,11 @@ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, char escapeC
_sb = new StringBuilder();
_blank = ReadOnlyMemory<char>.Empty;
Fields = new FieldSet();

// If we want to impute empty float fields, then we must keep
// all empty fields spans, as there's no way for the Parser.HelperImpl
// to know beforehand which fields belong to a float field
_keepEmpty = (flags & OptionFlags.ImputeEmptyFloats) != 0;
}

/// <summary>
Expand Down Expand Up @@ -978,6 +986,13 @@ public int GatherFields(ReadOnlyMemory<char> lineSpan, ReadOnlySpan<char> span,
Fields.Spans[Fields.Count] = scan.Span;
Fields.Indices[Fields.Count++] = src;
}
else if(_keepEmpty)
{
Fields.EnsureSpace();
Fields.Spans[Fields.Count] = _blank;
Fields.Indices[Fields.Count++] = src;
}

if (++src > _srcNeeded || !more)
break;
}
Expand Down
14 changes: 13 additions & 1 deletion test/BaselineOutput/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@
{
"Name": "AllowQuoting",
"Type": "Bool",
"Desc": "Whether the input may include quoted values, which can contain separator characters, colons, and distinguish empty values from missing values. When true, consecutive separators denote a missing value and an empty value is denoted by \"\". When false, consecutive separators denote an empty value.",
"Desc": "Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input valuefrom actual separators. When true, separators within double quotes are treated as part of the input value. When false, allseparators, even those within quotes, are treated as delimiting a new column.",
"Aliases": [
"quote"
],
Expand Down Expand Up @@ -464,6 +464,18 @@
"SortOrder": 150.0,
"IsNullable": false,
"Default": "\""
},
{
"Name": "ImputeEmptyFloats",
"Type": "Bool",
"Desc": "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.",
"Aliases": [
"imputefloat"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": false
}
]
},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
int,description,num1,num2,date,num3,num4
0,"this is a description",0.12,0.34,01/01/2001,0.56,0.78
0,"this has an empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111
0,"this has a quoted empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111
1,"this has a quoted int and date", 1.1, 11.11,1/1/2001,111.111,1111.11111
2,"this has an empty num1 and a space in num3",NaN,22.22,2/2/2002,NaN,2222.2222
3,"this has an empty quoted num1 and a quoted space in num3",NaN,33.33,3/3/2003,NaN,3333.3333
4,"this has a space in num2 and a space in num4",4.4,NaN,4/4/2004,444.444,NaN
5,"this has a quoted space num2 and quoted space in num4",5.5,NaN,5/5/2005,555.555,NaN
// The next two rows map the missing columns as 0, as it was decided not to impute with NaN
// in this case
6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66,1/1/0001,0,0
7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777,0
Copy link
Member Author

@antoniovs1029 antoniovs1029 Jun 3, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(appended to example to allow for threaded discussion, this is how it will actually load missing columns at the end of the row)

The feature I implement on this PR means that "1,,dog" will be loaded as "1,NaN,dog" if the second column is a float/double. And to load "1,dog," as "1,dog,NaN" if the third column is a float/double. This was the behavior requested on #4132

What to do with incomplete rows was undefined in the request. I.e. what to do if there are supposed to be 6 columns but on one row there's only 2? (I.e. 1,dog, without the last separator)

In such a case, I took the decision to simply load the missing columns with default values (i.e. 0 for Float and Double) even when using the new feature of this PR.

The reason behind this is that the implementation required to load those missing columns as NaNs only for float/doubles would be somewhat hacky (and I didn't figure out how to make it work for vector types which included fields on missing columns on a given row). Besides, since it might be a fringe case. it's perhaps better not to support it.

Any thoughts on this, @justinormont @harishsk ? #Resolved

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the current behavior of the Textloader when it encounters incomplete rows? Does it thrown an error or does it load default values for missing columns?
If it is the latter, then I think we should fix it to set those columns to NaN because that is the spirit of this PR.


In reply to: 434313813 [](ancestors = 434313813)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the case of incomplete rows it returns default values, and doesn't throw, both with or without the new ImputeEmptyFloats option.

I'll update this PR to have it return NaNs, but I won't close this thread because I'd still want to leave this discussion on. Once I update this PR with that fix, we can decide if this case is worth it the changes that would need to be introduced to cover it.


In reply to: 434896173 [](ancestors = 434896173,434313813)

Copy link
Member Author

@antoniovs1029 antoniovs1029 Jun 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To fix this case I'll also need to do some checks like typeof(TValue) == typeof(float) and casts like (TValue)(obj) Single.NaN) similar to the ones described here:
#5198 (comment)

The same concerns I've mentioned there apply to this, although in here I think I can change the code to do the cast only when there are missing float/doubles columns for a given row and the ImputeEmptyFloats option is enabled.


In reply to: 435167058 [](ancestors = 435167058,434896173,434313813)

Copy link
Member Author

@antoniovs1029 antoniovs1029 Jun 9, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It turned out fixing this case was much simpler than what I had originally expected, and no need to the (TValue)(obj)Single.NaN cast I thought about doing. So I've updated mi changes to fix this case 😄


In reply to: 435219809 [](ancestors = 435219809,435167058,434896173,434313813)

// In the next case we do impute with NaN because the separator is there
8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888,NaN
9,,9.9,99.99,9/9/2009,999.999,NaN
0,"",10.10,NaN,10/10/2010,101010.101010,NaN
11,NaN,NaN,NaN,11/11/2011,NaN,Infinity
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
int,description,num1,num2,date,num3,num4
0,"this is a description",0.12,0.34,01/01/2001,0.56,0.78
0,"this has an empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111
0,"this has a quoted empty int and date", 1.1, 11.11,1/1/0001,111.111,1111.11111
1,"this has a quoted int and date", 1.1, 11.11,1/1/2001,111.111,1111.11111
2,"this has an empty num1 and a space in num3",0,22.22,2/2/2002,0,2222.2222
3,"this has an empty quoted num1 and a quoted space in num3",0,33.33,3/3/2003,0,3333.3333
4,"this has a space in num2 and a space in num4",4.4,0,4/4/2004,444.444,0
5,"this has a quoted space num2 and quoted space in num4",5.5,0,5/5/2005,555.555,0
6,"this has no date, num3 or num4 (the separator corresponding to them is also missing)",6.6,66.66,1/1/0001,0,0
7,"this has no num4 (the separator corresponding to it is missing)",7.7,77.77,7/7/2007,777.777,0
8,"this has nothing in num4, but includes the last separator",8.8,88.88,8/8/2008,888.888,0
9,,9.9,99.99,9/9/2009,999.999,NaN
0,,10.10,NaN,10/10/2010,101010.101010,NaN
11,NaN,NaN,NaN,11/11/2011,0,Infinity
Loading