Skip to content

Commit c664748

Browse files
Allow TextLoader to load empty float/double fields as NaN instead of 0 (#5198)
* Added MissingRealsAsNaNs option to TextLoader
* Added EmptyAsNaN option to DoubleParser
1 parent 9244e68 commit c664748

File tree

13 files changed

+482
-88
lines changed

13 files changed

+482
-88
lines changed

docs/code/IDataViewTypeSystem.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,10 @@ is first processed entirely as `TX` values, then parsed, or processed directly
540540
into numeric values, that is, parsing as the row is processed. In the latter
541541
case, it is simple to map implicit items (suppressed due to sparsity) to zero.
542542
In the former case, these items are first mapped to the empty text value. To
543-
get the same result, we need empty text to map to zero.
543+
get the same result, we need empty text to map to zero. An exception to this
544+
rule has been permitted in the TextLoader, where there's an option to load
545+
empty `TX` fields as `NaN` for `R4` and `R8` fields, instead of using the default
546+
conversion of empty `TX` to the numeric default `0`.
544547

545548
### To Text
546549

src/Microsoft.ML.Core/Utilities/DoubleParser.cs

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ internal enum OptionFlags : uint
2323
// a number and its decimal part). If this isn't set, then
2424
// default behavior is to use "." as decimal marker.
2525
UseCommaAsDecimalMarker = 0x01,
26+
27+
// If this flag is set, then empty spans (or those with only white-space)
28+
// will be parsed as NaN. If it isn't set, then default behavior
29+
// is to return them as 0.
30+
EmptyAsNaN = 0x02,
2631
}
2732

2833
private const ulong TopBit = 0x8000000000000000UL;
@@ -81,22 +86,22 @@ public enum Result
8186
}
8287

8388
/// <summary>
84-
/// This produces zero for an empty string.
89+
/// This produces zero for an empty string, or NaN if the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> flag is set.
8590
/// </summary>
8691
public static bool TryParse(ReadOnlySpan<char> span, out Single value, OptionFlags flags = OptionFlags.Default)
8792
{
8893
var res = Parse(span, out value, flags);
89-
Contracts.Assert(res != Result.Empty || value == 0);
94+
Contracts.Assert(res != Result.Empty || ((flags & OptionFlags.EmptyAsNaN) == 0 && value == 0) || Single.IsNaN(value));
9095
return res <= Result.Empty;
9196
}
9297

9398
/// <summary>
94-
/// This produces zero for an empty string.
99+
/// This produces zero for an empty string, or NaN if the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> flag is set.
95100
/// </summary>
96101
public static bool TryParse(ReadOnlySpan<char> span, out Double value, OptionFlags flags = OptionFlags.Default)
97102
{
98103
var res = Parse(span, out value, flags);
99-
Contracts.Assert(res != Result.Empty || value == 0);
104+
Contracts.Assert(res != Result.Empty || ((flags & OptionFlags.EmptyAsNaN) == 0 && value == 0) || Double.IsNaN(value));
100105
return res <= Result.Empty;
101106
}
102107

@@ -107,7 +112,11 @@ public static Result Parse(ReadOnlySpan<char> span, out Single value, OptionFlag
107112
{
108113
if (ich >= span.Length)
109114
{
110-
value = 0;
115+
if ((flags & OptionFlags.EmptyAsNaN) == 0)
116+
value = 0;
117+
else
118+
value = Single.NaN;
119+
111120
return Result.Empty;
112121
}
113122
if (!char.IsWhiteSpace(span[ich]))
@@ -155,7 +164,11 @@ public static Result Parse(ReadOnlySpan<char> span, out Double value, OptionFlag
155164
{
156165
if (ich >= span.Length)
157166
{
158-
value = 0;
167+
if ((flags & OptionFlags.EmptyAsNaN) == 0)
168+
value = 0;
169+
else
170+
value = Double.NaN;
171+
159172
return Result.Empty;
160173
}
161174
if (!char.IsWhiteSpace(span[ich]))

src/Microsoft.ML.Data/Data/Conversion.cs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1369,7 +1369,8 @@ private void TryParseSigned(long max, in TX text, out long? result)
13691369
}
13701370

13711371
/// <summary>
1372-
/// This produces zero for empty. It returns false if the text is not parsable.
1372+
/// This produces zero for empty text, or NaN if the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> flag is in use.
1373+
/// It returns false if the text is not parsable.
13731374
/// On failure, it sets dst to the NA value.
13741375
/// </summary>
13751376
public bool TryParse(in TX src, out R4 dst)
@@ -1382,7 +1383,8 @@ public bool TryParse(in TX src, out R4 dst)
13821383
}
13831384

13841385
/// <summary>
1385-
/// This produces zero for empty. It returns false if the text is not parsable.
1386+
/// This produces zero for empty text, or NaN if the <see cref="DoubleParser.OptionFlags.EmptyAsNaN"/> flag is in use.
1387+
/// It returns false if the text is not parsable.
13861388
/// On failure, it sets dst to the NA value.
13871389
/// </summary>
13881390
public bool TryParse(in TX src, out R8 dst)
@@ -1394,6 +1396,9 @@ public bool TryParse(in TX src, out R8 dst)
13941396
return IsStdMissing(ref span);
13951397
}
13961398

1399+
/// <summary>
1400+
/// This produces default for empty.
1401+
/// </summary>
13971402
public bool TryParse(in TX src, out TS dst)
13981403
{
13991404
if (src.IsEmpty)
@@ -1408,6 +1413,9 @@ public bool TryParse(in TX src, out TS dst)
14081413
return false;
14091414
}
14101415

1416+
/// <summary>
1417+
/// This produces default for empty.
1418+
/// </summary>
14111419
public bool TryParse(in TX src, out DT dst)
14121420
{
14131421
if (src.IsEmpty)
@@ -1422,6 +1430,9 @@ public bool TryParse(in TX src, out DT dst)
14221430
return false;
14231431
}
14241432

1433+
/// <summary>
1434+
/// This produces default for empty.
1435+
/// </summary>
14251436
public bool TryParse(in TX src, out DZ dst)
14261437
{
14271438
if (src.IsEmpty)

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -433,10 +433,9 @@ public class Options
433433
/// </summary>
434434
[Argument(ArgumentType.AtMostOnce,
435435
HelpText =
436-
"Whether the input may include quoted values, which can contain separator characters, colons," +
437-
" and distinguish empty values from missing values. When true, consecutive separators denote a" +
438-
" missing value and an empty value is denoted by \"\". When false, consecutive separators" +
439-
" denote an empty value.",
436+
"Whether the input may include double-quoted values. This parameter is used to distinguish separator characters in an input value " +
437+
"from actual separators. When true, separators within double quotes are treated as part of the input value. When false, all " +
438+
"separators, even those within quotes, are treated as delimiting a new column.",
440439
ShortName = "quote")]
441440
public bool AllowQuoting = Defaults.AllowQuoting;
442441

@@ -533,6 +532,15 @@ public class Options
533532
[Argument(ArgumentType.AtMostOnce, HelpText = "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", ShortName = "escapechar")]
534533
public char EscapeChar = Defaults.EscapeChar;
535534

535+
/// <summary>
536+
/// If true, missing real fields (i.e. double or single fields) will be loaded as NaN.
537+
/// If false, they'll be loaded as 0. Default is false.
538+
/// A field is considered "missing" if it's empty, if it only has whitespace, or if there are missing columns
539+
/// at the end of a given row.
540+
/// </summary>
541+
[Argument(ArgumentType.AtMostOnce, HelpText = "If true, empty float fields will be loaded as NaN. If false, they'll be loaded as 0. Default is false.", ShortName = "missingrealnan")]
542+
public bool MissingRealsAsNaNs = Defaults.MissingRealsAsNaNs;
543+
536544
/// <summary>
537545
/// Checks that all column specifications are valid (that is, ranges are disjoint and have min&lt;=max).
538546
/// </summary>
@@ -552,6 +560,7 @@ internal static class Defaults
552560
internal const bool TrimWhitespace = false;
553561
internal const bool ReadMultilines = false;
554562
internal const char EscapeChar = '"';
563+
internal const bool MissingRealsAsNaNs = false;
555564
}
556565

557566
/// <summary>
@@ -1078,7 +1087,8 @@ private static VersionInfo GetVersionInfo()
10781087
//verWrittenCur: 0x0001000A, // Added ForceVector in Range
10791088
//verWrittenCur: 0x0001000B, // Header now retained if used and present
10801089
//verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
1081-
verWrittenCur: 0x0001000D, // Added escapeChar option and decimal marker option to allow for ',' to be a decimal marker
1090+
//verWrittenCur: 0x0001000D, // Added escapeChar and decimalMarker chars
1091+
verWrittenCur: 0x0001000E, // Added MissingRealsAsNaNs flag
10821092
verReadableCur: 0x0001000A,
10831093
verWeCanReadBack: 0x00010009,
10841094
loaderSignature: LoaderSignature,
@@ -1097,7 +1107,8 @@ private enum OptionFlags : uint
10971107
AllowQuoting = 0x04,
10981108
AllowSparse = 0x08,
10991109
ReadMultilines = 0x10,
1100-
All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines
1110+
MissingRealsAsNaNs = 0x20,
1111+
All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines | MissingRealsAsNaNs
11011112
}
11021113

11031114
// This is reserved to mean the range extends to the end (the segment is variable).
@@ -1179,6 +1190,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
11791190
_flags |= OptionFlags.AllowSparse;
11801191
if (options.AllowQuoting && options.ReadMultilines)
11811192
_flags |= OptionFlags.ReadMultilines;
1193+
if (options.MissingRealsAsNaNs)
1194+
_flags |= OptionFlags.MissingRealsAsNaNs;
11821195

11831196
// REVIEW: This should be persisted (if it should be maintained).
11841197
_maxRows = options.MaxRows ?? long.MaxValue;
@@ -1407,7 +1420,25 @@ private TextLoader(IHost host, ModelLoadContext ctx)
14071420
_maxRows = ctx.Reader.ReadInt64();
14081421
host.CheckDecode(_maxRows > 0);
14091422
_flags = (OptionFlags)ctx.Reader.ReadUInt32();
1410-
host.CheckDecode((_flags & ~OptionFlags.All) == 0);
1423+
1424+
// Flags introduced with the first ML.NET commit:
1425+
var acceptableFlags = OptionFlags.TrimWhitespace;
1426+
acceptableFlags |= OptionFlags.HasHeader;
1427+
acceptableFlags |= OptionFlags.AllowQuoting;
1428+
acceptableFlags |= OptionFlags.AllowSparse;
1429+
1430+
// Flags added in later versions of TextLoader:
1431+
if(ctx.Header.ModelVerWritten >= 0x0001000C)
1432+
{
1433+
acceptableFlags |= OptionFlags.ReadMultilines;
1434+
}
1435+
if(ctx.Header.ModelVerWritten >= 0x0001000E)
1436+
{
1437+
acceptableFlags |= OptionFlags.MissingRealsAsNaNs;
1438+
}
1439+
1440+
host.CheckDecode((_flags & ~acceptableFlags) == 0);
1441+
14111442
_inputSize = ctx.Reader.ReadInt32();
14121443
host.CheckDecode(0 <= _inputSize && _inputSize < SrcLim);
14131444

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,8 @@ private abstract class ColumnPipe
228228

229229
public abstract bool HasNA { get; }
230230

231+
public abstract bool IsReal { get; } // If the type of the ColumnPipe is either Single or Double
232+
231233
protected ColumnPipe(RowSet rows)
232234
{
233235
Contracts.AssertValue(rows);
@@ -251,6 +253,8 @@ private sealed class PrimitivePipe<TResult> : ColumnPipe
251253

252254
public override bool HasNA { get; }
253255

256+
public override bool IsReal { get; }
257+
254258
public PrimitivePipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper<TResult> conv)
255259
: base(rows)
256260
{
@@ -259,6 +263,7 @@ public PrimitivePipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper<TRe
259263
_conv = conv;
260264
_values = new TResult[Rows.Count];
261265
HasNA = Conversions.DefaultInstance.TryGetIsNAPredicate(type, out var del);
266+
IsReal = typeof(TResult) == typeof(Single) || typeof(TResult) == typeof(Double);
262267
}
263268

264269
public override void Reset(int irow, int size)
@@ -295,6 +300,8 @@ private sealed class VectorPipe<TItem> : ColumnPipe
295300

296301
public override bool HasNA { get; }
297302

303+
public override bool IsReal { get; }
304+
298305
private class VectorValue
299306
{
300307
private readonly VectorPipe<TItem> _pipe;
@@ -441,6 +448,7 @@ public VectorPipe(RowSet rows, PrimitiveDataViewType type, TryParseMapper<TItem>
441448
for (int i = 0; i < _values.Length; i++)
442449
_values[i] = new VectorValue(this);
443450
HasNA = Conversions.DefaultInstance.TryGetIsNAPredicate(type, out var del);
451+
IsReal = typeof(TItem) == typeof(Single) || typeof(TItem) == typeof(Double);
444452
}
445453

446454
public override void Reset(int irow, int size)
@@ -649,6 +657,7 @@ public void Clear()
649657

650658
private readonly char[] _separators;
651659
private readonly OptionFlags _flags;
660+
private readonly bool _missingRealsAsNaNs;
652661
private readonly char _escapeChar;
653662
private readonly int _inputSize;
654663
private readonly ColInfo[] _infos;
@@ -659,6 +668,8 @@ public void Clear()
659668
private volatile int _csrc;
660669
private volatile int _mismatchCount;
661670

671+
private ReadOnlyMemory<char> _blank;
672+
662673
public Parser(TextLoader parent)
663674
{
664675
Contracts.AssertValue(parent);
@@ -671,6 +682,8 @@ public Parser(TextLoader parent)
671682
var doubleParserOptionFlags = DoubleParser.OptionFlags.Default;
672683
if (parent._decimalMarker == ',')
673684
doubleParserOptionFlags |= DoubleParser.OptionFlags.UseCommaAsDecimalMarker;
685+
if ((parent._flags & OptionFlags.MissingRealsAsNaNs) != 0)
686+
doubleParserOptionFlags |= DoubleParser.OptionFlags.EmptyAsNaN;
674687

675688
if (doubleParserOptionFlags == DoubleParser.OptionFlags.Default)
676689
cache = ValueCreatorCache.DefaultInstance;
@@ -713,6 +726,8 @@ public Parser(TextLoader parent)
713726
_flags = parent._flags;
714727
_escapeChar = parent._escapeChar;
715728
_inputSize = parent._inputSize;
729+
_missingRealsAsNaNs = (parent._flags & OptionFlags.MissingRealsAsNaNs) != 0;
730+
_blank = ReadOnlyMemory<char>.Empty;
716731
Contracts.Assert(_inputSize >= 0);
717732
}
718733

@@ -900,6 +915,7 @@ private sealed class HelperImpl : Helper
900915
private readonly int _srcNeeded;
901916
private readonly bool _quoting;
902917
private readonly bool _sparse;
918+
private readonly bool _keepEmpty;
903919
// This is a working buffer.
904920
private readonly StringBuilder _sb;
905921

@@ -930,6 +946,11 @@ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, char escapeC
930946
_sb = new StringBuilder();
931947
_blank = ReadOnlyMemory<char>.Empty;
932948
Fields = new FieldSet();
949+
950+
// If we want to impute empty real fields as NaNs, then we must keep
951+
// all empty field spans, as there's no way for the Parser.HelperImpl
952+
// to know beforehand which spans belong to a float column.
953+
_keepEmpty = (flags & OptionFlags.MissingRealsAsNaNs) != 0;
933954
}
934955

935956
/// <summary>
@@ -978,6 +999,13 @@ public int GatherFields(ReadOnlyMemory<char> lineSpan, ReadOnlySpan<char> span,
978999
Fields.Spans[Fields.Count] = scan.Span;
9791000
Fields.Indices[Fields.Count++] = src;
9801001
}
1002+
else if(_keepEmpty)
1003+
{
1004+
Fields.EnsureSpace();
1005+
Fields.Spans[Fields.Count] = _blank;
1006+
Fields.Indices[Fields.Count++] = src;
1007+
}
1008+
9811009
if (++src > _srcNeeded || !more)
9821010
break;
9831011
}
@@ -1390,10 +1418,10 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v,
13901418
int sizeSeg = lim - min;
13911419
Contracts.Assert(ivDst <= size - sizeSeg);
13921420

1421+
int indexBase = ivDst - min;
13931422
int isrc = fields.Indices.FindIndexSorted(0, fields.Count, min);
13941423
if (isrc < fields.Count && fields.Indices[isrc] < lim)
13951424
{
1396-
int indexBase = ivDst - min;
13971425
int isrcLim = fields.Indices.FindIndexSorted(isrc, fields.Count, lim);
13981426
Contracts.Assert(isrc < isrcLim);
13991427
for (; isrc < isrcLim; isrc++)
@@ -1408,6 +1436,19 @@ private void ProcessVec(int srcLim, FieldSet fields, ColInfo info, ColumnPipe v,
14081436
}
14091437
}
14101438
}
1439+
1440+
if(_missingRealsAsNaNs && isrc >= fields.Count && v.IsReal)
1441+
{
1442+
// If the user has set the MissingRealsAsNaNs option to true
1443+
// and there are missing columns on a given row, then we load
1444+
// them as if they were empty (i.e. _blank) fields, so that they
1445+
// can be loaded as NaNs. This branch only runs for single/double
1446+
// columns (v.IsReal); other column types are not touched here.
1447+
for (int srcCur = Math.Max(min, fields.Count); srcCur < lim; srcCur++)
1448+
{
1449+
v.Consume(irow, indexBase + srcCur, ref _blank);
1450+
}
1451+
}
14111452
ivDst += sizeSeg;
14121453
}
14131454
Contracts.Assert(ivDst == size);
@@ -1430,6 +1471,15 @@ private void ProcessOne(FieldSet vs, ColInfo info, ColumnPipe v, int irow, long
14301471
v.Rows.Stats.LogBadValue(line, info.Name);
14311472
}
14321473
}
1474+
else if(_missingRealsAsNaNs && v.IsReal)
1475+
{
1476+
// If the user has set the MissingRealsAsNaNs option to true
1477+
// and there are missing columns on a given row, then we load
1478+
// them as if they were empty (i.e. _blank) fields, so that they
1479+
// can be loaded as NaNs. This branch only runs for single/double
1480+
// columns (v.IsReal); other column types fall through to Reset.
1481+
v.Consume(irow, 0, ref _blank);
1482+
}
14331483
else
14341484
v.Reset(irow, 0);
14351485
}

0 commit comments

Comments
 (0)