Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add escapeChar support to TextLoader and added benchmark for TextLoader #5147

Merged
merged 12 commits into from
May 21, 2020
35 changes: 31 additions & 4 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,12 @@ public class Options
[Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of rows to produce", ShortName = "rows", Hide = true)]
public long? MaxRows;

/// <summary>
/// Character to use to escape quotes inside quoted fields. It can't be a character used as separator.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", ShortName = "escapechar")]
public char EscapeChar = Defaults.EscapeChar;

/// <summary>
/// Checks that all column specifications are valid (that is, ranges are disjoint and have min&lt;=max).
/// </summary>
Expand All @@ -538,6 +544,7 @@ internal static class Defaults
internal const bool HasHeader = false;
internal const bool TrimWhitespace = false;
internal const bool ReadMultilines = false;
internal const char EscapeChar = '"';
}

/// <summary>
Expand Down Expand Up @@ -702,11 +709,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile,
ch.Assert(0 <= inputSize & inputSize < SrcLim);
List<ReadOnlyMemory<char>> lines = null;
if (headerFile != null)
Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines);
Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines);
if (needInputSize && inputSize == 0)
Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines);
Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines);
else if (headerFile == null && parent.HasHeader)
Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines);
Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines);

if (needInputSize && inputSize == 0)
{
Expand Down Expand Up @@ -1063,7 +1070,8 @@ private static VersionInfo GetVersionInfo()
// verWrittenCur: 0x00010009, // Introduced _flags
//verWrittenCur: 0x0001000A, // Added ForceVector in Range
//verWrittenCur: 0x0001000B, // Header now retained if used and present
verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
//verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
verWrittenCur: 0x0001000D, // Added escapeChar option
verReadableCur: 0x0001000A,
verWeCanReadBack: 0x00010009,
loaderSignature: LoaderSignature,
Expand All @@ -1090,6 +1098,7 @@ private enum OptionFlags : uint

private readonly bool _useThreads;
private readonly OptionFlags _flags;
private readonly char _escapeChar;
private readonly long _maxRows;
// Input size is zero for unknown - determined by the data (including sparse rows).
private readonly int _inputSize;
Expand Down Expand Up @@ -1210,6 +1219,10 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
}
}

_escapeChar = options.EscapeChar;
if(_separators.Contains(_escapeChar))
throw _host.ExceptUserArg(nameof(Options.EscapeChar), "EscapeChar '{0}' can't be used both as EscapeChar and separator", _escapeChar);

_bindings = new Bindings(this, cols, headerFile, dataSample);
_parser = new Parser(this);
}
Expand Down Expand Up @@ -1373,6 +1386,7 @@ private TextLoader(IHost host, ModelLoadContext ctx)
// int: inputSize: 0 for determined from data
// int: number of separators
// char[]: separators
// char: escapeChar
// bindings
int cbFloat = ctx.Reader.ReadInt32();
host.CheckDecode(cbFloat == sizeof(float));
Expand All @@ -1397,6 +1411,17 @@ private TextLoader(IHost host, ModelLoadContext ctx)
if (_separators.Contains(':'))
host.CheckDecode((_flags & OptionFlags.AllowSparse) == 0);

if (ctx.Header.ModelVerWritten >= 0x0001000D)
{
_escapeChar = ctx.Reader.ReadChar();
}
else
{
_escapeChar = Defaults.EscapeChar;
}

host.CheckDecode(!_separators.Contains(_escapeChar));

_bindings = new Bindings(ctx, this);
_parser = new Parser(this);
}
Expand Down Expand Up @@ -1437,6 +1462,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
// int: inputSize: 0 for determined from data
// int: number of separators
// char[]: separators
// char: escapeChar
// bindings
ctx.Writer.Write(sizeof(float));
ctx.Writer.Write(_maxRows);
Expand All @@ -1445,6 +1471,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
_host.Assert(0 <= _inputSize && _inputSize < SrcLim);
ctx.Writer.Write(_inputSize);
ctx.Writer.WriteCharArray(_separators);
ctx.Writer.Write(_escapeChar);

_bindings.Save(ctx);
}
Expand Down
110 changes: 75 additions & 35 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil
SetupCursor(parent, active, 0, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);

var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1);
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._escapeChar, parent._maxRows, 1);
var stats = new ParseStats(parent._host, 1);
return new Cursor(parent, stats, active, reader, srcNeeded, cthd);
}
Expand All @@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc
SetupCursor(parent, active, n, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);

var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd);
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._escapeChar, parent._maxRows, cthd);
var stats = new ParseStats(parent._host, cthd);
if (cthd <= 1)
return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) };
Expand Down Expand Up @@ -205,7 +205,7 @@ public override ValueGetter<DataViewRowId> GetIdGetter()
};
}

public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List<ReadOnlyMemory<char>> lines)
public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, char escapeChar, ref List<ReadOnlyMemory<char>> lines)
{
Contracts.AssertValue(source);
Contracts.Assert(count > 0);
Expand All @@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, bool readM
count = 2;

LineBatch batch;
var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1);
var reader = new LineReader(source, count, 1, false, readMultilines, separators, escapeChar, count, 1);
try
{
batch = reader.GetBatch();
Expand Down Expand Up @@ -404,6 +404,7 @@ private sealed class LineReader
private readonly bool _hasHeader;
private readonly bool _readMultilines;
private readonly char[] _separators;
private readonly char _escapeChar;
private readonly int _batchSize;
private readonly IMultiStreamSource _files;

Expand All @@ -413,7 +414,7 @@ private sealed class LineReader
private Task _thdRead;
private volatile bool _abort;

public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref)
public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, char escapeChar, long limit, int cref)
{
// Note that files is allowed to be empty.
Contracts.AssertValue(files);
Expand All @@ -428,6 +429,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has
_batchSize = batchSize;
_readMultilines = readMultilines;
_separators = separators;
_escapeChar = escapeChar;
_files = files;
_cref = cref;

Expand Down Expand Up @@ -474,15 +476,19 @@ private class MultiLineReader
private readonly char _sep0;
private readonly char[] _separators;
private readonly bool _sepsContainsSpace;
private readonly char _escapeChar;
private readonly bool _escapeCharIsDoubleQuote;
private readonly StringBuilder _sb;
private readonly TextReader _rdr;

public MultiLineReader(TextReader rdr, char[] separators)
public MultiLineReader(TextReader rdr, char[] separators, char escapeChar)
{
Contracts.AssertNonEmpty(separators);
_sep0 = separators[0];
_separators = separators;
_sepsContainsSpace = IsSep(' ');
_escapeChar = escapeChar;
_escapeCharIsDoubleQuote = (escapeChar == '"');
_sb = new StringBuilder();
_rdr = rdr;
}
Expand Down Expand Up @@ -569,52 +575,86 @@ private bool FieldIncludesNewLine(ref string line, ref int ichCur, int ichLim,
ichCur++;
}

if (ichCur >= ichLim) // if there were only leading spaces on the line
return startsInsideQuoted;

if(startsInsideQuoted || line[ichCur] == '"')
{
// Quoted Field Case

if (!startsInsideQuoted)
ichCur++;

for (; ; ichCur++)
if (_escapeCharIsDoubleQuote)
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to refactor these two if/else blocks into separate functions for the sake of readability? (Or even better merge them into a single function/codepath?

Copy link
Member Author

@antoniovs1029 antoniovs1029 May 21, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is possible to have this refactored into two functions or even into one path.

I didn't do this because the original writers of the Parser seemed to be very worried about efficiency and the way they addressed this is by avoiding creating / calling functions, and using bools such as the one I've used here for _escapeCharIsDoubleQuote.

So for example, we have the following code written by them, which can be obviously be refactored into a function, and perhaps even simplified to avoid the first 2 if branches, but it's stated that they didn't do it because it was "performance critical"

// Please note that these branched tight loops are intended and performance critical.
if (_seps.Length == 1)
{
for (; ; ichCur++)
{
Contracts.Assert(ichCur <= ichLim);
if (ichCur >= ichLim)
break;
if (_sep0 == span[ichCur])
break;
}
}
else if (_seps.Length == 2)
{
for (; ; ichCur++)
{
Contracts.Assert(ichCur <= ichLim);
if (ichCur >= ichLim)
break;
if (_sep0 == span[ichCur] || _sep1 == span[ichCur])
break;
}
}
else
{
for (; ; ichCur++)
{
Contracts.Assert(ichCur <= ichLim);
if (ichCur >= ichLim)
break;
if (IsSep(span[ichCur]))
break;
}
}

Following that pattern I thought that the fastest way for the code I am writing to run is to branch earlier with _escapeCharIsDoubleQuote only requiring to check it once... because if I refactor it to call functions or even worse if I merge into a single path I'll have to continuously check if _escapeCharIsDoubleQuote for it to work correctly, then it would have to take more steps and probably have a toll on performance. Also, personally I think that a merged path would be harder to read.

Do you still want me to try to refactor it? If so, I can try to come up with a refactored version for us to compare.


In reply to: 428386984 [](ancestors = 428386984)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't find any benchmark tests for text loader in the code. Can you please look up and see if I have missed something?


In reply to: 428393614 [](ancestors = 428393614,428386984)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there's any benchmark that only tests text loader. But I think it's a good idea to add one, so I'll do that.

In the meantime, I actually used the FeaturizeTextBench (which only loads a syntetic dataset and featurizes it) to run benchmarks on my previous readMultilines PR:
#5125 (comment)

As I mention there, I'm not sure how to interpret the results, so I will address this tomorrow on scrum.


In reply to: 428398494 [](ancestors = 428398494,428393614,428386984)

Copy link
Member Author

@antoniovs1029 antoniovs1029 May 21, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I've added a new TextLoaderBench Benchmark test, based on TextFeaturizerBench and CacheDataViewBench. On my new benchmark I simply load a DataView using TextLoader from a syntetic file, get the getter of its first 20 columns and read its 3000 rows without saving them into memory.

The results are as follows. Notice that I still don't understand how to interpret them, as they're somewhat unexpected... it seems they indicate there's not much change across runs, but it might be just as well that perhaps my benchmark isn't conclusive on what it's measuring. I wouldn't know 😕

(notice that even when the mean is very similar across scenarios, the Error is consistently higher without my PRs)

Without this PR, and also without my readMultilines PR

|                Method |    Mean |   Error |   StdDev | Extra Metric |
|---------------------- |--------:|--------:|---------:|-------------:|
| TestTextLoaderGetters | 1.256 s | 3.273 s | 0.1794 s |            - |

With this PR, with readMultilines = false and escapeChar =" (i.e. with defaults)

|                Method |     Mean |    Error |   StdDev | Extra Metric |
|---------------------- |---------:|---------:|---------:|-------------:|
| TestTextLoaderGetters | 810.5 ms | 765.0 ms | 41.93 ms |            - |

With this PR, readMultilines = true, escapeChar = "

|                Method |    Mean |    Error |   StdDev | Extra Metric |
|---------------------- |--------:|---------:|---------:|-------------:|
| TestTextLoaderGetters | 1.089 s | 0.7992 s | 0.0438 s |            - |

With this PR, readMultilines = true, escapeChar = \

|                Method |     Mean |    Error |   StdDev | Extra Metric |
|---------------------- |---------:|---------:|---------:|-------------:|
| TestTextLoaderGetters | 857.9 ms | 696.7 ms | 38.19 ms |            - |
// * Legends *
  Mean         : Arithmetic mean of all measurements
  Error        : Half of 99.9% confidence interval
  StdDev       : Standard deviation of all measurements
  Extra Metric : Value of the provided extra metric
  1 s          : 1 Second (1 sec)


In reply to: 428493474 [](ancestors = 428493474,428398494,428393614,428386984)

if (ichCur >= ichLim)
// We've reached the end of the line without finding the closing quote,
// so next line will start on this quoted field
return true;

if (line[ichCur] == '"')
for (; ; ichCur++)
{
if (++ichCur >= ichLim)
// Last character in line was the closing quote of the field
return false;
if (ichCur >= ichLim)
// We've reached the end of the line without finding the closing quote,
// so next line will start on this quoted field
return true;

if (line[ichCur] == '"')
// 2 Double quotes means escaped quote
continue;
{
if (++ichCur >= ichLim)
// Last character in line was the closing quote of the field
return false;

// If it wasn't an escaped quote, then this is supposed to be
// the closing quote of the field, and there should only be spaces remaining
// until the next separator.
if (line[ichCur] == '"')
// 2 Double quotes means escaped quote
continue;

if (!_sepsContainsSpace)
{
// Ignore leading spaces
while (ichCur < ichLim && line[ichCur] == ' ')
ichCur++;
// If it wasn't an escaped quote, then this is supposed to be
// the closing quote of the field
break;
}
}
}
else
{
for (; ; ichCur++)
{
if (ichCur >= ichLim)
// We've reached the end of the line without finding the closing quote,
// so next line will start on this quoted field
return true;

// If there's anything else than spaces or the next separator,
// this will actually be a QuotingError on the parser, so we decide that this
// line contains a quoting error, and so it's not going to be considered a valid field
// and the rest of the line should be ignored.
if (ichCur >= ichLim || IsSep(line[ichCur]))
return false;
if (line[ichCur] == _escapeChar)
{
if (++ichCur >= ichLim)
// Last character in line was escapeChar
return true;

quotingError = true;
return false;
// Whatever char comes after an escapeChar is ignored
continue;
}
else if (line[ichCur] == '"')
{
// Since this wasn't an escaped quote, then this is supposed to be
// the closing quote of the field
break;
}
}
}

// After finding the closing quote of the field...
// There should only be empty spaces until the next separator
if (!_sepsContainsSpace)
{
// Ignore leading spaces
while (ichCur < ichLim && line[ichCur] == ' ')
ichCur++;
}

// If there's anything else than spaces or the next separator,
// this will actually be a QuotingError on the parser, so we decide that this
// line contains a quoting error, and so it's not going to be considered a valid field
// and the rest of the line should be ignored.
if (ichCur >= ichLim || IsSep(line[ichCur]))
return false;

quotingError = true;
return false;
}

// Unquoted field case.
Expand Down Expand Up @@ -655,7 +695,7 @@ private void ThreadProc()
string path = _files.GetPathOrNull(ifile);
using (var rdr = _files.OpenTextReader(ifile))
{
var multilineReader = new MultiLineReader(rdr, _separators);
var multilineReader = new MultiLineReader(rdr, _separators, _escapeChar);
string text;
long line = 0;
for (; ; )
Expand Down
Loading