dotnet · harishsk · May 21, 2020 · May 20, 2020 · May 20, 2020 · May 20, 2020
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -521,6 +521,12 @@ public class Options
             [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of rows to produce", ShortName = "rows", Hide = true)]
             public long? MaxRows;
 
+            /// <summary>
+            /// Character to use to escape quotes inside quoted fields. It can't be a character used as separator.
+            /// </summary>
+            [Argument(ArgumentType.AtMostOnce, HelpText = "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", ShortName = "escapechar")]
+            public char EscapeChar = Defaults.EscapeChar;
+
             /// <summary>
             /// Checks that all column specifications are valid (that is, ranges are disjoint and have min&lt;=max).
             /// </summary>
@@ -538,6 +544,7 @@ internal static class Defaults
             internal const bool HasHeader = false;
             internal const bool TrimWhitespace = false;
             internal const bool ReadMultilines = false;
+            internal const char EscapeChar = '"';
         }
 
         /// <summary>
@@ -702,11 +709,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile,
                     ch.Assert(0 <= inputSize & inputSize < SrcLim);
                     List<ReadOnlyMemory<char>> lines = null;
                     if (headerFile != null)
-                        Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines);
+                        Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines);
                     if (needInputSize && inputSize == 0)
-                        Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines);
+                        Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines);
                     else if (headerFile == null && parent.HasHeader)
-                        Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines);
+                        Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines);
 
                     if (needInputSize && inputSize == 0)
                     {
@@ -1063,7 +1070,8 @@ private static VersionInfo GetVersionInfo()
                 // verWrittenCur: 0x00010009, // Introduced _flags
                 //verWrittenCur: 0x0001000A, // Added ForceVector in Range
                 //verWrittenCur: 0x0001000B, // Header now retained if used and present
-                verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
+                //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
+                verWrittenCur: 0x0001000D, // Added escapeChar option
                 verReadableCur: 0x0001000A,
                 verWeCanReadBack: 0x00010009,
                 loaderSignature: LoaderSignature,
@@ -1090,6 +1098,7 @@ private enum OptionFlags : uint
 
         private readonly bool _useThreads;
         private readonly OptionFlags _flags;
+        private readonly char _escapeChar;
         private readonly long _maxRows;
         // Input size is zero for unknown - determined by the data (including sparse rows).
         private readonly int _inputSize;
@@ -1210,6 +1219,10 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
                 }
             }
 
+            _escapeChar = options.EscapeChar;
+            if(_separators.Contains(_escapeChar))
+                throw _host.ExceptUserArg(nameof(Options.EscapeChar), "EscapeChar '{0}' can't be used both as EscapeChar and separator", _escapeChar);
+
             _bindings = new Bindings(this, cols, headerFile, dataSample);
             _parser = new Parser(this);
         }
@@ -1373,6 +1386,7 @@ private TextLoader(IHost host, ModelLoadContext ctx)
             // int: inputSize: 0 for determined from data
             // int: number of separators
             // char[]: separators
+            // char: escapeChar
             // bindings
             int cbFloat = ctx.Reader.ReadInt32();
             host.CheckDecode(cbFloat == sizeof(float));
@@ -1397,6 +1411,17 @@ private TextLoader(IHost host, ModelLoadContext ctx)
             if (_separators.Contains(':'))
                 host.CheckDecode((_flags & OptionFlags.AllowSparse) == 0);
 
+            if (ctx.Header.ModelVerWritten >= 0x0001000D)
+            {
+                _escapeChar = ctx.Reader.ReadChar();
+            }
+            else
+            {
+                _escapeChar = Defaults.EscapeChar;
+            }
+
+            host.CheckDecode(!_separators.Contains(_escapeChar));
+
             _bindings = new Bindings(ctx, this);
             _parser = new Parser(this);
         }
@@ -1437,6 +1462,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
             // int: inputSize: 0 for determined from data
             // int: number of separators
             // char[]: separators
+            // char: escapeChar
             // bindings
             ctx.Writer.Write(sizeof(float));
             ctx.Writer.Write(_maxRows);
@@ -1445,6 +1471,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
             _host.Assert(0 <= _inputSize && _inputSize < SrcLim);
             ctx.Writer.Write(_inputSize);
             ctx.Writer.WriteCharArray(_separators);
+            ctx.Writer.Write(_escapeChar);
 
             _bindings.Save(ctx);
         }

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
@@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil
                 SetupCursor(parent, active, 0, out srcNeeded, out cthd);
                 Contracts.Assert(cthd > 0);
 
-                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1);
+                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._escapeChar, parent._maxRows, 1);
                 var stats = new ParseStats(parent._host, 1);
                 return new Cursor(parent, stats, active, reader, srcNeeded, cthd);
             }
@@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc
                 SetupCursor(parent, active, n, out srcNeeded, out cthd);
                 Contracts.Assert(cthd > 0);
 
-                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd);
+                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._escapeChar, parent._maxRows, cthd);
                 var stats = new ParseStats(parent._host, cthd);
                 if (cthd <= 1)
                     return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) };
@@ -205,7 +205,7 @@ public override ValueGetter<DataViewRowId> GetIdGetter()
                     };
             }
 
-            public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List<ReadOnlyMemory<char>> lines)
+            public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, char escapeChar, ref List<ReadOnlyMemory<char>> lines)
             {
                 Contracts.AssertValue(source);
                 Contracts.Assert(count > 0);
@@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, bool readM
                     count = 2;
 
                 LineBatch batch;
-                var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1);
+                var reader = new LineReader(source, count, 1, false, readMultilines, separators, escapeChar, count, 1);
                 try
                 {
                     batch = reader.GetBatch();
@@ -404,6 +404,7 @@ private sealed class LineReader
                 private readonly bool _hasHeader;
                 private readonly bool _readMultilines;
                 private readonly char[] _separators;
+                private readonly char _escapeChar;
                 private readonly int _batchSize;
                 private readonly IMultiStreamSource _files;
 
@@ -413,7 +414,7 @@ private sealed class LineReader
                 private Task _thdRead;
                 private volatile bool _abort;
 
-                public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref)
+                public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, char escapeChar, long limit, int cref)
                 {
                     // Note that files is allowed to be empty.
                     Contracts.AssertValue(files);
@@ -428,6 +429,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has
                     _batchSize = batchSize;
                     _readMultilines = readMultilines;
                     _separators = separators;
+                    _escapeChar = escapeChar;
                     _files = files;
                     _cref = cref;
 
@@ -474,15 +476,19 @@ private class MultiLineReader
                     private readonly char _sep0;
                     private readonly char[] _separators;
                     private readonly bool _sepsContainsSpace;
+                    private readonly char _escapeChar;
+                    private readonly bool _escapeCharIsDoubleQuote;
                     private readonly StringBuilder _sb;
                     private readonly TextReader _rdr;
 
-                    public MultiLineReader(TextReader rdr, char[] separators)
+                    public MultiLineReader(TextReader rdr, char[] separators, char escapeChar)
                     {
                         Contracts.AssertNonEmpty(separators);
                         _sep0 = separators[0];
                         _separators = separators;
                         _sepsContainsSpace = IsSep(' ');
+                        _escapeChar = escapeChar;
+                        _escapeCharIsDoubleQuote = (escapeChar == '"');
                         _sb = new StringBuilder();
                         _rdr = rdr;
                     }
@@ -569,52 +575,86 @@ private bool FieldIncludesNewLine(ref string line, ref int ichCur, int ichLim,
                                 ichCur++;
                         }
 
+                        if (ichCur >= ichLim) // if there were only leading spaces on the line
+                            return startsInsideQuoted;
+
                         if(startsInsideQuoted || line[ichCur] == '"')
                         {
                             // Quoted Field Case
 
                             if (!startsInsideQuoted)
                                 ichCur++;
 
-                            for (; ; ichCur++)
+                            if (_escapeCharIsDoubleQuote)
                             {
 // Please note that these branched tight loops are intended and performance critical. 
 if (_seps.Length == 1) 
 { 
     for (; ; ichCur++) 
     { 
         Contracts.Assert(ichCur <= ichLim); 
         if (ichCur >= ichLim) 
             break; 
         if (_sep0 == span[ichCur]) 
             break; 
     } 
 } 
 else if (_seps.Length == 2) 
 { 
     for (; ; ichCur++) 
     { 
         Contracts.Assert(ichCur <= ichLim); 
         if (ichCur >= ichLim) 
             break; 
         if (_sep0 == span[ichCur] || _sep1 == span[ichCur]) 
             break; 
     } 
 } 
 else 
 { 
     for (; ; ichCur++) 
     { 
         Contracts.Assert(ichCur <= ichLim); 
         if (ichCur >= ichLim) 
             break; 
         if (IsSep(span[ichCur])) 
             break; 
     } 
 } 
 // Please note that these branched tight loops are intended and performance critical. 
 if (_seps.Length == 1) 
 { 
     for (; ; ichCur++) 
     { 
         Contracts.Assert(ichCur <= ichLim); 
         if (ichCur >= ichLim) 
             break; 
         if (_sep0 == span[ichCur]) 
             break; 
     } 
 } 
 else if (_seps.Length == 2) 
 { 
     for (; ; ichCur++) 
     { 
         Contracts.Assert(ichCur <= ichLim); 
         if (ichCur >= ichLim) 
             break; 
         if (_sep0 == span[ichCur] || _sep1 == span[ichCur]) 
             break; 
     } 
 } 
 else 
 { 
     for (; ; ichCur++) 
     { 
         Contracts.Assert(ichCur <= ichLim); 
         if (ichCur >= ichLim) 
             break; 
         if (IsSep(span[ichCur])) 
             break; 
     } 
 } 
-                                if (ichCur >= ichLim)
-                                    // We've reached the end of the line without finding the closing quote,
-                                    // so next line will start on this quoted field
-                                    return true;
-
-                                if (line[ichCur] == '"')
+                                for (; ; ichCur++)
                                 {
-                                    if (++ichCur >= ichLim)
-                                        // Last character in line was the closing quote of the field
-                                        return false;
+                                    if (ichCur >= ichLim)
+                                        // We've reached the end of the line without finding the closing quote,
+                                        // so next line will start on this quoted field
+                                        return true;
 
                                     if (line[ichCur] == '"')
-                                        // 2 Double quotes means escaped quote
-                                        continue;
+                                    {
+                                        if (++ichCur >= ichLim)
+                                            // Last character in line was the closing quote of the field
+                                            return false;
 
-                                    // If it wasn't an escaped quote, then this is supposed to be
-                                    // the closing quote of the field, and there should only be spaces remaining
-                                    // until the next separator.
+                                        if (line[ichCur] == '"')
+                                            // 2 Double quotes means escaped quote
+                                            continue;
 
-                                    if (!_sepsContainsSpace)
-                                    {
-                                        // Ignore leading spaces
-                                        while (ichCur < ichLim && line[ichCur] == ' ')
-                                            ichCur++;
+                                        // If it wasn't an escaped quote, then this is supposed to be
+                                        // the closing quote of the field
+                                        break;
                                     }
+                                }
+                            }
+                            else
+                            {
+                                for (; ; ichCur++)
+                                {
+                                    if (ichCur >= ichLim)
+                                        // We've reached the end of the line without finding the closing quote,
+                                        // so next line will start on this quoted field
+                                        return true;
 
-                                    // If there's anything else than spaces or the next separator,
-                                    // this will actually be a QuotingError on the parser, so we decide that this
-                                    // line contains a quoting error, and so it's not going to be considered a valid field
-                                    // and the rest of the line should be ignored.
-                                    if (ichCur >= ichLim || IsSep(line[ichCur]))
-                                        return false;
+                                    if (line[ichCur] == _escapeChar)
+                                    {
+                                        if (++ichCur >= ichLim)
+                                            // Last character in line was escapeChar
+                                            return true;
 
-                                    quotingError = true;
-                                    return false;
+                                        // Whatever char comes after an escapeChar is ignored
+                                        continue;
+                                    }
+                                    else if (line[ichCur] == '"')
+                                    {
+                                        // Since this wasn't an escaped quote, then this is supposed to be
+                                        // the closing quote of the field
+                                        break;
+                                    }
                                 }
                             }
+
+                            // After finding the closing quote of the field...
+                            // There should only be empty spaces until the next separator
+                            if (!_sepsContainsSpace)
+                            {
+                                // Ignore leading spaces
+                                while (ichCur < ichLim && line[ichCur] == ' ')
+                                    ichCur++;
+                            }
+
+                            // If there's anything else than spaces or the next separator,
+                            // this will actually be a QuotingError on the parser, so we decide that this
+                            // line contains a quoting error, and so it's not going to be considered a valid field
+                            // and the rest of the line should be ignored.
+                            if (ichCur >= ichLim || IsSep(line[ichCur]))
+                                return false;
+
+                            quotingError = true;
+                            return false;
                         }
 
                         // Unquoted field case.
@@ -655,7 +695,7 @@ private void ThreadProc()
                             string path = _files.GetPathOrNull(ifile);
                             using (var rdr = _files.OpenTextReader(ifile))
                             {
-                                var multilineReader = new MultiLineReader(rdr, _separators);
+                                var multilineReader = new MultiLineReader(rdr, _separators, _escapeChar);
                                 string text;
                                 long line = 0;
                                 for (; ; )