Skip to content

Commit e3ca7e0

Browse files
support sweeping multiline option in AutoML (#5148)
* upgrade to 3.1
* write inline data using invariantCulture
* add tryMulti in AutoML and test
* add test for AutoML inferColumn API
* Update test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs

Co-authored-by: Justin Ormont <justinormont@users.noreply.github.com>
Co-authored-by: Justin Ormont <justinormont@users.noreply.github.com>
1 parent c576d5e commit e3ca7e0

File tree

6 files changed

+91
-6
lines changed

6 files changed

+91
-6
lines changed

src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
5555
Separators = new[] { splitInference.Separator.Value },
5656
AllowSparse = splitInference.AllowSparse,
5757
AllowQuoting = splitInference.AllowQuote,
58+
ReadMultilines = splitInference.ReadMultilines,
5859
HasHeader = hasHeader,
5960
TrimWhitespace = trimWhitespace
6061
};
@@ -91,6 +92,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
9192
AllowQuoting = splitInference.AllowQuote,
9293
AllowSparse = splitInference.AllowSparse,
9394
Separators = new char[] { splitInference.Separator.Value },
95+
ReadMultilines = splitInference.ReadMultilines,
9496
HasHeader = hasHeader,
9597
TrimWhitespace = trimWhitespace
9698
};
@@ -139,6 +141,7 @@ private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext co
139141
Separator = splitInference.Separator.Value,
140142
AllowSparse = splitInference.AllowSparse,
141143
AllowQuote = splitInference.AllowQuote,
144+
ReadMultilines = splitInference.ReadMultilines,
142145
HasHeader = hasHeader,
143146
LabelColumnIndex = labelColumnIndex,
144147
Label = label

src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ internal sealed class Arguments
3232
public int MaxRowsToRead;
3333
public uint? LabelColumnIndex;
3434
public string Label;
35+
public bool ReadMultilines;
3536

3637
public Arguments()
3738
{
@@ -262,6 +263,7 @@ private static InferenceResult InferTextFileColumnTypesCore(MLContext context, I
262263
Separators = new[] { args.Separator },
263264
AllowSparse = args.AllowSparse,
264265
AllowQuoting = args.AllowQuote,
266+
ReadMultilines = args.ReadMultilines,
265267
};
266268
var textLoader = context.Data.CreateTextLoader(textLoaderOptions);
267269
var idv = textLoader.Load(fileSource);

src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,16 @@ public class ColumnSplitResult
2323

2424
public bool AllowQuote { get; set; }
2525
public bool AllowSparse { get; set; }
26+
public bool ReadMultilines { get; set; }
2627

27-
public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool allowSparse, int columnCount)
28+
public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool readMultilines, bool allowSparse, int columnCount)
2829
{
2930
IsSuccess = isSuccess;
3031
Separator = separator;
3132
AllowQuote = allowQuote;
3233
AllowSparse = allowSparse;
3334
ColumnCount = columnCount;
35+
ReadMultilines = readMultilines;
3436
}
3537
}
3638

@@ -50,12 +52,14 @@ public static ColumnSplitResult TrySplitColumns(MLContext context, IMultiStreamS
5052
{
5153
var sparse = new[] { false, true };
5254
var quote = new[] { true, false };
55+
var tryMultiline = new[] { false, true };
5356
var foundAny = false;
5457
var result = default(ColumnSplitResult);
5558
foreach (var perm in (from _allowSparse in sparse
5659
from _allowQuote in quote
5760
from _sep in separatorCandidates
58-
select new { _allowSparse, _allowQuote, _sep }))
61+
from _tryMultiline in tryMultiline
62+
select new { _allowSparse, _allowQuote, _sep, _tryMultiline }))
5963
{
6064
var options = new TextLoader.Options
6165
{
@@ -66,7 +70,8 @@ from _sep in separatorCandidates
6670
} },
6771
Separators = new[] { perm._sep },
6872
AllowQuoting = perm._allowQuote,
69-
AllowSparse = perm._allowSparse
73+
AllowSparse = perm._allowSparse,
74+
ReadMultilines = perm._tryMultiline,
7075
};
7176

7277
if (TryParseFile(context, options, source, out result))
@@ -75,7 +80,7 @@ from _sep in separatorCandidates
7580
break;
7681
}
7782
}
78-
return foundAny ? result : new ColumnSplitResult(false, null, true, true, 0);
83+
return foundAny ? result : new ColumnSplitResult(false, null, true, true, true, 0);
7984
}
8085

8186
private static bool TryParseFile(MLContext context, TextLoader.Options options, IMultiStreamSource source,
@@ -111,7 +116,7 @@ private static bool TryParseFile(MLContext context, TextLoader.Options options,
111116
// disallow single-column case
112117
if (mostCommon.Key <= 1) { return false; }
113118

114-
result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.AllowSparse, mostCommon.Key);
119+
result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.ReadMultilines, options.AllowSparse, mostCommon.Key);
115120
return true;
116121
}
117122
// fail gracefully if unable to instantiate data view with swept arguments

test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
using System.Collections.Generic;
33
using System.IO;
44
using System.Linq;
5+
using FluentAssertions;
56
using Microsoft.ML.Data;
67
using Microsoft.ML.TestFramework;
78
using Xunit;
@@ -186,5 +187,39 @@ public void InferColumnsColumnInfoParam()
186187
Assert.Equal(DefaultColumnNames.Features, result.ColumnInformation.NumericColumnNames.First());
187188
Assert.Null(result.ColumnInformation.ExampleWeightColumnName);
188189
}
190+
191+
[Fact]
192+
public void TrySplitColumns_should_split_on_dataset_with_newline_between_double_quotes()
193+
{
194+
var context = new MLContext();
195+
var dataset = Path.Combine("TestData", "DatasetWithNewlineBetweenQuotes.txt");
196+
var sample = TextFileSample.CreateFromFullFile(dataset);
197+
var result = TextFileContents.TrySplitColumns(context, sample, TextFileContents.DefaultSeparators);
198+
199+
result.ColumnCount.Should().Be(4);
200+
result.Separator.Should().Be(',');
201+
result.IsSuccess.Should().BeTrue();
202+
}
203+
204+
[Fact]
205+
public void InferColumnsFromMultilineInputFile()
206+
{
207+
// Check if we can infer the column information
208+
// from an input file which has escaped newlines inside quotes
209+
var dataPath = GetDataPath("multiline.csv");
210+
MLContext mlContext = new MLContext();
211+
var inputColumnInformation = new ColumnInformation();
212+
inputColumnInformation.LabelColumnName = @"id";
213+
var result = mlContext.Auto().InferColumns(dataPath, inputColumnInformation);
214+
215+
// File has 3 columns: "id", "description" and "animal"
216+
Assert.NotNull(result.ColumnInformation.LabelColumnName);
217+
Assert.Equal(1, result.ColumnInformation.TextColumnNames.Count);
218+
Assert.Equal(1, result.ColumnInformation.CategoricalColumnNames.Count);
219+
220+
Assert.Equal("id", result.ColumnInformation.LabelColumnName);
221+
Assert.Equal("description", result.ColumnInformation.TextColumnNames.First());
222+
Assert.Equal("animal", result.ColumnInformation.CategoricalColumnNames.First());
223+
}
189224
}
190-
}
225+
}

test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,14 @@
77
</ItemGroup>
88

99
<ItemGroup>
10+
<PackageReference Include="FluentAssertions" Version="5.10.3" />
1011
<PackageReference Include="SciSharp.TensorFlow.Redist" Version="$(TensorFlowVersion)" />
1112
</ItemGroup>
1213

1314
<ItemGroup>
15+
<None Update="TestData\DatasetWithNewlineBetweenQuotes.txt">
16+
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
17+
</None>
1418
<None Update="TestData\DatasetWithDefaultColumnNames.txt">
1519
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
1620
</None>
test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
id,Column1,Column2,Column3
2+
1,this is a description, 1,2
3+
2,"this is a quote description",1,2
4+
3,"this is a quote description with double quote("")",1,2
5+
4,"this is a quote description with ""a pair of double quote""",1,2
6+
5,"this is a quote description with new line
7+
quote",1,2
8+
6,"this is a quote description with
9+
new line1 and
10+
new line2 and empty line
11+
12+
and double quote""",1,2
13+
7, this is a description with single quote("),1,2
14+
// empty line between quotes
15+
8,"",1,2
16+
// single quote between quotes
17+
9,"""",1,2
18+
// simply newline between quotes
19+
10,"
20+
21+
22+
23+
",1,2
24+
// simply single quote and newline between quotes
25+
11,"
26+
27+
""""
28+
29+
""
30+
31+
""
32+
33+
",1,2
34+
35+
36+

0 commit comments

Comments
 (0)