Skip to content

Commit

Permalink
support sweeping multiline option in AutoML (#5148)
Browse files Browse the repository at this point in the history
* upgrade to 3.1

* write inline data using invariantCulture

* add tryMulti in AutoML and test

* add test for AutoML inferColumn API

* Update test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs

Co-authored-by: Justin Ormont <justinormont@users.noreply.github.com>

Co-authored-by: Justin Ormont <justinormont@users.noreply.github.com>
  • Loading branch information
LittleLittleCloud and justinormont authored May 21, 2020
1 parent c576d5e commit e3ca7e0
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 6 deletions.
3 changes: 3 additions & 0 deletions src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
Separators = new[] { splitInference.Separator.Value },
AllowSparse = splitInference.AllowSparse,
AllowQuoting = splitInference.AllowQuote,
ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
TrimWhitespace = trimWhitespace
};
Expand Down Expand Up @@ -91,6 +92,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
AllowQuoting = splitInference.AllowQuote,
AllowSparse = splitInference.AllowSparse,
Separators = new char[] { splitInference.Separator.Value },
ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
TrimWhitespace = trimWhitespace
};
Expand Down Expand Up @@ -139,6 +141,7 @@ private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext co
Separator = splitInference.Separator.Value,
AllowSparse = splitInference.AllowSparse,
AllowQuote = splitInference.AllowQuote,
ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
LabelColumnIndex = labelColumnIndex,
Label = label
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ internal sealed class Arguments
public int MaxRowsToRead;
public uint? LabelColumnIndex;
public string Label;
public bool ReadMultilines;

public Arguments()
{
Expand Down Expand Up @@ -262,6 +263,7 @@ private static InferenceResult InferTextFileColumnTypesCore(MLContext context, I
Separators = new[] { args.Separator },
AllowSparse = args.AllowSparse,
AllowQuoting = args.AllowQuote,
ReadMultilines = args.ReadMultilines,
};
var textLoader = context.Data.CreateTextLoader(textLoaderOptions);
var idv = textLoader.Load(fileSource);
Expand Down
15 changes: 10 additions & 5 deletions src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,16 @@ public class ColumnSplitResult

public bool AllowQuote { get; set; }
public bool AllowSparse { get; set; }
public bool ReadMultilines { get; set; }

public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool allowSparse, int columnCount)
/// <summary>
/// Creates a column-split result carrying the outcome flag, the chosen separator,
/// the loader flags that were in effect, and the number of columns.
/// </summary>
/// <param name="isSuccess">Value stored in <see cref="IsSuccess"/>.</param>
/// <param name="separator">Value stored in <see cref="Separator"/>; may be <c>null</c>.</param>
/// <param name="allowQuote">Value stored in <see cref="AllowQuote"/>.</param>
/// <param name="readMultilines">Value stored in <see cref="ReadMultilines"/>.</param>
/// <param name="allowSparse">Value stored in <see cref="AllowSparse"/>.</param>
/// <param name="columnCount">Value stored in <see cref="ColumnCount"/>.</param>
public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool readMultilines, bool allowSparse, int columnCount)
{
    // Assignments follow the parameter order so the two lists are easy to compare.
    IsSuccess = isSuccess;
    Separator = separator;
    AllowQuote = allowQuote;
    ReadMultilines = readMultilines;
    AllowSparse = allowSparse;
    ColumnCount = columnCount;
}
}

Expand All @@ -50,12 +52,14 @@ public static ColumnSplitResult TrySplitColumns(MLContext context, IMultiStreamS
{
var sparse = new[] { false, true };
var quote = new[] { true, false };
var tryMultiline = new[] { false, true };
var foundAny = false;
var result = default(ColumnSplitResult);
foreach (var perm in (from _allowSparse in sparse
from _allowQuote in quote
from _sep in separatorCandidates
select new { _allowSparse, _allowQuote, _sep }))
from _tryMultiline in tryMultiline
select new { _allowSparse, _allowQuote, _sep, _tryMultiline }))
{
var options = new TextLoader.Options
{
Expand All @@ -66,7 +70,8 @@ from _sep in separatorCandidates
} },
Separators = new[] { perm._sep },
AllowQuoting = perm._allowQuote,
AllowSparse = perm._allowSparse
AllowSparse = perm._allowSparse,
ReadMultilines = perm._tryMultiline,
};

if (TryParseFile(context, options, source, out result))
Expand All @@ -75,7 +80,7 @@ from _sep in separatorCandidates
break;
}
}
return foundAny ? result : new ColumnSplitResult(false, null, true, true, 0);
return foundAny ? result : new ColumnSplitResult(false, null, true, true, true, 0);
}

private static bool TryParseFile(MLContext context, TextLoader.Options options, IMultiStreamSource source,
Expand Down Expand Up @@ -111,7 +116,7 @@ private static bool TryParseFile(MLContext context, TextLoader.Options options,
// disallow single-column case
if (mostCommon.Key <= 1) { return false; }

result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.AllowSparse, mostCommon.Key);
result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.ReadMultilines, options.AllowSparse, mostCommon.Key);
return true;
}
// fail gracefully if unable to instantiate data view with swept arguments
Expand Down
37 changes: 36 additions & 1 deletion test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using FluentAssertions;
using Microsoft.ML.Data;
using Microsoft.ML.TestFramework;
using Xunit;
Expand Down Expand Up @@ -186,5 +187,39 @@ public void InferColumnsColumnInfoParam()
Assert.Equal(DefaultColumnNames.Features, result.ColumnInformation.NumericColumnNames.First());
Assert.Null(result.ColumnInformation.ExampleWeightColumnName);
}

[Fact]
public void TrySplitColumns_should_split_on_dataset_with_newline_between_double_quotes()
{
    // Arrange: build a sample from the fixture file under TestData.
    var mlContext = new MLContext();
    var datasetPath = Path.Combine("TestData", "DatasetWithNewlineBetweenQuotes.txt");
    var fileSample = TextFileSample.CreateFromFullFile(datasetPath);

    // Act: sweep the default separator candidates.
    var splitResult = TextFileContents.TrySplitColumns(
        mlContext,
        fileSample,
        TextFileContents.DefaultSeparators);

    // Assert: the split succeeds with four comma-separated columns.
    splitResult.ColumnCount.Should().Be(4);
    splitResult.Separator.Should().Be(',');
    splitResult.IsSuccess.Should().BeTrue();
}

[Fact]
public void InferColumnsFromMultilineInputFile()
{
    // Check if we can infer the column information
    // from an input file which has escaped newlines inside quotes
    var dataPath = GetDataPath("multiline.csv");
    MLContext mlContext = new MLContext();
    var inputColumnInformation = new ColumnInformation();
    inputColumnInformation.LabelColumnName = @"id";
    var result = mlContext.Auto().InferColumns(dataPath, inputColumnInformation);

    // File has 3 columns: "id", "description" and "animal"
    Assert.NotNull(result.ColumnInformation.LabelColumnName);

    // Assert.Single checks for exactly one element and returns it, and on failure
    // it reports the collection rather than just a count mismatch.
    var textColumnName = Assert.Single(result.ColumnInformation.TextColumnNames);
    var categoricalColumnName = Assert.Single(result.ColumnInformation.CategoricalColumnNames);

    Assert.Equal("id", result.ColumnInformation.LabelColumnName);
    Assert.Equal("description", textColumnName);
    Assert.Equal("animal", categoricalColumnName);
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@
</ItemGroup>

<ItemGroup>
<PackageReference Include="FluentAssertions" Version="5.10.3" />
<PackageReference Include="SciSharp.TensorFlow.Redist" Version="$(TensorFlowVersion)" />
</ItemGroup>

<ItemGroup>
<None Update="TestData\DatasetWithNewlineBetweenQuotes.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="TestData\DatasetWithDefaultColumnNames.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
id,Column1,Column2,Column3
1,this is a description, 1,2
2,"this is a quote description",1,2
3,"this is a quote description with double quote("")",1,2
4,"this is a quote description with ""a pair of double quote""",1,2
5,"this is a quote description with new line
quote",1,2
6,"this is a quote description with
new line1 and
new line2 and empty line

and double quote""",1,2
7, this is a description with single quote("),1,2
// empty line between quotes
8,"",1,2
// single quote between quotes
9,"""",1,2
// simply newline between quotes
10,"



",1,2
// simply signle quote and newline between quotes
11,"

""""

""

""

",1,2



0 comments on commit e3ca7e0

Please sign in to comment.