Skip to content

Commit 27141e3

Browse files
authored
Added decimal marker option in TextLoader (#5145)
* Added decimal marker option in TextLoader * Added decimalChar to more TextLoader constructors * Removed decimalMarker from TextLoader constructors due to API breaking * Added unit test for ',' as a decimal marker, and added decimalMarker to TextLoaderCursor and TextLoaderParser * Added DecimalMarker in DoubleParser * Added decimal marker check and removed decimalMarker from CreateTextLoader's constructor * Added TextLoader decimalMarker unit tests, and refined logic in DoubleParser * Refine tests, logic, csv dataset * nit fix * Compressed tests using <T>
1 parent e3ca7e0 commit 27141e3

File tree

6 files changed

+523
-5
lines changed

6 files changed

+523
-5
lines changed

src/Microsoft.ML.Core/Utilities/DoubleParser.cs

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,21 @@ internal static class DoubleParser
1717
private const ulong TopThreeBits = 0xE000000000000000UL;
1818
private const char InfinitySymbol = '\u221E';
1919

20+
// Note for future development: DoubleParser is a static class and DecimalMarker is a
21+
// static variable, which means only one instance of these can exist at once. As such,
22+
// the value of DecimalMarker cannot vary when datasets with differing decimal markers
23+
// are loaded together at once, which would result in not being able to accurately read
24+
// the dataset with the differing decimal marker. Although this edge case where we attempt
25+
// to load in datasets with different decimal markers at once is unlikely to occur, we
26+
// should still be aware of this and plan to fix it in the future.
27+
28+
// The decimal marker that separates the integer part from the fractional part of a number
29+
// written in decimal form can vary across different cultures as either '.' or ','. The
30+
// default decimal marker in ML.NET is '.'; however, through this static char variable,
31+
// we allow users to specify the decimal marker used in their datasets as ',' as well.
32+
[BestFriend]
33+
internal static char DecimalMarker = '.';
34+
2035
// REVIEW: casting ulong to Double doesn't always do the right thing, for example
2136
// with 0x84595161401484A0UL. Hence the gymnastics several places in this code. Note that
2237
// long to Double does work. The work around is:
@@ -555,6 +570,12 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool
555570
break;
556571

557572
case '.':
573+
if (DecimalMarker != '.') // Decimal marker was not '.', but we encountered a '.', which must be an error.
574+
return false; // Since this was an error, return false, which will later cause the caller to set NaN as the out value.
575+
goto LPoint;
576+
case ',':
577+
if (DecimalMarker != ',') // Same logic as above.
578+
return false;
558579
goto LPoint;
559580

560581
// The common cases.
@@ -571,7 +592,7 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool
571592
break;
572593
}
573594

574-
// Get digits before '.'
595+
// Get digits before the decimal marker, which may be '.' or ','
575596
uint d;
576597
for (; ; )
577598
{
@@ -593,14 +614,14 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool
593614
}
594615
Contracts.Assert(i < span.Length);
595616

596-
if (span[i] != '.')
617+
if (span[i] != DecimalMarker)
597618
goto LAfterDigits;
598619

599620
LPoint:
600621
Contracts.Assert(i < span.Length);
601-
Contracts.Assert(span[i] == '.');
622+
Contracts.Assert(span[i] == DecimalMarker);
602623

603-
// Get the digits after '.'
624+
// Get the digits after the decimal marker, which may be '.' or ','
604625
for (; ; )
605626
{
606627
if (++i >= span.Length)

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,12 @@ public class Options
474474
[Argument(ArgumentType.AtMostOnce, Name = nameof(Separator), Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly, HelpText = "Source column separator.", ShortName = "sep")]
475475
public char[] Separators = new[] { Defaults.Separator };
476476

477+
/// <summary>
478+
/// The character that should be used as the decimal marker. Default value is '.'. Only '.' and ',' are allowed to be decimal markers.
479+
/// </summary>
480+
[Argument(ArgumentType.AtMostOnce, Name = "Decimal Marker", HelpText = "Character symbol used to separate the integer part from the fractional part of a number written in decimal form.", ShortName = "decimal")]
481+
public char DecimalMarker = Defaults.DecimalMarker;
482+
477483
/// <summary>
478484
/// Specifies the input columns that should be mapped to <see cref="IDataView"/> columns.
479485
/// </summary>
@@ -541,6 +547,7 @@ internal static class Defaults
541547
internal const bool AllowQuoting = false;
542548
internal const bool AllowSparse = false;
543549
internal const char Separator = '\t';
550+
internal const char DecimalMarker = '.';
544551
internal const bool HasHeader = false;
545552
internal const bool TrimWhitespace = false;
546553
internal const bool ReadMultilines = false;
@@ -1071,7 +1078,7 @@ private static VersionInfo GetVersionInfo()
10711078
//verWrittenCur: 0x0001000A, // Added ForceVector in Range
10721079
//verWrittenCur: 0x0001000B, // Header now retained if used and present
10731080
//verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
1074-
verWrittenCur: 0x0001000D, // Added escapeChar option
1081+
verWrittenCur: 0x0001000D, // Added escapeChar option and decimal marker option to allow for ',' to be a decimal marker
10751082
verReadableCur: 0x0001000A,
10761083
verWeCanReadBack: 0x00010009,
10771084
loaderSignature: LoaderSignature,
@@ -1103,6 +1110,7 @@ private enum OptionFlags : uint
11031110
// Input size is zero for unknown - determined by the data (including sparse rows).
11041111
private readonly int _inputSize;
11051112
private readonly char[] _separators;
1113+
private readonly char _decimalMarker;
11061114
private readonly Bindings _bindings;
11071115

11081116
private readonly Parser _parser;
@@ -1219,6 +1227,11 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
12191227
}
12201228
}
12211229

1230+
if (options.DecimalMarker != '.' && options.DecimalMarker != ',')
1231+
throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker cannot be the '{0}' character. It must be '.' or ','.", options.DecimalMarker);
1232+
if (!options.AllowQuoting && options.DecimalMarker == ',' && _separators.Contains(','))
1233+
throw _host.ExceptUserArg(nameof(Options.AllowQuoting), "Quoting must be allowed if decimal marker and separator are the ',' character.");
1234+
_decimalMarker = options.DecimalMarker;
12221235
_escapeChar = options.EscapeChar;
12231236
if(_separators.Contains(_escapeChar))
12241237
throw _host.ExceptUserArg(nameof(Options.EscapeChar), "EscapeChar '{0}' can't be used both as EscapeChar and separator", _escapeChar);
@@ -1387,6 +1400,7 @@ private TextLoader(IHost host, ModelLoadContext ctx)
13871400
// int: number of separators
13881401
// char[]: separators
13891402
// char: escapeChar
1403+
// char: decimal marker
13901404
// bindings
13911405
int cbFloat = ctx.Reader.ReadInt32();
13921406
host.CheckDecode(cbFloat == sizeof(float));
@@ -1414,10 +1428,13 @@ private TextLoader(IHost host, ModelLoadContext ctx)
14141428
if (ctx.Header.ModelVerWritten >= 0x0001000D)
14151429
{
14161430
_escapeChar = ctx.Reader.ReadChar();
1431+
_decimalMarker = ctx.Reader.ReadChar();
1432+
host.CheckDecode(_decimalMarker == '.' || _decimalMarker == ',');
14171433
}
14181434
else
14191435
{
14201436
_escapeChar = Defaults.EscapeChar;
1437+
_decimalMarker = Defaults.DecimalMarker;
14211438
}
14221439

14231440
host.CheckDecode(!_separators.Contains(_escapeChar));
@@ -1463,6 +1480,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
14631480
// int: number of separators
14641481
// char[]: separators
14651482
// char: escapeChar
1483+
// char: decimal marker
14661484
// bindings
14671485
ctx.Writer.Write(sizeof(float));
14681486
ctx.Writer.Write(_maxRows);
@@ -1472,6 +1490,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
14721490
ctx.Writer.Write(_inputSize);
14731491
ctx.Writer.WriteCharArray(_separators);
14741492
ctx.Writer.Write(_escapeChar);
1493+
ctx.Writer.Write(_decimalMarker);
14751494

14761495
_bindings.Save(ctx);
14771496
}
@@ -1612,13 +1631,15 @@ public BoundLoader(TextLoader loader, IMultiStreamSource files)
16121631
public DataViewRowCursor GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand = null)
16131632
{
16141633
_host.CheckValueOrNull(rand);
1634+
DoubleParser.DecimalMarker = _loader._decimalMarker;
16151635
var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded);
16161636
return Cursor.Create(_loader, _files, active);
16171637
}
16181638

16191639
public DataViewRowCursor[] GetRowCursorSet(IEnumerable<DataViewSchema.Column> columnsNeeded, int n, Random rand = null)
16201640
{
16211641
_host.CheckValueOrNull(rand);
1642+
DoubleParser.DecimalMarker = _loader._decimalMarker;
16221643
var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded);
16231644
return Cursor.CreateSet(_loader, _files, active, n);
16241645
}

test/BaselineOutput/Common/EntryPoints/core_manifest.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,18 @@
369369
"\t"
370370
]
371371
},
372+
{
373+
"Name": "Decimal Marker",
374+
"Type": "Char",
375+
"Desc": "Character symbol used to separate the integer part from the fractional part of a number written in decimal form.",
376+
"Aliases": [
377+
"decimal"
378+
],
379+
"Required": false,
380+
"SortOrder": 150.0,
381+
"IsNullable": false,
382+
"Default": "."
383+
},
372384
{
373385
"Name": "TrimWhitespace",
374386
"Type": "Bool",

test/Microsoft.ML.Tests/TextLoaderTests.cs

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,168 @@ public void TestTextLoaderBackCompat_VerWritt_0x0001000C()
840840
Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString());
841841
}
842842

843+
[Theory]
844+
[InlineData(true)]
845+
[InlineData(false)]
846+
public void TestCommaAsDecimalMarker(bool useCsvVersion)
847+
{
848+
// When useCsvVersion == false:
849+
// Datasets iris.txt and iris-decimal-marker-as-comma.txt are the exact same, except for their
850+
// decimal markers. Decimal marker in iris.txt is '.', and ',' in iris-decimal-marker-as-comma.txt.
851+
852+
// When useCsvVersion == true:
853+
// Check to confirm TextLoader can read data from a CSV file where the separator is ',', decimals
854+
// are enclosed with quotes, and with the decimal marker being ','.
855+
856+
// Do these checks with both float and double as types of features being read, to test decimal marker
857+
// recognition with both doubles and floats.
858+
TestCommaAsDecimalMarkerHelper<float>(useCsvVersion);
859+
TestCommaAsDecimalMarkerHelper<double>(useCsvVersion);
860+
}
861+
862+
private void TestCommaAsDecimalMarkerHelper<T>(bool useCsvVersion)
863+
{
864+
// Datasets iris.txt and iris-decimal-marker-as-comma.txt are the exact same, except for their
865+
// decimal markers. Decimal marker in iris.txt is '.', and ',' in iris-decimal-marker-as-comma.txt.
866+
// Datasets iris.txt and iris-decimal-marker-as-comma.csv have the exact same data, however the .csv
867+
// version has ',' as decimal marker and separator, and feature values are enclosed with quotes.
868+
// T varies as either float or double, so that decimal markers can be tested for both floating
869+
// point value types.
870+
var mlContext = new MLContext(seed: 1);
871+
872+
// Read dataset with period as decimal marker.
873+
string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt");
874+
var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options()
875+
{
876+
Columns = new[]
877+
{
878+
new TextLoader.Column("Label", DataKind.UInt32, 0),
879+
new TextLoader.Column("Features", typeof(T) == typeof(double) ? DataKind.Double : DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
880+
},
881+
DecimalMarker = '.'
882+
});
883+
var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerPeriod));
884+
885+
// Load values from iris.txt
886+
DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema;
887+
using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod);
888+
UInt32 labelPeriod = default;
889+
ValueGetter<UInt32> labelDelegatePeriod = cursorPeriod.GetGetter<UInt32>(columnsPeriod[0]);
890+
VBuffer<T> featuresPeriod = default;
891+
ValueGetter<VBuffer<T>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<T>>(columnsPeriod[1]);
892+
893+
// Iterate over each row and save labels and features to array for future comparison
894+
int count = 0;
895+
UInt32[] labels = new uint[150];
896+
T[][] features = new T[150][];
897+
while (cursorPeriod.MoveNext())
898+
{
899+
//Get values from respective columns
900+
labelDelegatePeriod(ref labelPeriod);
901+
featuresDelegatePeriod(ref featuresPeriod);
902+
labels[count] = labelPeriod;
903+
features[count] = featuresPeriod.GetValues().ToArray();
904+
count++;
905+
}
906+
907+
// Read dataset with comma as decimal marker.
908+
// Dataset is either the .csv version or the .txt version.
909+
string dataPathDecimalMarkerComma;
910+
TextLoader.Options options = new TextLoader.Options()
911+
{
912+
Columns = new[]
913+
{
914+
new TextLoader.Column("Label", DataKind.UInt32, 0),
915+
new TextLoader.Column("Features", typeof(T) == typeof(double) ? DataKind.Double : DataKind.Single, new [] { new TextLoader.Range(1, 4) })
916+
},
917+
};
918+
// Set TextLoader.Options for the .csv or .txt cases.
919+
if (useCsvVersion)
920+
{
921+
dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.csv");
922+
options.DecimalMarker = ',';
923+
options.Separator = ",";
924+
options.AllowQuoting = true;
925+
options.HasHeader = true;
926+
}
927+
else
928+
{
929+
dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
930+
options.DecimalMarker = ',';
931+
}
932+
var readerDecimalMarkerComma = new TextLoader(mlContext, options);
933+
var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma));
934+
935+
// Load values from dataset with comma as decimal marker
936+
DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema;
937+
using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma);
938+
UInt32 labelComma = default;
939+
ValueGetter<UInt32> labelDelegateComma = cursorComma.GetGetter<UInt32>(columnsComma[0]);
940+
VBuffer<T> featuresComma = default;
941+
ValueGetter<VBuffer<T>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<T>>(columnsComma[1]);
942+
943+
// Check values from dataset with comma as decimal marker match those in iris.txt (period decimal marker)
944+
count = 0;
945+
while (cursorComma.MoveNext())
946+
{
947+
//Get values from respective columns
948+
labelDelegateComma(ref labelComma);
949+
featuresDelegateComma(ref featuresComma);
950+
Assert.Equal(labels[count], labelComma);
951+
Assert.Equal(features[count], featuresComma.GetValues().ToArray());
952+
count++;
953+
}
954+
}
955+
956+
[Theory]
957+
[InlineData(true)]
958+
[InlineData(false)]
959+
public void TestWrongDecimalMarkerInputs(bool useCommaAsDecimalMarker)
960+
{
961+
// When DecimalMarker does not match the actual decimal marker used in the dataset,
962+
// we obtain values of NaN. Check that the values are indeed NaN in this case.
963+
// Do this check for both cases where decimal markers in the dataset are '.' and ','.
964+
var mlContext = new MLContext(seed: 1);
965+
966+
// Try reading a dataset where '.' is the actual decimal marker, but DecimalMarker = ',',
967+
// and vice versa.
968+
string dataPath;
969+
TextLoader.Options options = new TextLoader.Options()
970+
{
971+
Columns = new[]
972+
{
973+
new TextLoader.Column("Label", DataKind.UInt32, 0),
974+
new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) })
975+
},
976+
};
977+
if (useCommaAsDecimalMarker)
978+
{
979+
dataPath = GetDataPath("iris.txt"); // Has '.' as decimal marker inside dataset
980+
options.DecimalMarker = ','; // Choose wrong decimal marker on purpose
981+
}
982+
else
983+
{
984+
dataPath = GetDataPath("iris-decimal-marker-as-comma.txt"); // Has ',' as decimal marker inside dataset
985+
options.DecimalMarker = '.'; // Choose wrong decimal marker on purpose
986+
}
987+
var reader = new TextLoader(mlContext, options);
988+
var textData = reader.Load(GetDataPath(dataPath));
989+
990+
// Check that the features being loaded are NaN.
991+
DataViewSchema columns = textData.Schema;
992+
using DataViewRowCursor cursor = textData.GetRowCursor(columns);
993+
VBuffer<Single> featuresPeriod = default;
994+
ValueGetter<VBuffer<Single>> featuresDelegatePeriod = cursor.GetGetter<VBuffer<Single>>(columns[1]);
995+
996+
// Iterate over each row and check that feature values are NaN.
997+
while (cursor.MoveNext())
998+
{
999+
featuresDelegatePeriod.Invoke(ref featuresPeriod);
1000+
foreach(float feature in featuresPeriod.GetValues().ToArray())
1001+
Assert.Equal(feature, Single.NaN);
1002+
}
1003+
}
1004+
8431005
private class IrisNoFields
8441006
{
8451007
}

0 commit comments

Comments
 (0)