-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Created DoubleParser.OptionFlags to be used by TextLoader #5154
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
cce5ec4
c059e2c
63e5c62
efaf972
1dde1ab
6a5567d
8c5cf1f
0266f3f
e63ea52
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,26 +12,24 @@ namespace Microsoft.ML.Internal.Utilities | |
[BestFriend] | ||
internal static class DoubleParser | ||
{ | ||
[BestFriend] | ||
[Flags] | ||
internal enum OptionFlags : uint | ||
{ | ||
Default = 0x00, | ||
|
||
// If this flag is set, then a "," will be used as Decimal Marker | ||
// (i.e., the punctuation mark that separates the integer part of | ||
// a number and its decimal part). If this isn't set, then | ||
// default behavior is to use "." as decimal marker. | ||
UseCommaAsDecimalMarker = 0x01, | ||
} | ||
antoniovs1029 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
private const ulong TopBit = 0x8000000000000000UL; | ||
private const ulong TopTwoBits = 0xC000000000000000UL; | ||
private const ulong TopThreeBits = 0xE000000000000000UL; | ||
private const char InfinitySymbol = '\u221E'; | ||
|
||
// Note for future development: DoubleParser is a static class and DecimalMarker is a | ||
// static variable, which means only one instance of these can exist at once. As such, | ||
// the value of DecimalMarker cannot vary when datasets with differing decimal markers | ||
// are loaded together at once, which would result in not being able to accurately read | ||
// the dataset with the differing decimal marker. Although this edge case where we attempt | ||
// to load in datasets with different decimal markers at once is unlikely to occur, we | ||
// should still be aware of this and plan to fix it in the future. | ||
|
||
// The decimal marker that separates the integer part from the fractional part of a number | ||
// written in decimal from can vary across different cultures as either '.' or ','. The | ||
// default decimal marker in ML .NET is '.', however through this static char variable, | ||
// we allow users to specify the decimal marker used in their datasets as ',' as well. | ||
[BestFriend] | ||
internal static char DecimalMarker = '.'; | ||
|
||
// REVIEW: casting ulong to Double doesn't always do the right thing, for example | ||
// with 0x84595161401484A0UL. Hence the gymnastics several places in this code. Note that | ||
// long to Double does work. The work around is: | ||
|
@@ -85,24 +83,24 @@ public enum Result | |
/// <summary> | ||
/// This produces zero for an empty string. | ||
/// </summary> | ||
public static bool TryParse(ReadOnlySpan<char> span, out Single value) | ||
public static bool TryParse(ReadOnlySpan<char> span, out Single value, OptionFlags flags = OptionFlags.Default) | ||
{ | ||
var res = Parse(span, out value); | ||
var res = Parse(span, out value, flags); | ||
Contracts.Assert(res != Result.Empty || value == 0); | ||
return res <= Result.Empty; | ||
} | ||
|
||
/// <summary> | ||
/// This produces zero for an empty string. | ||
/// </summary> | ||
public static bool TryParse(ReadOnlySpan<char> span, out Double value) | ||
public static bool TryParse(ReadOnlySpan<char> span, out Double value, OptionFlags flags = OptionFlags.Default) | ||
{ | ||
var res = Parse(span, out value); | ||
var res = Parse(span, out value, flags); | ||
Contracts.Assert(res != Result.Empty || value == 0); | ||
return res <= Result.Empty; | ||
} | ||
|
||
public static Result Parse(ReadOnlySpan<char> span, out Single value) | ||
public static Result Parse(ReadOnlySpan<char> span, out Single value, OptionFlags flags = OptionFlags.Default) | ||
{ | ||
int ich = 0; | ||
for (; ; ich++) | ||
|
@@ -133,7 +131,7 @@ public static Result Parse(ReadOnlySpan<char> span, out Single value) | |
} | ||
|
||
int ichEnd; | ||
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd)) | ||
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd, flags)) | ||
{ | ||
value = default(Single); | ||
return Result.Error; | ||
|
@@ -150,7 +148,7 @@ public static Result Parse(ReadOnlySpan<char> span, out Single value) | |
return Result.Good; | ||
} | ||
|
||
public static Result Parse(ReadOnlySpan<char> span, out Double value) | ||
public static Result Parse(ReadOnlySpan<char> span, out Double value, OptionFlags flags = OptionFlags.Default) | ||
{ | ||
int ich = 0; | ||
for (; ; ich++) | ||
|
@@ -181,7 +179,7 @@ public static Result Parse(ReadOnlySpan<char> span, out Double value) | |
} | ||
|
||
int ichEnd; | ||
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd)) | ||
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd, flags)) | ||
{ | ||
value = default(Double); | ||
return Result.Error; | ||
|
@@ -198,14 +196,14 @@ public static Result Parse(ReadOnlySpan<char> span, out Double value) | |
return Result.Good; | ||
} | ||
|
||
public static bool TryParse(ReadOnlySpan<char> span, out Single value, out int ichEnd) | ||
public static bool TryParse(ReadOnlySpan<char> span, out Single value, out int ichEnd, OptionFlags flags = OptionFlags.Default) | ||
{ | ||
bool neg = false; | ||
ulong num = 0; | ||
long exp = 0; | ||
|
||
ichEnd = 0; | ||
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp)) | ||
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp, flags)) | ||
return TryParseSpecial(span, ref ichEnd, out value); | ||
|
||
if (num == 0) | ||
|
@@ -287,14 +285,14 @@ public static bool TryParse(ReadOnlySpan<char> span, out Single value, out int i | |
return true; | ||
} | ||
|
||
public static bool TryParse(ReadOnlySpan<char> span, out Double value, out int ichEnd) | ||
public static bool TryParse(ReadOnlySpan<char> span, out Double value, out int ichEnd, OptionFlags flags = OptionFlags.Default) | ||
{ | ||
bool neg = false; | ||
ulong num = 0; | ||
long exp = 0; | ||
|
||
ichEnd = 0; | ||
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp)) | ||
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp, flags)) | ||
return TryParseSpecial(span, ref ichEnd, out value); | ||
|
||
if (num == 0) | ||
|
@@ -535,13 +533,19 @@ private static bool TryParseSpecial(ReadOnlySpan<char> span, ref int ich, out Si | |
return false; | ||
} | ||
|
||
private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool neg, ref ulong num, ref long exp) | ||
private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool neg, ref ulong num, ref long exp, OptionFlags flags = OptionFlags.Default) | ||
{ | ||
Contracts.Assert(0 <= ich & ich <= span.Length); | ||
Contracts.Assert(!neg); | ||
Contracts.Assert(num == 0); | ||
Contracts.Assert(exp == 0); | ||
|
||
char decimalMarker; | ||
if ((flags & OptionFlags.UseCommaAsDecimalMarker) != 0) | ||
decimalMarker = ','; | ||
else | ||
decimalMarker = '.'; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why as a flag Pandas lets users enter the char directly, see They also allow for entering the char for Excel also lets the user enter the chars directly: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The public API to let users change the decimal marker was implemented and decided on #5145. We accept a character as the DecimalMarker option, but we actually only accept "," or "." and throw exceptions if the user sets other character. On this PR I am not changing that behavior, but actually only changing how we connect the TextLoader to the DoubleParser (as the way we connected them wasn't thread safe, and would have unexpected behavior in some cases). Since we only accept "," or ".", then I used a Flag here. Since this flag is of internal use (i.e. it isn't determining a public API), it could later be replaced by an options object or something else if we get to accept other characters/strings as decimal separators. In any case, if we were to accept other characters different from "," and ".", then we'd actually need to change other things on the parser and on textloader to make this work correctly... so perhaps that could be left for future work, since I don't think we'd have time to correctly implement and test those other options before the next release. So, I think it's better to simply merge this PR to enable users to use "," and "." as decimal markers in a safe manner in the upcoming release, and then discuss about the possibility of enabling other characters. As for the thousands parameter you mention, it is included inside a list of possible features that we might want to implement for TextLoader, but it isn't in our plans to implement it before the upcoming release. In reply to: 429606655 [](ancestors = 429606655) |
||
|
||
if (ich >= span.Length) | ||
return false; | ||
|
||
|
@@ -570,11 +574,11 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool | |
break; | ||
|
||
case '.': | ||
if (DecimalMarker != '.') // Decimal marker was not '.', but we encountered a '.', which must be an error. | ||
if (decimalMarker != '.') // Decimal marker was not '.', but we encountered a '.', which must be an error. | ||
return false; // Since this was an error, return false, which will later make the caller to set NaN as the out value. | ||
goto LPoint; | ||
case ',': | ||
if (DecimalMarker != ',') // Same logic as above. | ||
if (decimalMarker != ',') // Same logic as above. | ||
return false; | ||
goto LPoint; | ||
|
||
|
@@ -614,12 +618,12 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool | |
} | ||
Contracts.Assert(i < span.Length); | ||
|
||
if (span[i] != DecimalMarker) | ||
if (span[i] != decimalMarker) | ||
goto LAfterDigits; | ||
|
||
LPoint: | ||
Contracts.Assert(i < span.Length); | ||
Contracts.Assert(span[i] == DecimalMarker); | ||
Contracts.Assert(span[i] == decimalMarker); | ||
|
||
// Get the digits after the decimal marker, which may be '.' or ',' | ||
for (; ; ) | ||
|
Uh oh!
There was an error while loading. Please reload this page.