Skip to content

Created DoubleParser.OptionFlags to be used by TextLoader #5154

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
May 24, 2020
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ public bool HasAllBooleanValues()
bool value;
// (note: Conversions.Instance.TryParse parses an empty string as a Boolean)
return !string.IsNullOrEmpty(x.ToString()) &&
Conversions.Instance.TryParse(in x, out value);
Conversions.DefaultInstance.TryParse(in x, out value);
}))
{
return true;
Expand Down Expand Up @@ -164,7 +164,7 @@ public void Apply(IntermediateColumn[] columns)
col.SuggestedType = BooleanDataViewType.Instance;
bool first;

col.HasHeader = !Conversions.Instance.TryParse(in col.RawData[0], out first);
col.HasHeader = !Conversions.DefaultInstance.TryParse(in col.RawData[0], out first);
}
}
}
Expand All @@ -179,7 +179,7 @@ public void Apply(IntermediateColumn[] columns)
.All(x =>
{
float value;
return Conversions.Instance.TryParse(in x, out value);
return Conversions.DefaultInstance.TryParse(in x, out value);
})
)
{
Expand Down
68 changes: 36 additions & 32 deletions src/Microsoft.ML.Core/Utilities/DoubleParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,24 @@ namespace Microsoft.ML.Internal.Utilities
[BestFriend]
internal static class DoubleParser
{
[BestFriend]
[Flags]
internal enum OptionFlags : uint
{
Default = 0x00,

// If this flag is set, then a "," will be used as Decimal Marker
// (i.e., the punctuation mark that separates the integer part of
// a number and its decimal part). If this isn't set, then
// default behavior is to use "." as decimal marker.
UseCommaAsDecimalMarker = 0x01,
}

private const ulong TopBit = 0x8000000000000000UL;
private const ulong TopTwoBits = 0xC000000000000000UL;
private const ulong TopThreeBits = 0xE000000000000000UL;
private const char InfinitySymbol = '\u221E';

// Note for future development: DoubleParser is a static class and DecimalMarker is a
// static variable, which means only one instance of these can exist at once. As such,
// the value of DecimalMarker cannot vary when datasets with differing decimal markers
// are loaded together at once, which would result in not being able to accurately read
// the dataset with the differing decimal marker. Although this edge case where we attempt
// to load in datasets with different decimal markers at once is unlikely to occur, we
// should still be aware of this and plan to fix it in the future.

// The decimal marker that separates the integer part from the fractional part of a number
// written in decimal from can vary across different cultures as either '.' or ','. The
// default decimal marker in ML .NET is '.', however through this static char variable,
// we allow users to specify the decimal marker used in their datasets as ',' as well.
[BestFriend]
internal static char DecimalMarker = '.';

// REVIEW: casting ulong to Double doesn't always do the right thing, for example
// with 0x84595161401484A0UL. Hence the gymnastics several places in this code. Note that
// long to Double does work. The work around is:
Expand Down Expand Up @@ -85,24 +83,24 @@ public enum Result
/// <summary>
/// This produces zero for an empty string.
/// </summary>
public static bool TryParse(ReadOnlySpan<char> span, out Single value)
public static bool TryParse(ReadOnlySpan<char> span, out Single value, OptionFlags flags = OptionFlags.Default)
{
var res = Parse(span, out value);
var res = Parse(span, out value, flags);
Contracts.Assert(res != Result.Empty || value == 0);
return res <= Result.Empty;
}

/// <summary>
/// This produces zero for an empty string.
/// </summary>
public static bool TryParse(ReadOnlySpan<char> span, out Double value)
public static bool TryParse(ReadOnlySpan<char> span, out Double value, OptionFlags flags = OptionFlags.Default)
{
var res = Parse(span, out value);
var res = Parse(span, out value, flags);
Contracts.Assert(res != Result.Empty || value == 0);
return res <= Result.Empty;
}

public static Result Parse(ReadOnlySpan<char> span, out Single value)
public static Result Parse(ReadOnlySpan<char> span, out Single value, OptionFlags flags = OptionFlags.Default)
{
int ich = 0;
for (; ; ich++)
Expand Down Expand Up @@ -133,7 +131,7 @@ public static Result Parse(ReadOnlySpan<char> span, out Single value)
}

int ichEnd;
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd))
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd, flags))
{
value = default(Single);
return Result.Error;
Expand All @@ -150,7 +148,7 @@ public static Result Parse(ReadOnlySpan<char> span, out Single value)
return Result.Good;
}

public static Result Parse(ReadOnlySpan<char> span, out Double value)
public static Result Parse(ReadOnlySpan<char> span, out Double value, OptionFlags flags = OptionFlags.Default)
{
int ich = 0;
for (; ; ich++)
Expand Down Expand Up @@ -181,7 +179,7 @@ public static Result Parse(ReadOnlySpan<char> span, out Double value)
}

int ichEnd;
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd))
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd, flags))
{
value = default(Double);
return Result.Error;
Expand All @@ -198,14 +196,14 @@ public static Result Parse(ReadOnlySpan<char> span, out Double value)
return Result.Good;
}

public static bool TryParse(ReadOnlySpan<char> span, out Single value, out int ichEnd)
public static bool TryParse(ReadOnlySpan<char> span, out Single value, out int ichEnd, OptionFlags flags = OptionFlags.Default)
{
bool neg = false;
ulong num = 0;
long exp = 0;

ichEnd = 0;
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp))
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp, flags))
return TryParseSpecial(span, ref ichEnd, out value);

if (num == 0)
Expand Down Expand Up @@ -287,14 +285,14 @@ public static bool TryParse(ReadOnlySpan<char> span, out Single value, out int i
return true;
}

public static bool TryParse(ReadOnlySpan<char> span, out Double value, out int ichEnd)
public static bool TryParse(ReadOnlySpan<char> span, out Double value, out int ichEnd, OptionFlags flags = OptionFlags.Default)
{
bool neg = false;
ulong num = 0;
long exp = 0;

ichEnd = 0;
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp))
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp, flags))
return TryParseSpecial(span, ref ichEnd, out value);

if (num == 0)
Expand Down Expand Up @@ -535,13 +533,19 @@ private static bool TryParseSpecial(ReadOnlySpan<char> span, ref int ich, out Si
return false;
}

private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool neg, ref ulong num, ref long exp)
private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool neg, ref ulong num, ref long exp, OptionFlags flags = OptionFlags.Default)
{
Contracts.Assert(0 <= ich & ich <= span.Length);
Contracts.Assert(!neg);
Contracts.Assert(num == 0);
Contracts.Assert(exp == 0);

char decimalMarker;
if ((flags & OptionFlags.UseCommaAsDecimalMarker) != 0)
decimalMarker = ',';
else
decimalMarker = '.';
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why as a flag OptionFlags.UseCommaAsDecimalMarker instead of just having a user enter a char directly? Having user enter a char directly lets them any character like _ in 1000_23.

Pandas lets users enter the char directly, see decimal parameter:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

They also allow for entering the char for thousands (technically allow a string).

Excel also lets the user enter the chars directly:
https://www.officetooltips.com/excel_2016/tips/change_the_decimal_point_to_a_comma_or_vice_versa.html

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The public API to let users change the decimal marker was implemented and decided on #5145. We accept a character as the DecimalMarker option, but we actually only accept "," or "." and throw exceptions if the user sets other character.

On this PR I am not changing that behavior, but actually only changing how we connect the TextLoader to the DoubleParser (as the way we connected them wasn't thread safe, and would have unexpected behavior in some cases). Since we only accept "," or ".", then I used a Flag here. Since this flag is of internal use (i.e. it isn't determining a public API), it could later be replaced by an options object or something else if we get to accept other characters/strings as decimal separators. In any case, if we were to accept other characters different from "," and ".", then we'd actually need to change other things on the parser and on textloader to make this work correctly... so perhaps that could be left for future work, since I don't think we'd have time to correctly implement and test those other options before the next release. So, I think it's better to simply merge this PR to enable users to use "," and "." as decimal markers in a safe manner in the upcoming release, and then discuss about the possibility of enabling other characters.

As for the thousands parameter you mention, it is included inside a list of possible features that we might want to implement for TextLoader, but it isn't in our plans to implement it before the upcoming release.


In reply to: 429606655 [](ancestors = 429606655)


if (ich >= span.Length)
return false;

Expand Down Expand Up @@ -570,11 +574,11 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool
break;

case '.':
if (DecimalMarker != '.') // Decimal marker was not '.', but we encountered a '.', which must be an error.
if (decimalMarker != '.') // Decimal marker was not '.', but we encountered a '.', which must be an error.
return false; // Since this was an error, return false, which will later make the caller to set NaN as the out value.
goto LPoint;
case ',':
if (DecimalMarker != ',') // Same logic as above.
if (decimalMarker != ',') // Same logic as above.
return false;
goto LPoint;

Expand Down Expand Up @@ -614,12 +618,12 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool
}
Contracts.Assert(i < span.Length);

if (span[i] != DecimalMarker)
if (span[i] != decimalMarker)
goto LAfterDigits;

LPoint:
Contracts.Assert(i < span.Length);
Contracts.Assert(span[i] == DecimalMarker);
Contracts.Assert(span[i] == decimalMarker);

// Get the digits after the decimal marker, which may be '.' or ','
for (; ; )
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Commands/ShowSchemaCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ private static void ShowMetadataValue<T>(IndentedTextWriter itw, DataViewSchema
Contracts.Assert(!(type is VectorDataViewType));
Contracts.Assert(type.RawType == typeof(T));

var conv = Conversions.Instance.GetStringConversion<T>(type);
var conv = Conversions.DefaultInstance.GetStringConversion<T>(type);

var value = default(T);
var sb = default(StringBuilder);
Expand Down Expand Up @@ -272,7 +272,7 @@ private static void ShowMetadataValueVec<T>(IndentedTextWriter itw, DataViewSche
Contracts.AssertValue(type);
Contracts.Assert(type.ItemType.RawType == typeof(T));

var conv = Conversions.Instance.GetStringConversion<T>(type.ItemType);
var conv = Conversions.DefaultInstance.GetStringConversion<T>(type.ItemType);

var value = default(VBuffer<T>);
schema[col].Annotations.GetValue(kind, ref value);
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Commands/TypeInfoCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ public void Run()
{
using (var ch = _host.Start("Run"))
{
var conv = Conversions.Instance;
var conv = Conversions.DefaultInstance;
var comp = new SetOfKindsComparer();
var dstToSrcMap = new Dictionary<HashSet<InternalDataKind>, HashSet<InternalDataKind>>(comp);
var srcToDstMap = new Dictionary<InternalDataKind, HashSet<InternalDataKind>>();
Expand Down Expand Up @@ -143,7 +143,7 @@ private TypeNaInfo KindReport<T>(IChannel ch, PrimitiveDataViewType type)
ch.AssertValue(type);
ch.Assert(type.IsStandardScalar());

var conv = Conversions.Instance;
var conv = Conversions.DefaultInstance;
InPredicate<T> isNaDel;
bool hasNaPred = conv.TryGetIsNAPredicate(type, out isNaDel);
bool defaultIsNa = false;
Expand Down
31 changes: 20 additions & 11 deletions src/Microsoft.ML.Data/Data/Conversion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,26 @@ private static readonly FuncInstanceMethodInfo1<Conversions, KeyDataViewType, De

// REVIEW: Reconcile implementations with TypeUtils, and clarify the distinction.

// Singleton pattern.
private static volatile Conversions _instance;
public static Conversions Instance
// Default instance used by most of the codebase
// Currently, only TextLoader would sometimes not use this instance
private static volatile Conversions _defaultInstance;
public static Conversions DefaultInstance
{
get
{
return _instance ??
Interlocked.CompareExchange(ref _instance, new Conversions(), null) ??
_instance;
return _defaultInstance ??
Interlocked.CompareExchange(ref _defaultInstance, new Conversions(), null) ??
_defaultInstance;
}
}

// Currently only TextLoader could create instances using non-default DoubleParser.OptionFlags
private readonly DoubleParser.OptionFlags _doubleParserOptionFlags;
public static Conversions CreateInstanceWithDoubleParserOptions(DoubleParser.OptionFlags doubleParserOptionFlags)
{
return new Conversions(doubleParserOptionFlags);
}

// Maps from {src,dst} pair of DataKind to ValueMapper. The {src,dst} pair is
// the two byte values packed into the low two bytes of an int, with src the lsb.
private readonly Dictionary<(Type src, Type dst), Delegate> _delegatesStd;
Expand Down Expand Up @@ -92,7 +100,7 @@ public static Conversions Instance
// This has TryParseMapper<T> delegates for parsing values from text.
private readonly Dictionary<Type, Delegate> _tryParseDelegates;

private Conversions()
private Conversions(DoubleParser.OptionFlags doubleParserOptionFlags = DoubleParser.OptionFlags.Default)
{
_delegatesStd = new Dictionary<(Type src, Type dst), Delegate>();
_delegatesAll = new Dictionary<(Type src, Type dst), Delegate>();
Expand All @@ -102,6 +110,7 @@ private Conversions()
_hasZeroDelegates = new Dictionary<Type, Delegate>();
_getNADelegates = new Dictionary<Type, Delegate>();
_tryParseDelegates = new Dictionary<Type, Delegate>();
_doubleParserOptionFlags = doubleParserOptionFlags;

// !!! WARNING !!!: Do NOT add any standard conversions without clearing from the IDV Type System
// design committee. Any changes also require updating the IDV Type System Specification.
Expand Down Expand Up @@ -1333,7 +1342,7 @@ private void TryParseSigned(long max, in TX text, out long? result)
public bool TryParse(in TX src, out R4 dst)
{
var span = src.Span;
if (DoubleParser.TryParse(span, out dst))
if (DoubleParser.TryParse(span, out dst, _doubleParserOptionFlags))
return true;
dst = R4.NaN;
return IsStdMissing(ref span);
Expand All @@ -1346,7 +1355,7 @@ public bool TryParse(in TX src, out R4 dst)
public bool TryParse(in TX src, out R8 dst)
{
var span = src.Span;
if (DoubleParser.TryParse(span, out dst))
if (DoubleParser.TryParse(span, out dst, _doubleParserOptionFlags))
return true;
dst = R8.NaN;
return IsStdMissing(ref span);
Expand Down Expand Up @@ -1630,15 +1639,15 @@ public void Convert(in TX span, ref UG value)
public void Convert(in TX src, ref R4 value)
{
var span = src.Span;
if (DoubleParser.TryParse(span, out value))
if (DoubleParser.TryParse(span, out value, _doubleParserOptionFlags))
return;
// Unparsable is mapped to NA.
value = R4.NaN;
}
public void Convert(in TX src, ref R8 value)
{
var span = src.Span;
if (DoubleParser.TryParse(span, out value))
if (DoubleParser.TryParse(span, out value, _doubleParserOptionFlags))
return;
// Unparsable is mapped to NA.
value = R8.NaN;
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Data/DataViewUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1352,7 +1352,7 @@ public static ValueGetter<ReadOnlyMemory<char>> GetSingleValueGetter<T>(DataView
var floatGetter = cursor.GetGetter<T>(cursor.Schema[i]);
T v = default(T);
ValueMapper<T, StringBuilder> conversion;
if (!Conversions.Instance.TryGetStringConversion<T>(colType, out conversion))
if (!Conversions.DefaultInstance.TryGetStringConversion<T>(colType, out conversion))
{
var error = $"Cannot display {colType}";
conversion = (in T src, ref StringBuilder builder) =>
Expand Down Expand Up @@ -1383,7 +1383,7 @@ public static ValueGetter<ReadOnlyMemory<char>> GetVectorFlatteningGetter<T>(Dat
var vbuf = default(VBuffer<T>);
const int previewValues = 100;
ValueMapper<T, StringBuilder> conversion;
Conversions.Instance.TryGetStringConversion<T>(colType, out conversion);
Conversions.DefaultInstance.TryGetStringConversion<T>(colType, out conversion);
StringBuilder dst = null;
ValueGetter<ReadOnlyMemory<char>> getter =
(ref ReadOnlyMemory<char> value) =>
Expand Down
6 changes: 3 additions & 3 deletions src/Microsoft.ML.Data/Data/RowCursorUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ private static ValueGetter<TDst> GetGetterAsCore<TSrc, TDst>(DataViewType typeSr

var getter = row.GetGetter<TSrc>(row.Schema[col]);
bool identity;
var conv = Conversions.Instance.GetStandardConversion<TSrc, TDst>(typeSrc, typeDst, out identity);
var conv = Conversions.DefaultInstance.GetStandardConversion<TSrc, TDst>(typeSrc, typeDst, out identity);
if (identity)
{
Contracts.Assert(typeof(TSrc) == typeof(TDst));
Expand Down Expand Up @@ -134,7 +134,7 @@ private static ValueGetter<StringBuilder> GetGetterAsStringBuilderCore<TSrc>(Dat
Contracts.Assert(typeof(TSrc) == typeSrc.RawType);

var getter = row.GetGetter<TSrc>(row.Schema[col]);
var conv = Conversions.Instance.GetStringConversion<TSrc>(typeSrc);
var conv = Conversions.DefaultInstance.GetStringConversion<TSrc>(typeSrc);

var src = default(TSrc);
return
Expand Down Expand Up @@ -260,7 +260,7 @@ private static ValueGetter<VBuffer<TDst>> GetVecGetterAsCore<TSrc, TDst>(VectorD

var getter = getterFact.GetGetter<VBuffer<TSrc>>();
bool identity;
var conv = Conversions.Instance.GetStandardConversion<TSrc, TDst>(typeSrc.ItemType, typeDst, out identity);
var conv = Conversions.DefaultInstance.GetStandardConversion<TSrc, TDst>(typeSrc.ItemType, typeDst, out identity);
if (identity)
{
Contracts.Assert(typeof(TSrc) == typeof(TDst));
Expand Down
2 changes: 0 additions & 2 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1631,15 +1631,13 @@ public BoundLoader(TextLoader loader, IMultiStreamSource files)
public DataViewRowCursor GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand = null)
{
_host.CheckValueOrNull(rand);
DoubleParser.DecimalMarker = _loader._decimalMarker;
var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded);
return Cursor.Create(_loader, _files, active);
}

public DataViewRowCursor[] GetRowCursorSet(IEnumerable<DataViewSchema.Column> columnsNeeded, int n, Random rand = null)
{
_host.CheckValueOrNull(rand);
DoubleParser.DecimalMarker = _loader._decimalMarker;
var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded);
return Cursor.CreateSet(_loader, _files, active, n);
}
Expand Down
Loading