Skip to content

Commit bc9abda

Browse files
Created DoubleParser.OptionFlags to be used internally by TextLoader (#5154)
* Created the DoubleParser.OptionFlags enum, and changed Conversion.cs and TextLoader.ValueCreatorCache so that the flags can be used by TextLoader.Parser.
* Broke the singleton pattern on Conversions and ValueCreatorCache so that additional instances can be created with custom DoubleParser.OptionFlags. These new instances are only used by TextLoader; the DefaultInstance is still used everywhere else in the codebase.
1 parent 27141e3 commit bc9abda

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+315
-156
lines changed

src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ public bool HasAllBooleanValues()
8282
bool value;
8383
// (note: Conversions.Instance.TryParse parses an empty string as a Boolean)
8484
return !string.IsNullOrEmpty(x.ToString()) &&
85-
Conversions.Instance.TryParse(in x, out value);
85+
Conversions.DefaultInstance.TryParse(in x, out value);
8686
}))
8787
{
8888
return true;
@@ -164,7 +164,7 @@ public void Apply(IntermediateColumn[] columns)
164164
col.SuggestedType = BooleanDataViewType.Instance;
165165
bool first;
166166

167-
col.HasHeader = !Conversions.Instance.TryParse(in col.RawData[0], out first);
167+
col.HasHeader = !Conversions.DefaultInstance.TryParse(in col.RawData[0], out first);
168168
}
169169
}
170170
}
@@ -179,7 +179,7 @@ public void Apply(IntermediateColumn[] columns)
179179
.All(x =>
180180
{
181181
float value;
182-
return Conversions.Instance.TryParse(in x, out value);
182+
return Conversions.DefaultInstance.TryParse(in x, out value);
183183
})
184184
)
185185
{

src/Microsoft.ML.Core/Utilities/DoubleParser.cs

Lines changed: 36 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,24 @@ namespace Microsoft.ML.Internal.Utilities
1212
[BestFriend]
1313
internal static class DoubleParser
1414
{
15+
[BestFriend]
16+
[Flags]
17+
internal enum OptionFlags : uint
18+
{
19+
Default = 0x00,
20+
21+
// If this flag is set, then a "," will be used as Decimal Marker
22+
// (i.e., the punctuation mark that separates the integer part of
23+
// a number and its decimal part). If this isn't set, then
24+
// default behavior is to use "." as decimal marker.
25+
UseCommaAsDecimalMarker = 0x01,
26+
}
27+
1528
private const ulong TopBit = 0x8000000000000000UL;
1629
private const ulong TopTwoBits = 0xC000000000000000UL;
1730
private const ulong TopThreeBits = 0xE000000000000000UL;
1831
private const char InfinitySymbol = '\u221E';
1932

20-
// Note for future development: DoubleParser is a static class and DecimalMarker is a
21-
// static variable, which means only one instance of these can exist at once. As such,
22-
// the value of DecimalMarker cannot vary when datasets with differing decimal markers
23-
// are loaded together at once, which would result in not being able to accurately read
24-
// the dataset with the differing decimal marker. Although this edge case where we attempt
25-
// to load in datasets with different decimal markers at once is unlikely to occur, we
26-
// should still be aware of this and plan to fix it in the future.
27-
28-
// The decimal marker that separates the integer part from the fractional part of a number
29-
// written in decimal form can vary across different cultures as either '.' or ','. The
30-
// default decimal marker in ML .NET is '.', however through this static char variable,
31-
// we allow users to specify the decimal marker used in their datasets as ',' as well.
32-
[BestFriend]
33-
internal static char DecimalMarker = '.';
34-
3533
// REVIEW: casting ulong to Double doesn't always do the right thing, for example
3634
// with 0x84595161401484A0UL. Hence the gymnastics several places in this code. Note that
3735
// long to Double does work. The work around is:
@@ -85,24 +83,24 @@ public enum Result
8583
/// <summary>
8684
/// This produces zero for an empty string.
8785
/// </summary>
88-
public static bool TryParse(ReadOnlySpan<char> span, out Single value)
86+
public static bool TryParse(ReadOnlySpan<char> span, out Single value, OptionFlags flags = OptionFlags.Default)
8987
{
90-
var res = Parse(span, out value);
88+
var res = Parse(span, out value, flags);
9189
Contracts.Assert(res != Result.Empty || value == 0);
9290
return res <= Result.Empty;
9391
}
9492

9593
/// <summary>
9694
/// This produces zero for an empty string.
9795
/// </summary>
98-
public static bool TryParse(ReadOnlySpan<char> span, out Double value)
96+
public static bool TryParse(ReadOnlySpan<char> span, out Double value, OptionFlags flags = OptionFlags.Default)
9997
{
100-
var res = Parse(span, out value);
98+
var res = Parse(span, out value, flags);
10199
Contracts.Assert(res != Result.Empty || value == 0);
102100
return res <= Result.Empty;
103101
}
104102

105-
public static Result Parse(ReadOnlySpan<char> span, out Single value)
103+
public static Result Parse(ReadOnlySpan<char> span, out Single value, OptionFlags flags = OptionFlags.Default)
106104
{
107105
int ich = 0;
108106
for (; ; ich++)
@@ -133,7 +131,7 @@ public static Result Parse(ReadOnlySpan<char> span, out Single value)
133131
}
134132

135133
int ichEnd;
136-
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd))
134+
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd, flags))
137135
{
138136
value = default(Single);
139137
return Result.Error;
@@ -150,7 +148,7 @@ public static Result Parse(ReadOnlySpan<char> span, out Single value)
150148
return Result.Good;
151149
}
152150

153-
public static Result Parse(ReadOnlySpan<char> span, out Double value)
151+
public static Result Parse(ReadOnlySpan<char> span, out Double value, OptionFlags flags = OptionFlags.Default)
154152
{
155153
int ich = 0;
156154
for (; ; ich++)
@@ -181,7 +179,7 @@ public static Result Parse(ReadOnlySpan<char> span, out Double value)
181179
}
182180

183181
int ichEnd;
184-
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd))
182+
if (!DoubleParser.TryParse(span.Slice(ich, span.Length - ich), out value, out ichEnd, flags))
185183
{
186184
value = default(Double);
187185
return Result.Error;
@@ -198,14 +196,14 @@ public static Result Parse(ReadOnlySpan<char> span, out Double value)
198196
return Result.Good;
199197
}
200198

201-
public static bool TryParse(ReadOnlySpan<char> span, out Single value, out int ichEnd)
199+
public static bool TryParse(ReadOnlySpan<char> span, out Single value, out int ichEnd, OptionFlags flags = OptionFlags.Default)
202200
{
203201
bool neg = false;
204202
ulong num = 0;
205203
long exp = 0;
206204

207205
ichEnd = 0;
208-
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp))
206+
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp, flags))
209207
return TryParseSpecial(span, ref ichEnd, out value);
210208

211209
if (num == 0)
@@ -287,14 +285,14 @@ public static bool TryParse(ReadOnlySpan<char> span, out Single value, out int i
287285
return true;
288286
}
289287

290-
public static bool TryParse(ReadOnlySpan<char> span, out Double value, out int ichEnd)
288+
public static bool TryParse(ReadOnlySpan<char> span, out Double value, out int ichEnd, OptionFlags flags = OptionFlags.Default)
291289
{
292290
bool neg = false;
293291
ulong num = 0;
294292
long exp = 0;
295293

296294
ichEnd = 0;
297-
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp))
295+
if (!TryParseCore(span, ref ichEnd, ref neg, ref num, ref exp, flags))
298296
return TryParseSpecial(span, ref ichEnd, out value);
299297

300298
if (num == 0)
@@ -535,13 +533,19 @@ private static bool TryParseSpecial(ReadOnlySpan<char> span, ref int ich, out Si
535533
return false;
536534
}
537535

538-
private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool neg, ref ulong num, ref long exp)
536+
private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool neg, ref ulong num, ref long exp, OptionFlags flags = OptionFlags.Default)
539537
{
540538
Contracts.Assert(0 <= ich & ich <= span.Length);
541539
Contracts.Assert(!neg);
542540
Contracts.Assert(num == 0);
543541
Contracts.Assert(exp == 0);
544542

543+
char decimalMarker;
544+
if ((flags & OptionFlags.UseCommaAsDecimalMarker) != 0)
545+
decimalMarker = ',';
546+
else
547+
decimalMarker = '.';
548+
545549
if (ich >= span.Length)
546550
return false;
547551

@@ -570,11 +574,11 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool
570574
break;
571575

572576
case '.':
573-
if (DecimalMarker != '.') // Decimal marker was not '.', but we encountered a '.', which must be an error.
577+
if (decimalMarker != '.') // Decimal marker was not '.', but we encountered a '.', which must be an error.
574578
return false; // Since this was an error, return false, which will later make the caller to set NaN as the out value.
575579
goto LPoint;
576580
case ',':
577-
if (DecimalMarker != ',') // Same logic as above.
581+
if (decimalMarker != ',') // Same logic as above.
578582
return false;
579583
goto LPoint;
580584

@@ -614,12 +618,12 @@ private static bool TryParseCore(ReadOnlySpan<char> span, ref int ich, ref bool
614618
}
615619
Contracts.Assert(i < span.Length);
616620

617-
if (span[i] != DecimalMarker)
621+
if (span[i] != decimalMarker)
618622
goto LAfterDigits;
619623

620624
LPoint:
621625
Contracts.Assert(i < span.Length);
622-
Contracts.Assert(span[i] == DecimalMarker);
626+
Contracts.Assert(span[i] == decimalMarker);
623627

624628
// Get the digits after the decimal marker, which may be '.' or ','
625629
for (; ; )

src/Microsoft.ML.Data/Commands/ShowSchemaCommand.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ private static void ShowMetadataValue<T>(IndentedTextWriter itw, DataViewSchema
234234
Contracts.Assert(!(type is VectorDataViewType));
235235
Contracts.Assert(type.RawType == typeof(T));
236236

237-
var conv = Conversions.Instance.GetStringConversion<T>(type);
237+
var conv = Conversions.DefaultInstance.GetStringConversion<T>(type);
238238

239239
var value = default(T);
240240
var sb = default(StringBuilder);
@@ -272,7 +272,7 @@ private static void ShowMetadataValueVec<T>(IndentedTextWriter itw, DataViewSche
272272
Contracts.AssertValue(type);
273273
Contracts.Assert(type.ItemType.RawType == typeof(T));
274274

275-
var conv = Conversions.Instance.GetStringConversion<T>(type.ItemType);
275+
var conv = Conversions.DefaultInstance.GetStringConversion<T>(type.ItemType);
276276

277277
var value = default(VBuffer<T>);
278278
schema[col].Annotations.GetValue(kind, ref value);

src/Microsoft.ML.Data/Commands/TypeInfoCommand.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ public void Run()
7979
{
8080
using (var ch = _host.Start("Run"))
8181
{
82-
var conv = Conversions.Instance;
82+
var conv = Conversions.DefaultInstance;
8383
var comp = new SetOfKindsComparer();
8484
var dstToSrcMap = new Dictionary<HashSet<InternalDataKind>, HashSet<InternalDataKind>>(comp);
8585
var srcToDstMap = new Dictionary<InternalDataKind, HashSet<InternalDataKind>>();
@@ -143,7 +143,7 @@ private TypeNaInfo KindReport<T>(IChannel ch, PrimitiveDataViewType type)
143143
ch.AssertValue(type);
144144
ch.Assert(type.IsStandardScalar());
145145

146-
var conv = Conversions.Instance;
146+
var conv = Conversions.DefaultInstance;
147147
InPredicate<T> isNaDel;
148148
bool hasNaPred = conv.TryGetIsNAPredicate(type, out isNaDel);
149149
bool defaultIsNa = false;

src/Microsoft.ML.Data/Data/Conversion.cs

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -53,18 +53,26 @@ private static readonly FuncInstanceMethodInfo1<Conversions, KeyDataViewType, De
5353

5454
// REVIEW: Reconcile implementations with TypeUtils, and clarify the distinction.
5555

56-
// Singleton pattern.
57-
private static volatile Conversions _instance;
58-
public static Conversions Instance
56+
// Default instance used by most of the codebase
57+
// Currently, only TextLoader would sometimes not use this instance
58+
private static volatile Conversions _defaultInstance;
59+
public static Conversions DefaultInstance
5960
{
6061
get
6162
{
62-
return _instance ??
63-
Interlocked.CompareExchange(ref _instance, new Conversions(), null) ??
64-
_instance;
63+
return _defaultInstance ??
64+
Interlocked.CompareExchange(ref _defaultInstance, new Conversions(), null) ??
65+
_defaultInstance;
6566
}
6667
}
6768

69+
// Currently only TextLoader could create instances using non-default DoubleParser.OptionFlags
70+
private readonly DoubleParser.OptionFlags _doubleParserOptionFlags;
71+
public static Conversions CreateInstanceWithDoubleParserOptions(DoubleParser.OptionFlags doubleParserOptionFlags)
72+
{
73+
return new Conversions(doubleParserOptionFlags);
74+
}
75+
6876
// Maps from {src,dst} pair of DataKind to ValueMapper. The {src,dst} pair is
6977
// the two byte values packed into the low two bytes of an int, with src the lsb.
7078
private readonly Dictionary<(Type src, Type dst), Delegate> _delegatesStd;
@@ -92,7 +100,7 @@ public static Conversions Instance
92100
// This has TryParseMapper<T> delegates for parsing values from text.
93101
private readonly Dictionary<Type, Delegate> _tryParseDelegates;
94102

95-
private Conversions()
103+
private Conversions(DoubleParser.OptionFlags doubleParserOptionFlags = DoubleParser.OptionFlags.Default)
96104
{
97105
_delegatesStd = new Dictionary<(Type src, Type dst), Delegate>();
98106
_delegatesAll = new Dictionary<(Type src, Type dst), Delegate>();
@@ -102,6 +110,7 @@ private Conversions()
102110
_hasZeroDelegates = new Dictionary<Type, Delegate>();
103111
_getNADelegates = new Dictionary<Type, Delegate>();
104112
_tryParseDelegates = new Dictionary<Type, Delegate>();
113+
_doubleParserOptionFlags = doubleParserOptionFlags;
105114

106115
// !!! WARNING !!!: Do NOT add any standard conversions without clearing from the IDV Type System
107116
// design committee. Any changes also require updating the IDV Type System Specification.
@@ -1333,7 +1342,7 @@ private void TryParseSigned(long max, in TX text, out long? result)
13331342
public bool TryParse(in TX src, out R4 dst)
13341343
{
13351344
var span = src.Span;
1336-
if (DoubleParser.TryParse(span, out dst))
1345+
if (DoubleParser.TryParse(span, out dst, _doubleParserOptionFlags))
13371346
return true;
13381347
dst = R4.NaN;
13391348
return IsStdMissing(ref span);
@@ -1346,7 +1355,7 @@ public bool TryParse(in TX src, out R4 dst)
13461355
public bool TryParse(in TX src, out R8 dst)
13471356
{
13481357
var span = src.Span;
1349-
if (DoubleParser.TryParse(span, out dst))
1358+
if (DoubleParser.TryParse(span, out dst, _doubleParserOptionFlags))
13501359
return true;
13511360
dst = R8.NaN;
13521361
return IsStdMissing(ref span);
@@ -1630,15 +1639,15 @@ public void Convert(in TX span, ref UG value)
16301639
public void Convert(in TX src, ref R4 value)
16311640
{
16321641
var span = src.Span;
1633-
if (DoubleParser.TryParse(span, out value))
1642+
if (DoubleParser.TryParse(span, out value, _doubleParserOptionFlags))
16341643
return;
16351644
// Unparsable is mapped to NA.
16361645
value = R4.NaN;
16371646
}
16381647
public void Convert(in TX src, ref R8 value)
16391648
{
16401649
var span = src.Span;
1641-
if (DoubleParser.TryParse(span, out value))
1650+
if (DoubleParser.TryParse(span, out value, _doubleParserOptionFlags))
16421651
return;
16431652
// Unparsable is mapped to NA.
16441653
value = R8.NaN;

src/Microsoft.ML.Data/Data/DataViewUtils.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1352,7 +1352,7 @@ public static ValueGetter<ReadOnlyMemory<char>> GetSingleValueGetter<T>(DataView
13521352
var floatGetter = cursor.GetGetter<T>(cursor.Schema[i]);
13531353
T v = default(T);
13541354
ValueMapper<T, StringBuilder> conversion;
1355-
if (!Conversions.Instance.TryGetStringConversion<T>(colType, out conversion))
1355+
if (!Conversions.DefaultInstance.TryGetStringConversion<T>(colType, out conversion))
13561356
{
13571357
var error = $"Cannot display {colType}";
13581358
conversion = (in T src, ref StringBuilder builder) =>
@@ -1383,7 +1383,7 @@ public static ValueGetter<ReadOnlyMemory<char>> GetVectorFlatteningGetter<T>(Dat
13831383
var vbuf = default(VBuffer<T>);
13841384
const int previewValues = 100;
13851385
ValueMapper<T, StringBuilder> conversion;
1386-
Conversions.Instance.TryGetStringConversion<T>(colType, out conversion);
1386+
Conversions.DefaultInstance.TryGetStringConversion<T>(colType, out conversion);
13871387
StringBuilder dst = null;
13881388
ValueGetter<ReadOnlyMemory<char>> getter =
13891389
(ref ReadOnlyMemory<char> value) =>

src/Microsoft.ML.Data/Data/RowCursorUtils.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ private static ValueGetter<TDst> GetGetterAsCore<TSrc, TDst>(DataViewType typeSr
9494

9595
var getter = row.GetGetter<TSrc>(row.Schema[col]);
9696
bool identity;
97-
var conv = Conversions.Instance.GetStandardConversion<TSrc, TDst>(typeSrc, typeDst, out identity);
97+
var conv = Conversions.DefaultInstance.GetStandardConversion<TSrc, TDst>(typeSrc, typeDst, out identity);
9898
if (identity)
9999
{
100100
Contracts.Assert(typeof(TSrc) == typeof(TDst));
@@ -134,7 +134,7 @@ private static ValueGetter<StringBuilder> GetGetterAsStringBuilderCore<TSrc>(Dat
134134
Contracts.Assert(typeof(TSrc) == typeSrc.RawType);
135135

136136
var getter = row.GetGetter<TSrc>(row.Schema[col]);
137-
var conv = Conversions.Instance.GetStringConversion<TSrc>(typeSrc);
137+
var conv = Conversions.DefaultInstance.GetStringConversion<TSrc>(typeSrc);
138138

139139
var src = default(TSrc);
140140
return
@@ -260,7 +260,7 @@ private static ValueGetter<VBuffer<TDst>> GetVecGetterAsCore<TSrc, TDst>(VectorD
260260

261261
var getter = getterFact.GetGetter<VBuffer<TSrc>>();
262262
bool identity;
263-
var conv = Conversions.Instance.GetStandardConversion<TSrc, TDst>(typeSrc.ItemType, typeDst, out identity);
263+
var conv = Conversions.DefaultInstance.GetStandardConversion<TSrc, TDst>(typeSrc.ItemType, typeDst, out identity);
264264
if (identity)
265265
{
266266
Contracts.Assert(typeof(TSrc) == typeof(TDst));

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1631,15 +1631,13 @@ public BoundLoader(TextLoader loader, IMultiStreamSource files)
16311631
public DataViewRowCursor GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand = null)
16321632
{
16331633
_host.CheckValueOrNull(rand);
1634-
DoubleParser.DecimalMarker = _loader._decimalMarker;
16351634
var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded);
16361635
return Cursor.Create(_loader, _files, active);
16371636
}
16381637

16391638
public DataViewRowCursor[] GetRowCursorSet(IEnumerable<DataViewSchema.Column> columnsNeeded, int n, Random rand = null)
16401639
{
16411640
_host.CheckValueOrNull(rand);
1642-
DoubleParser.DecimalMarker = _loader._decimalMarker;
16431641
var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded);
16441642
return Cursor.CreateSet(_loader, _files, active, n);
16451643
}

0 commit comments

Comments
 (0)