Skip to content

Commit e17aeb8

Browse files
committed
fast CSV
1 parent 9f74924 commit e17aeb8

File tree

1 file changed

+113
-95
lines changed

1 file changed

+113
-95
lines changed

Signum.Utilities/Csv.cs

Lines changed: 113 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
using System.Collections.Concurrent;
77
using System.Collections;
88
using System.IO.Pipes;
9+
using System;
10+
using System.ComponentModel.Design.Serialization;
911

1012
namespace Signum.Utilities;
1113

@@ -181,8 +183,7 @@ public static IEnumerable<T> ReadStream<T>(Stream stream, Encoding? encoding = n
181183

182184
var members = CsvMemberCache<T>.Members;
183185
var parsers = members.Select(m => GetParser(defCulture, m, defOptions.ParserFactory)).ToList();
184-
185-
Regex regex = GetRegex(defCulture, defOptions.RegexTimeout, defOptions.ListSeparator);
186+
Regex valueRegex = GetRegex(isLine: false, defCulture, defOptions.RegexTimeout, defOptions.ListSeparator);
186187

187188
if (defOptions.AsumeSingleLine)
188189
{
@@ -199,66 +200,68 @@ public static IEnumerable<T> ReadStream<T>(Stream stream, Encoding? encoding = n
199200
if (csvLine == null)
200201
yield break;
201202

202-
Match? m = null;
203-
T? t = null;
204-
try
203+
if (csvLine.Length > 0)
205204
{
206-
m = regex.Match(csvLine);
207-
if (m.Length > 0)
205+
T? t = null;
206+
try
208207
{
209-
t = ReadObject<T>(m, members, parsers);
208+
var m = valueRegex.EnumerateMatches(csvLine);
209+
210+
t = ReadObject<T>(m, csvLine.AsSpan(), members, parsers);
210211
}
211-
}
212-
catch (Exception e)
213-
{
214-
e.Data["row"] = line;
212+
catch (Exception e)
213+
{
214+
e.Data["row"] = line;
215215

216-
if (defOptions.SkipError == null || !defOptions.SkipError(e, m))
217-
throw new ParseCsvException(e);
218-
}
216+
if (defOptions.SkipError == null || !defOptions.SkipError(e, csvLine))
217+
throw new ParseCsvException(e);
218+
}
219219

220-
if (t != null)
221-
yield return t;
220+
if (t != null)
221+
yield return t;
222222

223+
}
223224
line++;
224225
}
225226
}
226227
}
227228
else
228229
{
230+
// Branch taken when AsumeSingleLine is false: the whole stream is read into memory
// and split into records with the line-level regex.
Regex lineRegex = GetRegex(isLine: true, defCulture, defOptions.RegexTimeout, defOptions.ListSeparator);

using (StreamReader sr = new StreamReader(stream, encoding))
{
    string str = sr.ReadToEnd();

    // Zero-based index of the current record; also stored in e.Data["row"] on failure.
    int i = -1;
    foreach (Match m in lineRegex.Matches(str))
    {
        // Advance the counter first, before any early exit. Incrementing only at the
        // bottom of the loop meant 'continue' never moved past the first skipped record,
        // so any skipLines > 0 silently discarded the whole file.
        i++;

        if (i < skipLines)
            continue;

        if (m.Length == 0)
            continue;

        string line = m.Value;

        // 'yield return' cannot appear inside a try block that has a catch clause,
        // so the result is carried out of the try in a local.
        T? t = null;
        try
        {
            if (options?.Constructor != null)
                t = options.Constructor(line);
            else
                t = ReadObject<T>(valueRegex.EnumerateMatches(line), line, members, parsers);
        }
        catch (Exception e)
        {
            e.Data["row"] = i;

            // SkipError returning true swallows the error and drops this record.
            if (defOptions.SkipError == null || !defOptions.SkipError(e, line))
                throw new ParseCsvException(e);
        }

        if (t != null)
            yield return t;
    }
}
264267
}
@@ -271,18 +274,20 @@ public static T ReadLine<T>(string csvLine, CultureInfo? culture = null, CsvRead
271274

272275
var defCulture = GetDefaultCulture(culture);
273276

274-
Regex regex = GetRegex(defCulture, defOptions.RegexTimeout);
277+
Regex regex = GetRegex(isLine: false, defCulture, defOptions.RegexTimeout);
275278

276-
Match m = regex.Match(csvLine);
279+
var vme = regex.EnumerateMatches(csvLine);
277280

278281
var members = CsvMemberCache<T>.Members;
279282

280-
return ReadObject<T>(m,
283+
return ReadObject<T>(vme,
284+
csvLine.AsSpan(),
281285
members,
282286
members.Select(c => GetParser(defCulture, c, defOptions.ParserFactory)).ToList());
283287
}
284288

285-
private static Func<string, object?> GetParser<T>(CultureInfo culture, CsvMemberInfo<T> column, Func<CsvMemberInfo<T>, CultureInfo, Func<string, object?>?>? parserFactory)
289+
290+
private static ValueParser GetParser<T>(CultureInfo culture, CsvMemberInfo<T> column, Func<CsvMemberInfo<T>, CultureInfo, ValueParser?>? parserFactory)
286291
{
287292
if (parserFactory != null)
288293
{
@@ -294,43 +299,51 @@ public static T ReadLine<T>(string csvLine, CultureInfo? culture = null, CsvRead
294299

295300
var type = column.IsCollection ? column.MemberInfo.ReturningType().ElementType()! : column.MemberInfo.ReturningType();
296301

297-
return str => ConvertTo(str, type, culture, column.Format);
302+
return GetBasicParser(type.UnNullify(), culture, column.Format);
298303
}
299304

300-
static T ReadObject<T>(Match m, List<CsvMemberInfo<T>> members, List<Func<string, object?>> parsers)
301-
{
302-
var vals = m.Groups["val"].Captures;
303-
304-
if (vals.Count < members.Count)
305-
throw new FormatException("Only {0} columns found (instead of {1}) in line: {2}".FormatWith(vals.Count, members.Count, m.Value));
305+
/// <summary>
/// Converts the text of one CSV field into a value. A delegate is needed because
/// <see cref="ReadOnlySpan{T}"/> cannot be used as a type argument of <c>Func&lt;,&gt;</c>.
/// </summary>
/// <param name="str">The field text.</param>
/// <returns>The parsed value, or <c>null</c>.</returns>
public delegate object? ValueParser(ReadOnlySpan<char> str);
306306

307+
/// <summary>
/// Creates a <typeparamref name="T"/> and fills its members from the fields of one CSV record.
/// </summary>
/// <param name="vme">Enumerator over the field matches found in <paramref name="line"/>.</param>
/// <param name="line">The record text that the match indexes refer to.</param>
/// <param name="members">Target members, in column order. A collection member must be the last one.</param>
/// <param name="parsers">One parser per member, at the same index.</param>
/// <returns>The populated instance.</returns>
/// <exception cref="InvalidOperationException">A collection member is not the last member.</exception>
/// <exception cref="FormatException">The record has fewer fields than members and did not end in a collection.</exception>
static T ReadObject<T>(Regex.ValueMatchEnumerator vme, ReadOnlySpan<char> line, List<CsvMemberInfo<T>> members, List<ValueParser> parsers)
{
    T t = Activator.CreateInstance<T>();

    bool endsInCollection = false;
    int i = 0;

    // ValueMatchEnumerator is a struct and its GetEnumerator() returns a copy, so two
    // 'foreach' loops over 'vme' each start again from the first match. Driving the one
    // parameter instance with MoveNext() lets the collection branch continue from the
    // current field instead of re-reading the record from the beginning.
    while (vme.MoveNext())
    {
        // Surplus fields are ignored; nothing after this point can change the result.
        if (members.Count <= i)
            break;

        var current = vme.Current;
        var value = line.Slice(current.Index, current.Length);
        var member = members[i];
        var parser = parsers[i];
        try
        {
            if (!member.IsCollection)
            {
                value = DecodeCsv(value);

                object? val = parser(value);

                member.MemberEntry.Setter!(t, val);
            }
            else
            {
                if (i != members.Count - 1)
                    throw new InvalidOperationException($"Collection {member.MemberInfo} should be the last member");

                endsInCollection = true;
                var list = (IList)Activator.CreateInstance(member.MemberInfo.ReturningType())!;

                // The current field and every remaining one become elements of the list.
                // 'value' is kept up to date so the catch block reports the failing field.
                do
                {
                    current = vme.Current;
                    value = DecodeCsv(line.Slice(current.Index, current.Length));
                    list.Add(parser(value));
                }
                while (vme.MoveNext());

                // NOTE(review): this assignment sits in lines the diff view elided; it is
                // reconstructed by analogy with the scalar branch. Confirm against the full file.
                member.MemberEntry.Setter!(t, list);
            }
        }
        catch (Exception e)
        {
            e.Data["value"] = new String(value);
            e.Data["member"] = member.MemberInfo.Name;
            throw;
        }

        i++;
    }

    if (!endsInCollection && i != members.Count)
        throw new FormatException("Only {0} columns found (instead of {1}) in line: {2}".FormatWith(i, members.Count, new string(line)));

    return t;
}
349368

@@ -369,7 +388,7 @@ public static IEnumerable<string[]> ReadUntypedStream(Stream stream, Encoding? e
369388
var defCulture = GetDefaultCulture(culture);
370389
var defOptions = options ?? new CsvReadOptions();
371390

372-
Regex regex = GetRegex(defCulture, defOptions.RegexTimeout, defOptions.ListSeparator);
391+
Regex valueRegex = GetRegex(false, defCulture, defOptions.RegexTimeout, defOptions.ListSeparator);
373392
if (defOptions.AsumeSingleLine)
374393
{
375394
using (StreamReader sr = new StreamReader(stream, encoding))
@@ -386,7 +405,7 @@ public static IEnumerable<string[]> ReadUntypedStream(Stream stream, Encoding? e
386405
string[]? t = null;
387406
try
388407
{
389-
m = regex.Match(csvLine);
408+
m = valueRegex.Match(csvLine);
390409
if (m.Length > 0)
391410
{
392411
t = m.Groups["val"].Captures.Select(c => c.Value).ToArray();
@@ -396,7 +415,7 @@ public static IEnumerable<string[]> ReadUntypedStream(Stream stream, Encoding? e
396415
{
397416
e.Data["row"] = line;
398417

399-
if (defOptions.SkipError == null || !defOptions.SkipError(e, m))
418+
if (defOptions.SkipError == null || !defOptions.SkipError(e, csvLine))
400419
throw new ParseCsvException(e);
401420
}
402421

@@ -413,7 +432,7 @@ public static IEnumerable<string[]> ReadUntypedStream(Stream stream, Encoding? e
413432
{
414433
string str = sr.ReadToEnd();
415434

416-
var matches = regex.Matches(str).Cast<Match>();
435+
var matches = valueRegex.Matches(str).Cast<Match>();
417436

418437
int line = 0;
419438
foreach (var m in matches)
@@ -429,7 +448,7 @@ public static IEnumerable<string[]> ReadUntypedStream(Stream stream, Encoding? e
429448
{
430449
e.Data["row"] = line;
431450

432-
if (defOptions.SkipError == null || !defOptions.SkipError(e, m))
451+
if (defOptions.SkipError == null || !defOptions.SkipError(e, m.Value))
433452
throw new ParseCsvException(e);
434453
}
435454
if (t != null)
@@ -530,16 +549,17 @@ public class MyFileCSV
530549
""";
531550
}
532551

533-
534-
static ConcurrentDictionary<char, Regex> regexCache = new ConcurrentDictionary<char, Regex>();
535-
const string BaseRegex = @"^((?<val>'(?:[^']+|'')*'|[^;\r\n]*))?((?!($|\r\n));(?<val>'(?:[^']+|'')*'|[^;\r\n]*))*($|\r\n)";
536-
static Regex GetRegex(CultureInfo culture, TimeSpan timeout, char? listSeparator = null)
552+
// Compiled patterns, keyed by every input that influences the resulting Regex.
static ConcurrentDictionary<(bool multiLine, char separator, TimeSpan timeout), Regex> regexCache = new();

// One field: either a double-quoted run (with "" as an escaped quote) or a run of
// characters up to the next separator or line break. ';' is a placeholder for the
// separator and the single quotes are swapped for double quotes.
readonly static string ValueRegex = "'(?:[^']+|'')*'|[^;\r\n]*".Replace('\'', '"');

// One whole record: an optional first field, further separator-prefixed fields,
// then end of line.
readonly static string LineRegex = $@"^({ValueRegex})?((?!($|\r\n));({ValueRegex}))*($|\r\n)";

/// <summary>
/// Returns the cached regex for either whole records or single fields.
/// </summary>
/// <param name="isLine"><c>true</c> for the record pattern, <c>false</c> for the field pattern.</param>
/// <param name="culture">Used to obtain the separator when <paramref name="listSeparator"/> is null.</param>
/// <param name="timeout">Match timeout passed to the <see cref="Regex"/> constructor.</param>
/// <param name="listSeparator">Explicit separator; overrides the one from <paramref name="culture"/>.</param>
static Regex GetRegex(bool isLine, CultureInfo culture, TimeSpan timeout, char? listSeparator = null)
{
    char separator = listSeparator ?? GetListSeparator(culture);

    return regexCache.GetOrAdd((isLine, separator, timeout), static key =>
    {
        // The field pattern can match the empty string, so when matches are enumerated an
        // unanchored pattern yields an extra empty match after every non-empty field
        // ("a;b" gives "a", "", "b", ""). The lookbehind only lets a field start at the
        // beginning of the input or right after a separator.
        string template = key.multiLine ? LineRegex : $"(?<![^;])(?:{ValueRegex})";

        // Escape the separator so characters such as '|' or '.' are taken literally.
        string pattern = template.Replace(";", Regex.Escape(key.separator.ToString()));

        return new Regex(pattern, RegexOptions.Multiline | RegexOptions.ExplicitCapture, key.timeout);
    });
}
562+
543563

544564
private static char GetListSeparator(CultureInfo culture)
545565
{
@@ -570,62 +590,60 @@ static CsvMemberCache()
570590
public static List<CsvMemberInfo<T>> Members;
571591
}
572592

573-
static string DecodeCsv(string s)
593+
594+
595+
/// <summary>
/// Removes CSV quoting from a raw field.
/// </summary>
/// <remarks>
/// A field wrapped in double quotes has the outer quotes removed, doubled quotes
/// collapsed to one, and every line feed not preceded by a carriage return expanded
/// to CR LF. Any other field is returned unchanged.
/// </remarks>
/// <param name="s">The raw field text.</param>
/// <returns>The decoded text; a slice of <paramref name="s"/> when no rewriting is needed.</returns>
static ReadOnlySpan<char> DecodeCsv(ReadOnlySpan<char> s)
{
    // A lone '"' both starts and ends with a quote; at least two characters are
    // required for the inner slice to be valid.
    if (s.Length < 2 || s[0] != '"' || s[^1] != '"')
        return s;

    ReadOnlySpan<char> inner = s[1..^1];

    // Nothing to unescape or normalize: return the slice without allocating.
    if (inner.IndexOfAny('"', '\n') < 0)
        return inner;

    string str = new string(inner).Replace("\"\"", "\"");

    return Regex.Replace(str, "(?<!\r)\n", "\r\n");
}
584606

585-
static object? ConvertTo(string s, Type type, CultureInfo culture, string? format)
607+
/// <summary>
/// Builds the default parser for a member type. The type test runs once, when the
/// parser is created, not once per field.
/// </summary>
/// <remarks>
/// For every explicitly listed type an empty field parses to <c>null</c>. Any other
/// type goes through <see cref="Convert.ChangeType(object, Type, IFormatProvider)"/>.
/// </remarks>
/// <param name="type">Target type; the visible caller passes it already un-nullified.</param>
/// <param name="culture">Culture used for numbers and dates.</param>
/// <param name="format">
/// Exact format for date and time types. When null the culture-aware <c>Parse</c> is
/// used instead, because <c>ParseExact</c> throws for a null or empty format.
/// </param>
/// <returns>A parser for <paramref name="type"/>.</returns>
static ValueParser GetBasicParser(Type type, CultureInfo culture, string? format)
{
    return type switch
    {
        _ when type == typeof(string) => str => str.Length == 0 ? null : str.ToString(),
        _ when type == typeof(byte) => str => str.Length == 0 ? null : byte.Parse(str, NumberStyles.Integer, culture),
        _ when type == typeof(sbyte) => str => str.Length == 0 ? null : sbyte.Parse(str, NumberStyles.Integer, culture),
        _ when type == typeof(short) => str => str.Length == 0 ? null : short.Parse(str, NumberStyles.Integer, culture),
        _ when type == typeof(ushort) => str => str.Length == 0 ? null : ushort.Parse(str, NumberStyles.Integer, culture),
        _ when type == typeof(int) => str => str.Length == 0 ? null : int.Parse(str, NumberStyles.Integer, culture),
        _ when type == typeof(uint) => str => str.Length == 0 ? null : uint.Parse(str, NumberStyles.Integer, culture),
        _ when type == typeof(long) => str => str.Length == 0 ? null : long.Parse(str, NumberStyles.Integer, culture),
        _ when type == typeof(ulong) => str => str.Length == 0 ? null : ulong.Parse(str, NumberStyles.Integer, culture),
        _ when type == typeof(float) => str => str.Length == 0 ? null : float.Parse(str, NumberStyles.Float, culture),
        _ when type == typeof(double) => str => str.Length == 0 ? null : double.Parse(str, NumberStyles.Float, culture),
        _ when type == typeof(decimal) => str => str.Length == 0 ? null : decimal.Parse(str, NumberStyles.Number, culture),

        // Date and time types: use the exact format only when one was supplied.
        _ when type == typeof(DateTime) => str => str.Length == 0 ? null :
            format == null ? DateTime.Parse(str, culture) : DateTime.ParseExact(str, format, culture),
        _ when type == typeof(DateTimeOffset) => str => str.Length == 0 ? null :
            format == null ? DateTimeOffset.Parse(str, culture) : DateTimeOffset.ParseExact(str, format, culture),
        _ when type == typeof(DateOnly) => str => str.Length == 0 ? null :
            format == null ? DateOnly.Parse(str, culture) : DateOnly.ParseExact(str, format, culture),
        _ when type == typeof(TimeOnly) => str => str.Length == 0 ? null :
            format == null ? TimeOnly.Parse(str, culture) : TimeOnly.ParseExact(str, format, culture),

        // Guid.Parse has a span overload, so no intermediate string is needed.
        _ when type == typeof(Guid) => str => str.Length == 0 ? null : Guid.Parse(str),
        _ when type.IsEnum => str => str.Length == 0 ? null : Enum.Parse(type, str),
        _ => str => Convert.ChangeType(new string(str), type, culture)
    };
}
616632
}
617633

618634
/// <summary>
/// Read options for the typed CSV readers, adding per-type hooks to <see cref="CsvReadOptions"/>.
/// </summary>
/// <typeparam name="T">The row type being read.</typeparam>
public class CsvReadOptions<T> : CsvReadOptions where T : class
{
    /// <summary>
    /// Optional factory that is handed each member and the culture when its parser is built.
    /// </summary>
    public Func<CsvMemberInfo<T>, CultureInfo, Csv.ValueParser?>? ParserFactory;

    /// <summary>
    /// Optional callback that builds a row directly from the text of a record.
    /// NOTE(review): in the code visible here it is only consulted by the multi-line
    /// branch of ReadStream - confirm whether the single-line branch should honor it too.
    /// </summary>
    public CsvConstructor<T>? Constructor;
}
624640

641+
/// <summary>
/// Builds a <typeparamref name="T"/> from the text of one CSV record.
/// </summary>
/// <param name="line">The record text.</param>
public delegate T CsvConstructor<T>(ReadOnlySpan<char> line);
642+
625643
/// <summary>
/// Options shared by the CSV readers.
/// </summary>
public class CsvReadOptions
{
    /// <summary>
    /// When true the readers take the single-line branch instead of loading the whole
    /// stream and splitting it with the line regex. Defaults to true.
    /// </summary>
    public bool AsumeSingleLine = true; //Breaking change!

    /// <summary>
    /// Called with the exception and the text of the failing record. Returning true
    /// skips the record; otherwise the error is rethrown as a ParseCsvException.
    /// </summary>
    public Func<Exception, string, bool>? SkipError;

    /// <summary>
    /// Match timeout handed to the regexes; infinite by default.
    /// </summary>
    public TimeSpan RegexTimeout = Regex.InfiniteMatchTimeout;

    /// <summary>
    /// Field separator. When null the separator is derived from the culture.
    /// </summary>
    public char? ListSeparator;
}

0 commit comments

Comments
 (0)