Skip to content

Commit 81d0ba5

Browse files
authored
Add WriteCsv plus unit tests. (dotnet#2947)
* Add WriteCsv plus unit tests. * Add CultureInfo to WriteCsv. Remove index column param. Update unit tests. * Add CR changes. CultureInfo. Separator. * Format decimal types individually. Fix culture info. Fix unit tests. * Format decimal types individually. Fix culture info. Fix unit tests.
1 parent 4e6d801 commit 81d0ba5

File tree

2 files changed

+274
-0
lines changed

2 files changed

+274
-0
lines changed

src/Microsoft.Data.Analysis/DataFrame.IO.cs

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44

55
using System;
66
using System.Collections.Generic;
7+
using System.Globalization;
78
using System.IO;
9+
using System.Linq;
810
using System.Text;
11+
using Microsoft.ML;
912

1013
namespace Microsoft.Data.Analysis
1114
{
@@ -306,5 +309,119 @@ public static DataFrame LoadCsv(Stream csvStream,
306309
return ret;
307310
}
308311
}
312+
313+
/// <summary>
/// Writes a DataFrame to a CSV file, creating the file or overwriting an existing one.
/// </summary>
/// <param name="dataFrame">The <see cref="DataFrame"/> to write</param>
/// <param name="path">path of the CSV file to create</param>
/// <param name="separator">column separator</param>
/// <param name="header">whether to write a header line</param>
/// <param name="encoding">the character encoding; forwarded unchanged (including null) to the stream-based overload</param>
/// <param name="cultureInfo">culture info for formatting values; forwarded unchanged (including null) to the stream-based overload</param>
public static void WriteCsv(DataFrame dataFrame, string path,
    char separator = ',', bool header = true,
    Encoding encoding = null, CultureInfo cultureInfo = null)
{
    // FileMode.Create makes a new file or truncates an existing one.
    FileStream output = new FileStream(path, FileMode.Create);
    try
    {
        // All formatting is delegated to the Stream overload.
        WriteCsv(dataFrame, output, separator, header, encoding, cultureInfo);
    }
    finally
    {
        output.Dispose();
    }
}
333+
334+
/// <summary>
/// Writes a DataFrame to a stream as CSV text.
/// </summary>
/// <remarks>
/// Column names and cell values are written verbatim: fields are not quoted or escaped.
/// Null cells are written as empty fields. Nothing is written when <paramref name="dataFrame"/> is null.
/// The stream is left open when this method returns.
/// </remarks>
/// <param name="dataFrame">The <see cref="DataFrame"/> to write</param>
/// <param name="csvStream">stream the CSV data is written to</param>
/// <param name="separator">column separator</param>
/// <param name="header">whether to write the column names as the first line</param>
/// <param name="encoding">the character encoding. Defaults to UTF8 (without a byte order mark) if not specified</param>
/// <param name="cultureInfo">culture info for formatting values. Defaults to the current culture if not specified</param>
/// <exception cref="ArgumentException">
/// The number decimal separator of <paramref name="cultureInfo"/> is the same as <paramref name="separator"/>.
/// </exception>
public static void WriteCsv(DataFrame dataFrame, Stream csvStream,
    char separator = ',', bool header = true,
    Encoding encoding = null, CultureInfo cultureInfo = null)
{
    if (cultureInfo is null)
    {
        cultureInfo = CultureInfo.CurrentCulture;
    }

    string separatorText = separator.ToString();

    // A decimal separator equal to the column separator would split numbers across columns.
    if (cultureInfo.NumberFormat.NumberDecimalSeparator.Equals(separatorText))
    {
        throw new ArgumentException("Decimal separator cannot match the column separator");
    }

    if (encoding is null)
    {
        // UTF-8 without a BOM: byte-identical to ASCII for ASCII-only content, but
        // non-ASCII characters are preserved instead of being replaced with '?'.
        encoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false);
    }

    using (StreamWriter csvFile = new StreamWriter(csvStream, encoding, bufferSize: DefaultStreamReaderBufferSize, leaveOpen: true))
    {
        if (dataFrame != null)
        {
            if (header)
            {
                var columnNames = dataFrame.Columns.GetColumnNames();
                csvFile.WriteLine(string.Join(separatorText, columnNames));
            }

            // One builder is reused for every row to avoid per-row allocations.
            var record = new StringBuilder();

            foreach (var row in dataFrame.Rows)
            {
                bool firstCell = true;
                foreach (var cell in row)
                {
                    if (firstCell)
                    {
                        firstCell = false;
                    }
                    else
                    {
                        record.Append(separator);
                    }

                    // Floating-point and decimal values get an explicit precision specifier;
                    // every other value uses its default format.
                    string format;
                    if (cell is float)
                    {
                        format = "{0:G9}";
                    }
                    else if (cell is double)
                    {
                        format = "{0:G17}";
                    }
                    else if (cell is decimal)
                    {
                        format = "{0:G31}";
                    }
                    else
                    {
                        format = "{0}";
                    }

                    // Formatting every cell through cultureInfo keeps all IFormattable values
                    // (not only float/double/decimal) consistent with the requested culture.
                    // A null cell appends nothing.
                    record.AppendFormat(cultureInfo, format, cell);
                }

                csvFile.WriteLine(record);

                record.Clear();
            }
        }
    }
}
309426
}
310427
}

tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System;
6+
using System.Globalization;
67
using System.IO;
8+
using System.Linq;
79
using System.Text;
810
using Apache.Arrow;
911
using Xunit;
@@ -603,5 +605,160 @@ Stream GetStream(string streamData)
603605
Assert.Null(df[2, 2]);
604606
Assert.Null(df[5, 3]);
605607
}
608+
609+
[Fact]
public void TestWriteCsvWithHeader()
{
    // Round trip: write with default options (header included), then load the result back.
    DataFrame original = MakeDataFrameWithAllColumnTypes(10, true);

    using (var buffer = new MemoryStream())
    {
        DataFrame.WriteCsv(original, buffer);

        // Rewind so LoadCsv reads what was just written.
        buffer.Position = 0;
        DataFrame roundTripped = DataFrame.LoadCsv(buffer);

        Assert.Equal(original.Rows.Count, roundTripped.Rows.Count);
        Assert.Equal(original.Columns.Count, roundTripped.Columns.Count);

        // Row 1 reads back as 1 in each of the columns 0 through 10.
        for (int column = 0; column <= 10; column++)
        {
            Assert.Equal(1F, roundTripped[1, column]);
        }
    }
}
634+
635+
[Fact]
public void TestWriteCsvWithCultureInfoRomanianAndSemiColon()
{
    const char columnSeparator = ';';
    CultureInfo romanian = new CultureInfo("ro-RO");

    DataFrame original = MakeDataFrameWithNumericColumns(10, true);
    original[1, 1] = 1.1M;
    original[1, 2] = 1.2D;
    original[1, 3] = 1.3F;

    using (var buffer = new MemoryStream())
    {
        DataFrame.WriteCsv(original, buffer, separator: columnSeparator, cultureInfo: romanian);

        // Rewind so LoadCsv reads what was just written.
        buffer.Position = 0;
        DataFrame roundTripped = DataFrame.LoadCsv(buffer, separator: columnSeparator);

        Assert.Equal(original.Rows.Count, roundTripped.Rows.Count);
        Assert.Equal(original.Columns.Count, roundTripped.Columns.Count);
        Assert.Equal(1F, roundTripped[1, 0]);

        // LoadCsv does not support culture info, therefore the decimal comma (,) is seen as a
        // thousands separator and is ignored when read.
        Assert.Equal(11F, roundTripped[1, 1]);
        Assert.Equal(12F, roundTripped[1, 2]);
        Assert.Equal(129999992F, roundTripped[1, 3]);

        // The untouched columns 4 through 10 still read back as 1.
        for (int column = 4; column <= 10; column++)
        {
            Assert.Equal(1F, roundTripped[1, column]);
        }
    }
}
668+
669+
[Fact]
public void TestWriteCsvWithCultureInfo()
{
    CultureInfo english = new CultureInfo("en-US");

    DataFrame original = MakeDataFrameWithNumericColumns(10, true);
    original[1, 1] = 1.1M;
    original[1, 2] = 1.2D;
    original[1, 3] = 1.3F;

    using (var buffer = new MemoryStream())
    {
        DataFrame.WriteCsv(original, buffer, cultureInfo: english);

        // Rewind so LoadCsv reads what was just written.
        buffer.Position = 0;
        DataFrame roundTripped = DataFrame.LoadCsv(buffer);

        Assert.Equal(original.Rows.Count, roundTripped.Rows.Count);
        Assert.Equal(original.Columns.Count, roundTripped.Columns.Count);
        Assert.Equal(1F, roundTripped[1, 0]);

        // The decimal, double and float cells set above read back as floats with the same values.
        Assert.Equal(1.1F, roundTripped[1, 1]);
        Assert.Equal(1.2F, roundTripped[1, 2]);
        Assert.Equal(1.3F, roundTripped[1, 3]);

        // The untouched columns 4 through 10 still read back as 1.
        for (int column = 4; column <= 10; column++)
        {
            Assert.Equal(1F, roundTripped[1, column]);
        }
    }
}
698+
699+
[Fact]
public void TestWriteCsvWithCultureInfoRomanianAndComma()
{
    CultureInfo romanian = new CultureInfo("ro-RO");

    // Use the culture's own decimal separator as the column separator to trigger the conflict.
    char conflictingSeparator = romanian.NumberFormat.NumberDecimalSeparator.First();

    DataFrame original = MakeDataFrameWithNumericColumns(10, true);

    using (var buffer = new MemoryStream())
    {
        Action write = () => DataFrame.WriteCsv(original, buffer, separator: conflictingSeparator, cultureInfo: romanian);

        Assert.Throws<ArgumentException>(write);
    }
}
710+
711+
[Fact]
public void TestWriteCsvWithNoHeader()
{
    // Round trip without a header line on both the write and the read side.
    DataFrame original = MakeDataFrameWithAllColumnTypes(10, true);

    using (var buffer = new MemoryStream())
    {
        DataFrame.WriteCsv(original, buffer, header: false);

        // Rewind so LoadCsv reads what was just written.
        buffer.Position = 0;
        DataFrame roundTripped = DataFrame.LoadCsv(buffer, header: false);

        Assert.Equal(original.Rows.Count, roundTripped.Rows.Count);
        Assert.Equal(original.Columns.Count, roundTripped.Columns.Count);

        // Row 1 reads back as 1 in each of the columns 0 through 10.
        for (int column = 0; column <= 10; column++)
        {
            Assert.Equal(1F, roundTripped[1, column]);
        }
    }
}
736+
737+
[Fact]
public void TestWriteCsvWithSemicolonSeparator()
{
    const char columnSeparator = ';';

    DataFrame original = MakeDataFrameWithAllColumnTypes(10, true);

    using (var buffer = new MemoryStream())
    {
        DataFrame.WriteCsv(original, buffer, separator: columnSeparator);

        // Rewind so LoadCsv reads what was just written.
        buffer.Position = 0;
        DataFrame roundTripped = DataFrame.LoadCsv(buffer, separator: columnSeparator);

        Assert.Equal(original.Rows.Count, roundTripped.Rows.Count);
        Assert.Equal(original.Columns.Count, roundTripped.Columns.Count);

        // Row 1 reads back as 1 in each of the columns 0 through 10.
        for (int column = 0; column <= 10; column++)
        {
            Assert.Equal(1F, roundTripped[1, column]);
        }
    }
}
606763
}
607764
}

0 commit comments

Comments
 (0)