From 7cee9d94b98cf0163eb1de39037f740088c9af30 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Thu, 5 Dec 2019 13:42:27 -0800 Subject: [PATCH] Creating a `Rows` property, similar to `Columns` (#2794) * Rows collection, similar to Columns * Doc * Some minor clean up * Make DataFrameRow a view into the DataFrame * sq * Address feedback * Remove DataFrame.RowCount * More row count changes * sq * Address feedback * Merge upstream --- .../DataFrame.Arrow.cs | 4 +- .../DataFrame.IDataView.cs | 4 +- src/Microsoft.Data.Analysis/DataFrame.Join.cs | 24 +-- src/Microsoft.Data.Analysis/DataFrame.cs | 36 ++-- .../DataFrameColumnCollection.cs | 10 - src/Microsoft.Data.Analysis/DataFrameRow.cs | 70 +++++++ .../DataFrameRowCollection.cs | 56 +++++ .../ArrowIntegrationTests.cs | 2 +- .../DataFrame.IOTests.cs | 12 +- .../DataFrameTests.IDataView.cs | 2 +- .../DataFrameTests.cs | 196 +++++++++++------- 11 files changed, 291 insertions(+), 125 deletions(-) create mode 100644 src/Microsoft.Data.Analysis/DataFrameRow.cs create mode 100644 src/Microsoft.Data.Analysis/DataFrameRowCollection.cs diff --git a/src/Microsoft.Data.Analysis/DataFrame.Arrow.cs b/src/Microsoft.Data.Analysis/DataFrame.Arrow.cs index dda54b6cea..396f778a73 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.Arrow.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.Arrow.cs @@ -148,7 +148,7 @@ public IEnumerable ToArrowRecordBatches() List arrays = new List(); int recordBatchLength = Int32.MaxValue; - int numberOfRowsInThisRecordBatch = (int)Math.Min(recordBatchLength, RowCount); + int numberOfRowsInThisRecordBatch = (int)Math.Min(recordBatchLength, Rows.Count); long numberOfRowsProcessed = 0; // Sometimes .NET for Spark passes in DataFrames with no rows. In those cases, we just return a RecordBatch with the right Schema and no rows @@ -166,7 +166,7 @@ public IEnumerable ToArrowRecordBatches() } numberOfRowsProcessed += numberOfRowsInThisRecordBatch; yield return new RecordBatch(schema, arrays, numberOfRowsInThisRecordBatch); - } while (numberOfRowsProcessed < RowCount); + } while (numberOfRowsProcessed < Rows.Count); } } diff --git a/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs b/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs index e7ee8fc572..4755f296f4 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs @@ -38,7 +38,7 @@ private DataViewSchema DataViewSchema DataViewSchema IDataView.Schema => DataViewSchema; - long? IDataView.GetRowCount() => RowCount; + long? IDataView.GetRowCount() => Rows.Count; private DataViewRowCursor GetRowCursorCore(IEnumerable columnsNeeded) { @@ -133,7 +133,7 @@ public override bool MoveNext() if (_disposed) return false; _position++; - return _position < _dataFrame.RowCount; + return _position < _dataFrame.Rows.Count; } } } diff --git a/src/Microsoft.Data.Analysis/DataFrame.Join.cs b/src/Microsoft.Data.Analysis/DataFrame.Join.cs index 609eac102c..dbb0998428 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.Join.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.Join.cs @@ -44,7 +44,7 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right DataFrameColumn newColumn = Columns[i].Clone(); ret.Columns.Insert(ret.Columns.Count, newColumn); } - long minLength = Math.Min(RowCount, other.RowCount); + long minLength = Math.Min(Rows.Count, other.Rows.Count); PrimitiveDataFrameColumn mapIndices = new PrimitiveDataFrameColumn("mapIndices", minLength); for (long i = 0; i < minLength; i++) { @@ -53,9 +53,9 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right for (int i = 0; i < other.Columns.Count; i++) { DataFrameColumn newColumn; - if (other.RowCount < RowCount) + if (other.Rows.Count < Rows.Count) { - newColumn = other.Columns[i].Clone(numberOfNullsToAppend: RowCount - other.RowCount); + newColumn = other.Columns[i].Clone(numberOfNullsToAppend: Rows.Count - other.Rows.Count); } else { @@ -67,7 +67,7 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right } else if (joinAlgorithm == JoinAlgorithm.Right) { - long minLength = Math.Min(RowCount, other.RowCount); + long minLength = Math.Min(Rows.Count, other.Rows.Count); PrimitiveDataFrameColumn mapIndices = new PrimitiveDataFrameColumn("mapIndices", minLength); for (long i = 0; i < minLength; i++) { @@ -76,9 +76,9 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right for (int i = 0; i < Columns.Count; i++) { DataFrameColumn newColumn; - if (RowCount < other.RowCount) + if (Rows.Count < other.Rows.Count) { - newColumn = Columns[i].Clone(numberOfNullsToAppend: other.RowCount - RowCount); + newColumn = Columns[i].Clone(numberOfNullsToAppend: other.Rows.Count - Rows.Count); } else { @@ -95,14 +95,14 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right } else if (joinAlgorithm == JoinAlgorithm.FullOuter) { - long newRowCount = Math.Max(RowCount, other.RowCount); - long numberOfNulls = newRowCount - RowCount; + long newRowCount = Math.Max(Rows.Count, other.Rows.Count); + long numberOfNulls = newRowCount - Rows.Count; for (int i = 0; i < Columns.Count; i++) { DataFrameColumn newColumn = Columns[i].Clone(numberOfNullsToAppend: numberOfNulls); ret.Columns.Insert(ret.Columns.Count, newColumn); } - numberOfNulls = newRowCount - other.RowCount; + numberOfNulls = newRowCount - other.Rows.Count; for (int i = 0; i < other.Columns.Count; i++) { DataFrameColumn newColumn = other.Columns[i].Clone(numberOfNullsToAppend: numberOfNulls); @@ -112,7 +112,7 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right } else if (joinAlgorithm == JoinAlgorithm.Inner) { - long newRowCount = Math.Min(RowCount, other.RowCount); + long newRowCount = Math.Min(Rows.Count, other.Rows.Count); PrimitiveDataFrameColumn mapIndices = new PrimitiveDataFrameColumn("mapIndices", newRowCount); for (long i = 0; i < newRowCount; i++) { @@ -242,8 +242,8 @@ public DataFrame Merge(DataFrame other, string leftJoinColumn, string righ else if (joinAlgorithm == JoinAlgorithm.Inner) { // Hash the column with the smaller RowCount - long leftRowCount = RowCount; - long rightRowCount = other.RowCount; + long leftRowCount = Rows.Count; + long rightRowCount = other.Rows.Count; DataFrame longerDataFrame = leftRowCount <= rightRowCount ? other : this; DataFrame shorterDataFrame = ReferenceEquals(longerDataFrame, this) ? other : this; DataFrameColumn hashColumn = (leftRowCount <= rightRowCount) ? this[leftJoinColumn] : other[rightJoinColumn]; diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index a68e6e502c..6bf50f46c9 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -32,20 +32,26 @@ public enum DropNullOptions public partial class DataFrame { private readonly DataFrameColumnCollection _columnCollection; + private readonly DataFrameRowCollection _rowCollection; public DataFrame() { _columnCollection = new DataFrameColumnCollection(OnColumnsChanged); + _rowCollection = new DataFrameRowCollection(this); } public DataFrame(IList columns) { _columnCollection = new DataFrameColumnCollection(columns, OnColumnsChanged); + _rowCollection = new DataFrameRowCollection(this); } - public long RowCount => _columnCollection.RowCount; - public DataFrameColumnCollection Columns => _columnCollection; + /// + /// Returns a that contains a view of the rows in this + /// + public DataFrameRowCollection Rows => _rowCollection; + internal IReadOnlyList GetColumnNames() => _columnCollection.GetColumnNames(); #region Operators @@ -55,15 +61,6 @@ public DataFrame(IList columns) set => _columnCollection[columnIndex][rowIndex] = value; } - public IList this[long rowIndex] - { - get - { - return _columnCollection.GetRow(rowIndex); - } - //TODO?: set? - } - /// /// Returns a new DataFrame using the boolean values in /// @@ -177,9 +174,9 @@ public DataFrame Head(int numberOfRows) public DataFrame Tail(int numberOfRows) { PrimitiveDataFrameColumn filter = new PrimitiveDataFrameColumn("Filter", numberOfRows); - for (long i = RowCount - numberOfRows; i < RowCount; i++) + for (long i = Rows.Count - numberOfRows; i < Rows.Count; i++) { - filter[i - (RowCount - numberOfRows)] = i; + filter[i - (Rows.Count - numberOfRows)] = i; } return Clone(filter); } @@ -328,7 +325,7 @@ public DataFrame Sample(int numberOfRows) { Random rand = new Random(); PrimitiveDataFrameColumn indices = new PrimitiveDataFrameColumn("Indices", numberOfRows); - int randMaxValue = (int)Math.Min(Int32.MaxValue, RowCount); + int randMaxValue = (int)Math.Min(Int32.MaxValue, Rows.Count); for (long i = 0; i < numberOfRows; i++) { indices[i] = rand.Next(randMaxValue); @@ -369,7 +366,7 @@ public DataFrame DropNulls(DropNullOptions options = DropNullOptions.Any) PrimitiveDataFrameColumn filter = new PrimitiveDataFrameColumn("Filter"); if (options == DropNullOptions.Any) { - filter.AppendMany(true, RowCount); + filter.AppendMany(true, Rows.Count); for (int i = 0; i < Columns.Count; i++) { @@ -382,7 +379,7 @@ public DataFrame DropNulls(DropNullOptions options = DropNullOptions.Any) } else { - filter.AppendMany(false, RowCount); + filter.AppendMany(false, Rows.Count); for (int i = 0; i < Columns.Count; i++) { DataFrameColumn column = Columns[i]; @@ -540,7 +537,7 @@ public void Append(IEnumerable> row) foreach (DataFrameColumn column in Columns) { - if (column.Length == RowCount) + if (column.Length == Rows.Count) { ResizeByOneAndAppend(column, null); } @@ -570,11 +567,10 @@ public override string ToString() sb.Append(string.Format(Columns[i].Name.PadRight(longestColumnName))); } sb.AppendLine(); - long numberOfRows = Math.Min(RowCount, 25); + long numberOfRows = Math.Min(Rows.Count, 25); for (int i = 0; i < numberOfRows; i++) { - IList row = this[i]; - foreach (object obj in row) + foreach (object obj in Rows[i]) { sb.Append((obj ?? "null").ToString().PadRight(longestColumnName)); } diff --git a/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs b/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs index 9454b675f9..68861f2f43 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs @@ -46,16 +46,6 @@ internal IReadOnlyList GetColumnNames() return ret; } - internal IList GetRow(long rowIndex) - { - var ret = new List(); - for (int i = 0; i < Count; i++) - { - ret.Add(this[i][rowIndex]); - } - return ret; - } - public void SetColumnName(DataFrameColumn column, string newName) { string currentName = column.Name; diff --git a/src/Microsoft.Data.Analysis/DataFrameRow.cs b/src/Microsoft.Data.Analysis/DataFrameRow.cs new file mode 100644 index 0000000000..812fd69721 --- /dev/null +++ b/src/Microsoft.Data.Analysis/DataFrameRow.cs @@ -0,0 +1,70 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections; +using System.Collections.Generic; +using System.Diagnostics; +using System.Text; + +namespace Microsoft.Data.Analysis +{ + /// + /// A DataFrameRow is a collection of values that represent a row in a . + /// + public class DataFrameRow : IEnumerable + { + private readonly DataFrame _dataFrame; + private readonly long _rowIndex; + internal DataFrameRow(DataFrame df, long rowIndex) + { + Debug.Assert(rowIndex < df.Columns.RowCount); + _dataFrame = df; + _rowIndex = rowIndex; + } + + /// + /// Returns an enumerator of the values in this row. + /// + public IEnumerator GetEnumerator() + { + foreach (DataFrameColumn column in _dataFrame.Columns) + { + yield return column[_rowIndex]; + } + } + + /// + /// An indexer to return the value at . + /// + /// The index of the value to return + /// The value at this . + public object this[int index] + { + get + { + return _dataFrame.Columns[index][_rowIndex]; + } + set + { + _dataFrame.Columns[index][_rowIndex] = value; + } + } + + /// + /// A simple string representation of the values in this row + /// + public override string ToString() + { + StringBuilder sb = new StringBuilder(); + foreach (object value in this) + { + sb.Append(value?.ToString() ?? "null").Append(" "); + } + return sb.ToString(); + } + + IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); + } +} diff --git a/src/Microsoft.Data.Analysis/DataFrameRowCollection.cs b/src/Microsoft.Data.Analysis/DataFrameRowCollection.cs new file mode 100644 index 0000000000..85af70f76e --- /dev/null +++ b/src/Microsoft.Data.Analysis/DataFrameRowCollection.cs @@ -0,0 +1,56 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections; +using System.Collections.Generic; + +namespace Microsoft.Data.Analysis +{ + /// + /// Represents the rows of a + /// + public class DataFrameRowCollection : IEnumerable + { + private readonly DataFrame _dataFrame; + + /// + /// Initializes a . + /// + internal DataFrameRowCollection(DataFrame dataFrame) + { + _dataFrame = dataFrame ?? throw new ArgumentNullException(nameof(dataFrame)); + } + + /// + /// An indexer to return the at + /// + /// The row index + public DataFrameRow this[long index] + { + get + { + return new DataFrameRow(_dataFrame, index); + } + } + + /// + /// Returns an enumerator of objects + /// + public IEnumerator GetEnumerator() + { + for (long i = 0; i < Count; i++) + { + yield return new DataFrameRow(_dataFrame, i); + } + } + + /// + /// The number of rows in this . + /// + public long Count => _dataFrame.Columns.RowCount; + + IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); + } +} diff --git a/tests/Microsoft.Data.Analysis.Tests/ArrowIntegrationTests.cs b/tests/Microsoft.Data.Analysis.Tests/ArrowIntegrationTests.cs index 550c3a7254..d7b59accf2 100644 --- a/tests/Microsoft.Data.Analysis.Tests/ArrowIntegrationTests.cs +++ b/tests/Microsoft.Data.Analysis.Tests/ArrowIntegrationTests.cs @@ -134,7 +134,7 @@ public void TestEmptyArrowColumns() RecordBatch batch1 = new RecordBatch.Builder() .Append("EmptyDataAndNullColumns", false, col => col.Int32(array => array.Clear())).Build(); DataFrame emptyDataFrame = DataFrame.FromArrowRecordBatch(batch1); - Assert.Equal(0, emptyDataFrame.RowCount); + Assert.Equal(0, emptyDataFrame.Rows.Count); Assert.Equal(0, emptyDataFrame["EmptyDataAndNullColumns"].Length); Assert.Equal(0, emptyDataFrame["EmptyDataAndNullColumns"].NullCount); } diff --git a/tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 1284d050d4..0236f64537 100644 --- a/tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -25,12 +25,12 @@ Stream GetStream(string streamData) return new MemoryStream(Encoding.Default.GetBytes(streamData)); } DataFrame df = DataFrame.LoadCsv(GetStream(data)); - Assert.Equal(4, df.RowCount); + Assert.Equal(4, df.Rows.Count); Assert.Equal(7, df.Columns.Count); Assert.Equal("CMT", df["vendor_id"][3]); DataFrame reducedRows = DataFrame.LoadCsv(GetStream(data), numberOfRowsToRead: 3); - Assert.Equal(3, reducedRows.RowCount); + Assert.Equal(3, reducedRows.Rows.Count); Assert.Equal(7, reducedRows.Columns.Count); Assert.Equal("CMT", reducedRows["vendor_id"][2]); } @@ -48,12 +48,12 @@ Stream GetStream(string streamData) return new MemoryStream(Encoding.Default.GetBytes(streamData)); } DataFrame df = DataFrame.LoadCsv(GetStream(data), header: false); - Assert.Equal(4, df.RowCount); + Assert.Equal(4, df.Rows.Count); Assert.Equal(7, df.Columns.Count); Assert.Equal("CMT", df["Column0"][3]); DataFrame reducedRows = DataFrame.LoadCsv(GetStream(data), header: false, numberOfRowsToRead: 3); - Assert.Equal(3, reducedRows.RowCount); + Assert.Equal(3, reducedRows.Rows.Count); Assert.Equal(7, reducedRows.Columns.Count); Assert.Equal("CMT", reducedRows["Column0"][2]); } @@ -73,7 +73,7 @@ Stream GetStream(string streamData) return new MemoryStream(Encoding.Default.GetBytes(streamData)); } DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(short), typeof(int), typeof(long), typeof(float), typeof(string), typeof(double) }); - Assert.Equal(5, df.RowCount); + Assert.Equal(5, df.Rows.Count); Assert.Equal(7, df.Columns.Count); Assert.True(typeof(string) == df.Columns[0].DataType); @@ -95,7 +95,7 @@ Stream GetStream(string streamData) Assert.Equal(0, column.NullCount); } } - var nullRow = df[3]; + var nullRow = df.Rows[3]; Assert.Equal("", nullRow[0]); Assert.Null(nullRow[1]); Assert.Null(nullRow[2]); diff --git a/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs b/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs index 57b530607f..459c4a47ce 100644 --- a/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs +++ b/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs @@ -96,7 +96,7 @@ public void TestIDataViewSchemaInvalidate() schema = dataView.Schema; Assert.Equal(13, schema.Count); - DataFrameColumn boolColumn = new PrimitiveDataFrameColumn("Bool", Enumerable.Range(0, (int)df.RowCount).Select(x => x % 2 == 1)); + DataFrameColumn boolColumn = new PrimitiveDataFrameColumn("Bool", Enumerable.Range(0, (int)df.Rows.Count).Select(x => x % 2 == 1)); df.Columns.Insert(0, boolColumn); schema = dataView.Schema; Assert.Equal(14, schema.Count); diff --git a/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index 5c0fcd19ff..f32983acdd 100644 --- a/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -167,9 +167,9 @@ public static DataFrame MakeDataFrame(int length, bool withNulls = true) public DataFrame SplitTrainTest(DataFrame input, float testRatio, out DataFrame Test) { - IEnumerable randomIndices = Enumerable.Range(0, (int)input.RowCount); - IEnumerable trainIndices = randomIndices.Take((int)(input.RowCount * testRatio)); - IEnumerable testIndices = randomIndices.TakeLast((int)(input.RowCount * (1 - testRatio))); + IEnumerable randomIndices = Enumerable.Range(0, (int)input.Rows.Count); + IEnumerable trainIndices = randomIndices.Take((int)(input.Rows.Count * testRatio)); + IEnumerable testIndices = randomIndices.TakeLast((int)(input.Rows.Count * (1 - testRatio))); Test = input[testIndices]; return input[trainIndices]; } @@ -186,15 +186,15 @@ public void TestIndexer() Assert.Equal("Int1", dataFrame.Columns[0].Name); var headList = dataFrame.Head(5); - Assert.Equal(14, (int)headList[4][1]); + Assert.Equal(14, (int)headList.Rows[4][1]); var tailList = dataFrame.Tail(5); - Assert.Equal(19, (int)tailList[4][1]); + Assert.Equal(19, (int)tailList.Rows[4][1]); dataFrame[2, 1] = 1000; Assert.Equal(1000, dataFrame[2, 1]); - var row = dataFrame[4]; + var row = dataFrame.Rows[4]; Assert.Equal(14, (int)row[1]); var column = dataFrame["Int2"] as PrimitiveDataFrameColumn; @@ -211,7 +211,7 @@ public void ColumnAndTableCreationTest() DataFrame dataFrame = new DataFrame(); dataFrame.Columns.Insert(0, intColumn); dataFrame.Columns.Insert(1, floatColumn); - Assert.Equal(10, dataFrame.RowCount); + Assert.Equal(10, dataFrame.Rows.Count); Assert.Equal(2, dataFrame.Columns.Count); Assert.Equal(10, dataFrame.Columns[0].Length); Assert.Equal("IntColumn", dataFrame.Columns[0].Name); @@ -628,7 +628,7 @@ public void TestBinaryOperatorsOnBoolColumns() Assert.True(or.Columns[1].All()); DataFrame xor = df ^ true; - for (int i = 0; i < xor.RowCount; i++) + for (int i = 0; i < xor.Rows.Count; i++) { if (i % 2 == 0) Assert.False((bool)xor["Bool1"][i]); @@ -636,7 +636,7 @@ public void TestBinaryOperatorsOnBoolColumns() Assert.True((bool)xor["Bool1"][i]); } xor = true ^ df; - for (int i = 0; i < xor.RowCount; i++) + for (int i = 0; i < xor.Rows.Count; i++) { if (i % 2 == 0) Assert.False((bool)xor["Bool1"][i]); @@ -857,17 +857,17 @@ public void TestSplitAndSort() { DataFrame df = MakeDataFrameWithAllMutableColumnTypes(20); df["Int"][0] = 100000; - df["Int"][df.RowCount - 1] = -1; + df["Int"][df.Rows.Count - 1] = -1; df["Int"][5] = 200000; DataFrame dfTest; DataFrame dfTrain = SplitTrainTest(df, 0.8f, out dfTest); // Sort by "Int" in ascending order var sortedDf = dfTrain.Sort("Int"); - Assert.Null(sortedDf["Int"][sortedDf.RowCount - 1]); + Assert.Null(sortedDf["Int"][sortedDf.Rows.Count - 1]); Assert.Equal(1, sortedDf["Int"][0]); - Assert.Equal(100000, sortedDf["Int"][sortedDf.RowCount - 3]); - Assert.Equal(200000, sortedDf["Int"][sortedDf.RowCount - 2]); + Assert.Equal(100000, sortedDf["Int"][sortedDf.Rows.Count - 3]); + Assert.Equal(200000, sortedDf["Int"][sortedDf.Rows.Count - 2]); } [Fact] @@ -940,8 +940,8 @@ public void TestPrimitiveColumnSort(int numberOfNulls) private void VerifyJoin(DataFrame join, DataFrame left, DataFrame right, JoinAlgorithm joinAlgorithm) { - PrimitiveDataFrameColumn mapIndices = new PrimitiveDataFrameColumn("map", join.RowCount); - for (long i = 0; i < join.RowCount; i++) + PrimitiveDataFrameColumn mapIndices = new PrimitiveDataFrameColumn("map", join.Rows.Count); + for (long i = 0; i < join.Rows.Count; i++) { mapIndices[i] = i; } @@ -961,7 +961,7 @@ private void VerifyJoin(DataFrame join, DataFrame left, DataFrame right, JoinAlg { int columnIndex = i - left.Columns.Count; DataFrameColumn rightColumn = right.Columns[columnIndex]; - DataFrameColumn compareColumn = rightColumn.Length <= join.RowCount ? rightColumn.Clone(numberOfNullsToAppend: join.RowCount - rightColumn.Length) : rightColumn.Clone(mapIndices); + DataFrameColumn compareColumn = rightColumn.Length <= join.Rows.Count ? rightColumn.Clone(numberOfNullsToAppend: join.Rows.Count - rightColumn.Length) : rightColumn.Clone(mapIndices); isEqual = joinColumn.ElementwiseEquals(compareColumn); } } @@ -970,7 +970,7 @@ private void VerifyJoin(DataFrame join, DataFrame left, DataFrame right, JoinAlg if (i < left.Columns.Count) { DataFrameColumn leftColumn = left.Columns[i]; - DataFrameColumn compareColumn = leftColumn.Length <= join.RowCount ? leftColumn.Clone(numberOfNullsToAppend: join.RowCount - leftColumn.Length) : leftColumn.Clone(mapIndices); + DataFrameColumn compareColumn = leftColumn.Length <= join.Rows.Count ? leftColumn.Clone(numberOfNullsToAppend: join.Rows.Count - leftColumn.Length) : leftColumn.Clone(mapIndices); isEqual = joinColumn.ElementwiseEquals(compareColumn); } else @@ -999,16 +999,16 @@ private void VerifyJoin(DataFrame join, DataFrame left, DataFrame right, JoinAlg if (i < left.Columns.Count) { DataFrameColumn leftColumn = left.Columns[i]; - isEqual = joinColumn.ElementwiseEquals(leftColumn.Clone(numberOfNullsToAppend: join.RowCount - leftColumn.Length)); + isEqual = joinColumn.ElementwiseEquals(leftColumn.Clone(numberOfNullsToAppend: join.Rows.Count - leftColumn.Length)); } else { int columnIndex = i - left.Columns.Count; DataFrameColumn rightColumn = right.Columns[columnIndex]; - isEqual = joinColumn.ElementwiseEquals(rightColumn.Clone(numberOfNullsToAppend: join.RowCount - rightColumn.Length)); + isEqual = joinColumn.ElementwiseEquals(rightColumn.Clone(numberOfNullsToAppend: join.Rows.Count - rightColumn.Length)); } } - for (int j = 0; j < join.RowCount; j++) + for (int j = 0; j < join.Rows.Count; j++) { Assert.Equal(true, isEqual[j]); } @@ -1062,17 +1062,17 @@ public void TestJoin() DataFrame left = MakeDataFrameWithAllMutableColumnTypes(10); DataFrame right = MakeDataFrameWithAllMutableColumnTypes(5); - // Tests with right.RowCount < left.RowCount + // Tests with right.Rows.Count < left.Rows.Count // Left join DataFrame join = left.Join(right); - Assert.Equal(join.RowCount, left.RowCount); + Assert.Equal(join.Rows.Count, left.Rows.Count); Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Null(join["Int_right"][6]); VerifyJoin(join, left, right, JoinAlgorithm.Left); // Right join join = left.Join(right, joinAlgorithm: JoinAlgorithm.Right); - Assert.Equal(join.RowCount, right.RowCount); + Assert.Equal(join.Rows.Count, right.Rows.Count); Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Equal(join["Int_right"][3], right["Int"][3]); Assert.Null(join["Int_right"][2]); @@ -1080,31 +1080,31 @@ public void TestJoin() // Outer join join = left.Join(right, joinAlgorithm: JoinAlgorithm.FullOuter); - Assert.Equal(join.RowCount, left.RowCount); + Assert.Equal(join.Rows.Count, left.Rows.Count); Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Null(join["Int_right"][6]); VerifyJoin(join, left, right, JoinAlgorithm.FullOuter); // Inner join join = left.Join(right, joinAlgorithm: JoinAlgorithm.Inner); - Assert.Equal(join.RowCount, right.RowCount); + Assert.Equal(join.Rows.Count, right.Rows.Count); Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Equal(join["Int_right"][3], right["Int"][3]); Assert.Null(join["Int_right"][2]); VerifyJoin(join, left, right, JoinAlgorithm.Inner); - // Tests with right.RowCount > left.RowCount + // Tests with right.Rows.Count > left.Rows.Count // Left join right = MakeDataFrameWithAllMutableColumnTypes(15); join = left.Join(right); - Assert.Equal(join.RowCount, left.RowCount); + Assert.Equal(join.Rows.Count, left.Rows.Count); Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Equal(join["Int_right"][6], right["Int"][6]); VerifyJoin(join, left, right, JoinAlgorithm.Left); // Right join join = left.Join(right, joinAlgorithm: JoinAlgorithm.Right); - Assert.Equal(join.RowCount, right.RowCount); + Assert.Equal(join.Rows.Count, right.Rows.Count); Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Equal(join["Int_right"][2], right["Int"][2]); Assert.Null(join["Int_left"][12]); @@ -1112,14 +1112,14 @@ public void TestJoin() // Outer join join = left.Join(right, joinAlgorithm: JoinAlgorithm.FullOuter); - Assert.Equal(join.RowCount, right.RowCount); + Assert.Equal(join.Rows.Count, right.Rows.Count); Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Null(join["Int_left"][12]); VerifyJoin(join, left, right, JoinAlgorithm.FullOuter); // Inner join join = left.Join(right, joinAlgorithm: JoinAlgorithm.Inner); - Assert.Equal(join.RowCount, left.RowCount); + Assert.Equal(join.Rows.Count, left.Rows.Count); Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Equal(join["Int_right"][2], right["Int"][2]); VerifyJoin(join, left, right, JoinAlgorithm.Inner); @@ -1130,10 +1130,10 @@ public void TestGroupBy() { DataFrame df = MakeDataFrameWithNumericAndBoolColumns(10); DataFrame count = df.GroupBy("Bool").Count(); - Assert.Equal(2, count.RowCount); + Assert.Equal(2, count.Rows.Count); Assert.Equal((long)5, count["Int"][0]); Assert.Equal((long)4, count["Decimal"][1]); - for (int r = 0; r < count.RowCount; r++) + for (int r = 0; r < count.Rows.Count; r++) { for (int c = 1; c < count.Columns.Count; c++) { @@ -1142,7 +1142,7 @@ public void TestGroupBy() } DataFrame first = df.GroupBy("Bool").First(); - Assert.Equal(2, first.RowCount); + Assert.Equal(2, first.Rows.Count); for (int r = 0; r < 2; r++) { for (int c = 0; c < count.Columns.Count; c++) @@ -1172,10 +1172,10 @@ public void TestGroupBy() DataFrameColumn headColumn = head[originalColumn.Name]; Assert.Equal(originalColumn[5], headColumn[verify[5]]); } - Assert.Equal(6, head.RowCount); + Assert.Equal(6, head.Rows.Count); DataFrame tail = df.GroupBy("Bool").Tail(3); - Assert.Equal(6, tail.RowCount); + Assert.Equal(6, tail.Rows.Count); List originalColumnVerify = new List() { 6, 8, 7, 9 }; List tailColumnVerity = new List() { 1, 2, 4, 5 }; for (int r = 0; r < 4; r++) @@ -1189,7 +1189,7 @@ public void TestGroupBy() } DataFrame max = df.GroupBy("Bool").Max(); - Assert.Equal(2, max.RowCount); + Assert.Equal(2, max.Rows.Count); for (int r = 0; r < 2; r++) { for (int c = 0; c < count.Columns.Count; c++) @@ -1203,16 +1203,16 @@ public void TestGroupBy() } DataFrame min = df.GroupBy("Bool").Min(); - Assert.Equal(2, min.RowCount); + Assert.Equal(2, min.Rows.Count); DataFrame product = df.GroupBy("Bool").Product(); - Assert.Equal(2, product.RowCount); + Assert.Equal(2, product.Rows.Count); DataFrame sum = df.GroupBy("Bool").Sum(); - Assert.Equal(2, sum.RowCount); + Assert.Equal(2, sum.Rows.Count); DataFrame mean = df.GroupBy("Bool").Mean(); - Assert.Equal(2, mean.RowCount); + Assert.Equal(2, mean.Rows.Count); for (int r = 0; r < 2; r++) { for (int c = 0; c < count.Columns.Count; c++) @@ -1250,13 +1250,13 @@ public void TestGroupBy() DataFrame countIntColumn = df.GroupBy("Bool").Count("Int"); Assert.Equal(2, countIntColumn.Columns.Count); - Assert.Equal(2, countIntColumn.RowCount); + Assert.Equal(2, countIntColumn.Rows.Count); Assert.Equal((long)5, countIntColumn["Int"][0]); Assert.Equal((long)4, countIntColumn["Int"][1]); DataFrame firstDecimalColumn = df.GroupBy("Bool").First("Decimal"); Assert.Equal(2, firstDecimalColumn.Columns.Count); - Assert.Equal(2, firstDecimalColumn.RowCount); + Assert.Equal(2, firstDecimalColumn.Rows.Count); Assert.Equal((decimal)0, firstDecimalColumn["Decimal"][0]); Assert.Equal((decimal)1, firstDecimalColumn["Decimal"][1]); } @@ -1267,7 +1267,7 @@ public void TestGoupByDifferentColumnTypes() void GroupCountAndAssert(DataFrame frame) { DataFrame grouped = frame.GroupBy("Column1").Count(); - Assert.Equal(2, grouped.RowCount); + Assert.Equal(2, grouped.Rows.Count); } DataFrame df = MakeDataFrame(10, false); @@ -1503,7 +1503,7 @@ public void TestDataFrameFilter() DataFrame df = MakeDataFrameWithAllMutableColumnTypes(10); DataFrame boolColumnFiltered = df[df["Bool"].ElementwiseEquals(true)]; List verify = new List { 0, 2, 4, 6, 8 }; - Assert.Equal(5, boolColumnFiltered.RowCount); + Assert.Equal(5, boolColumnFiltered.Rows.Count); for (int i = 0; i < boolColumnFiltered.Columns.Count; i++) { DataFrameColumn column = boolColumnFiltered.Columns[i]; @@ -1582,7 +1582,7 @@ public void TestSample() { DataFrame df = MakeDataFrameWithAllColumnTypes(10); DataFrame sampled = df.Sample(3); - Assert.Equal(3, sampled.RowCount); + Assert.Equal(3, sampled.Rows.Count); Assert.Equal(df.Columns.Count, sampled.Columns.Count); } @@ -1592,10 +1592,10 @@ public void TestMerge() DataFrame left = MakeDataFrameWithAllMutableColumnTypes(10); DataFrame right = MakeDataFrameWithAllMutableColumnTypes(5); - // Tests with right.RowCount < left.RowCount + // Tests with right.Rows.Count < left.Rows.Count // Left merge DataFrame merge = left.Merge(right, "Int", "Int"); - Assert.Equal(10, merge.RowCount); + Assert.Equal(10, merge.Rows.Count); Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Null(merge["Int_right"][6]); Assert.Null(merge["Int_left"][5]); @@ -1603,7 +1603,7 @@ public void TestMerge() // Right merge merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Right); - Assert.Equal(5, merge.RowCount); + Assert.Equal(5, merge.Rows.Count); Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Equal(merge["Int_right"][3], right["Int"][3]); Assert.Null(merge["Int_right"][2]); @@ -1611,31 +1611,31 @@ public void TestMerge() // Outer merge merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.FullOuter); - Assert.Equal(merge.RowCount, left.RowCount); + Assert.Equal(merge.Rows.Count, left.Rows.Count); Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Null(merge["Int_right"][6]); VerifyMerge(merge, left, right, JoinAlgorithm.FullOuter); // Inner merge merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); - Assert.Equal(merge.RowCount, right.RowCount); + Assert.Equal(merge.Rows.Count, right.Rows.Count); Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Equal(merge["Int_right"][2], right["Int"][3]); Assert.Null(merge["Int_right"][4]); VerifyMerge(merge, left, right, JoinAlgorithm.Inner); - // Tests with right.RowCount > left.RowCount + // Tests with right.Rows.Count > left.Rows.Count // Left merge right = MakeDataFrameWithAllMutableColumnTypes(15); merge = left.Merge(right, "Int", "Int"); - Assert.Equal(merge.RowCount, left.RowCount); + Assert.Equal(merge.Rows.Count, left.Rows.Count); Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Equal(merge["Int_right"][6], right["Int"][6]); VerifyMerge(merge, left, right, JoinAlgorithm.Left); // Right merge merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Right); - Assert.Equal(merge.RowCount, right.RowCount); + Assert.Equal(merge.Rows.Count, right.Rows.Count); Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Equal(merge["Int_right"][2], right["Int"][2]); Assert.Null(merge["Int_left"][12]); @@ -1643,7 +1643,7 @@ public void TestMerge() // Outer merge merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.FullOuter); - Assert.Equal(16, merge.RowCount); + Assert.Equal(16, merge.Rows.Count); Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Null(merge["Int_left"][12]); Assert.Null(merge["Int_left"][5]); @@ -1651,7 +1651,7 @@ public void TestMerge() // Inner merge merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); - Assert.Equal(9, merge.RowCount); + Assert.Equal(9, merge.Rows.Count); Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); Assert.Equal(merge["Int_right"][2], right["Int"][2]); VerifyMerge(merge, left, right, JoinAlgorithm.Inner); @@ -1730,10 +1730,10 @@ public void TestDropNulls() { DataFrame df = MakeDataFrameWithAllMutableColumnTypes(20); DataFrame anyNulls = df.DropNulls(); - Assert.Equal(19, anyNulls.RowCount); + Assert.Equal(19, anyNulls.Rows.Count); DataFrame allNulls = df.DropNulls(DropNullOptions.All); - Assert.Equal(19, allNulls.RowCount); + Assert.Equal(19, allNulls.Rows.Count); } [Fact] @@ -1775,7 +1775,7 @@ public void TestValueCounts() { DataFrame df = MakeDataFrameWithAllColumnTypes(10, withNulls: false); DataFrame valueCounts = df["Bool"].ValueCounts(); - Assert.Equal(2, valueCounts.RowCount); + Assert.Equal(2, valueCounts.Rows.Count); Assert.Equal((long)5, valueCounts["Counts"][0]); Assert.Equal((long)5, valueCounts["Counts"][1]); } @@ -1828,13 +1828,13 @@ public void TestClone(int dfLength, int intDfLength) DataFrame intDf = MakeDataFrameWithTwoColumns(intDfLength, false); PrimitiveDataFrameColumn intColumn = intDf["Int1"] as PrimitiveDataFrameColumn; DataFrame clone = df[intColumn]; - Assert.Equal(intDfLength, clone.RowCount); + Assert.Equal(intDfLength, clone.Rows.Count); Assert.Equal(df.Columns.Count, clone.Columns.Count); for (int i = 0; i < df.Columns.Count; i++) { DataFrameColumn dfColumn = df.Columns[i]; DataFrameColumn cloneColumn = clone.Columns[i]; - for (long r = 0; r < clone.RowCount; r++) + for (long r = 0; r < clone.Rows.Count; r++) { Assert.Equal(dfColumn[r], cloneColumn[r]); } @@ -1869,46 +1869,100 @@ public void TestColumns() } + [Fact] + public void TestRows() + { + DataFrame df = MakeDataFrameWithAllColumnTypes(10); + DataFrameRowCollection rows = df.Rows; + Assert.Equal(10, rows.Count); + DataFrameRow firstRow = rows[0]; + object firstValue = firstRow[0]; + Assert.Equal(df[0, 0], firstValue); + long rowCount = 0; + foreach (DataFrameRow row in rows) + { + int columnIndex = 0; + foreach (var value in row) + { + Assert.Equal(df.Columns[columnIndex][rowCount], value); + columnIndex++; + } + rowCount++; + } + Assert.Equal(df.Rows.Count, rowCount); + + DataFrameRow nullRow = rows[5]; + int intColumnIndex = df.Columns.IndexOf("Int"); + Assert.Equal(1, df.Columns[intColumnIndex].NullCount); + nullRow[intColumnIndex] = 5; + Assert.Equal(0, df.Columns[intColumnIndex].NullCount); + nullRow[intColumnIndex] = null; + Assert.Equal(1, df.Columns[intColumnIndex].NullCount); + } + + [Fact] + public void TestMutationOnRows() + { + DataFrame df = MakeDataFrameWithNumericColumns(10); + DataFrameRowCollection rows = df.Rows; + + foreach (DataFrameRow row in rows) + { + for (int i = 0; i < df.Columns.Count; i++) + { + DataFrameColumn column = df.Columns[i]; + row[i] = Convert.ChangeType(12, column.DataType); + } + } + + foreach (var column in df.Columns) + { + foreach (var value in column) + { + Assert.Equal("12", value.ToString()); + } + } + } [Fact] public void TestAppendRow() { DataFrame df = MakeDataFrame(10); df.Append(new List { 5, true }); - Assert.Equal(11, df.RowCount); + Assert.Equal(11, df.Rows.Count); Assert.Equal(1, df.Columns[0].NullCount); Assert.Equal(1, df.Columns[1].NullCount); df.Append(new List { 100 }); - Assert.Equal(12, df.RowCount); + Assert.Equal(12, df.Rows.Count); Assert.Equal(1, df.Columns[0].NullCount); Assert.Equal(2, df.Columns[1].NullCount); df.Append(new List { null, null }); - Assert.Equal(13, df.RowCount); + Assert.Equal(13, df.Rows.Count); Assert.Equal(2, df.Columns[0].NullCount); Assert.Equal(3, df.Columns[1].NullCount); df.Append(new List> { KeyValuePair.Create("Column1", (object)5), KeyValuePair.Create("Column2", (object)false) }); - Assert.Equal(14, df.RowCount); + Assert.Equal(14, df.Rows.Count); Assert.Equal(2, df.Columns[0].NullCount); Assert.Equal(3, df.Columns[1].NullCount); df.Append(new List> { KeyValuePair.Create("Column1", (object)5) }); - Assert.Equal(15, df.RowCount); + Assert.Equal(15, df.Rows.Count); Assert.Equal(15, df["Column1"].Length); Assert.Equal(15, df["Column2"].Length); Assert.Equal(2, df.Columns[0].NullCount); Assert.Equal(4, df.Columns[1].NullCount); df.Append(new List> { KeyValuePair.Create("Column2", (object)false) }); - Assert.Equal(16, df.RowCount); + Assert.Equal(16, df.Rows.Count); Assert.Equal(16, df["Column1"].Length); Assert.Equal(16, df["Column2"].Length); Assert.Equal(3, df.Columns[0].NullCount); Assert.Equal(4, df.Columns[1].NullCount); df.Append((IEnumerable)null); - Assert.Equal(17, df.RowCount); + Assert.Equal(17, df.Rows.Count); Assert.Equal(17, df["Column1"].Length); Assert.Equal(17, df["Column2"].Length); Assert.Equal(4, df.Columns[0].NullCount); @@ -1920,7 +1974,7 @@ public void TestAppendRow() Assert.Throws(() => df.Append(new List { 5, true, true })); df.Append(); - Assert.Equal(18, df.RowCount); + Assert.Equal(18, df.Rows.Count); Assert.Equal(18, df["Column1"].Length); Assert.Equal(18, df["Column2"].Length); Assert.Equal(5, df.Columns[0].NullCount); @@ -1932,7 +1986,7 @@ public void TestAppendEmptyValue() { DataFrame df = MakeDataFrame(10); df.Append(new List { "", true }); - Assert.Equal(11, df.RowCount); + Assert.Equal(11, df.Rows.Count); Assert.Equal(2, df.Columns[0].NullCount); Assert.Equal(1, df.Columns[1].NullCount); @@ -1940,13 +1994,13 @@ public void TestAppendEmptyValue() df.Columns.Add(column); df.Append(new List { 1, true, "" }); - Assert.Equal(12, df.RowCount); + Assert.Equal(12, df.Rows.Count); Assert.Equal(2, df.Columns[0].NullCount); Assert.Equal(1, df.Columns[1].NullCount); Assert.Equal(0, df.Columns[2].NullCount); df.Append(new List { 1, true, null }); - Assert.Equal(13, df.RowCount); + Assert.Equal(13, df.Rows.Count); Assert.Equal(1, df.Columns[2].NullCount); } }