Skip to content

Commit

Permalink
Creating a Rows property, similar to Columns (dotnet#2794)
Browse files Browse the repository at this point in the history
* Rows collection, similar to Columns

* Doc

* Some minor clean up

* Make DataFrameRow a view into the DataFrame

* sq

* Address feedback

* Remove DataFrame.RowCount

* More row count changes

* sq

* Address feedback

* Merge upstream
  • Loading branch information
Prashanth Govindarajan committed Dec 5, 2019
1 parent c6eb2f7 commit 7cee9d9
Show file tree
Hide file tree
Showing 11 changed files with 291 additions and 125 deletions.
4 changes: 2 additions & 2 deletions src/Microsoft.Data.Analysis/DataFrame.Arrow.cs
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ public IEnumerable<RecordBatch> ToArrowRecordBatches()
List<Apache.Arrow.Array> arrays = new List<Apache.Arrow.Array>();

int recordBatchLength = Int32.MaxValue;
int numberOfRowsInThisRecordBatch = (int)Math.Min(recordBatchLength, RowCount);
int numberOfRowsInThisRecordBatch = (int)Math.Min(recordBatchLength, Rows.Count);
long numberOfRowsProcessed = 0;

// Sometimes .NET for Spark passes in DataFrames with no rows. In those cases, we just return a RecordBatch with the right Schema and no rows
Expand All @@ -166,7 +166,7 @@ public IEnumerable<RecordBatch> ToArrowRecordBatches()
}
numberOfRowsProcessed += numberOfRowsInThisRecordBatch;
yield return new RecordBatch(schema, arrays, numberOfRowsInThisRecordBatch);
} while (numberOfRowsProcessed < RowCount);
} while (numberOfRowsProcessed < Rows.Count);
}

}
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.Data.Analysis/DataFrame.IDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ private DataViewSchema DataViewSchema

DataViewSchema IDataView.Schema => DataViewSchema;

long? IDataView.GetRowCount() => RowCount;
long? IDataView.GetRowCount() => Rows.Count;

private DataViewRowCursor GetRowCursorCore(IEnumerable<DataViewSchema.Column> columnsNeeded)
{
Expand Down Expand Up @@ -133,7 +133,7 @@ public override bool MoveNext()
if (_disposed)
return false;
_position++;
return _position < _dataFrame.RowCount;
return _position < _dataFrame.Rows.Count;
}
}
}
Expand Down
24 changes: 12 additions & 12 deletions src/Microsoft.Data.Analysis/DataFrame.Join.cs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right
DataFrameColumn newColumn = Columns[i].Clone();
ret.Columns.Insert(ret.Columns.Count, newColumn);
}
long minLength = Math.Min(RowCount, other.RowCount);
long minLength = Math.Min(Rows.Count, other.Rows.Count);
PrimitiveDataFrameColumn<long> mapIndices = new PrimitiveDataFrameColumn<long>("mapIndices", minLength);
for (long i = 0; i < minLength; i++)
{
Expand All @@ -53,9 +53,9 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right
for (int i = 0; i < other.Columns.Count; i++)
{
DataFrameColumn newColumn;
if (other.RowCount < RowCount)
if (other.Rows.Count < Rows.Count)
{
newColumn = other.Columns[i].Clone(numberOfNullsToAppend: RowCount - other.RowCount);
newColumn = other.Columns[i].Clone(numberOfNullsToAppend: Rows.Count - other.Rows.Count);
}
else
{
Expand All @@ -67,7 +67,7 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right
}
else if (joinAlgorithm == JoinAlgorithm.Right)
{
long minLength = Math.Min(RowCount, other.RowCount);
long minLength = Math.Min(Rows.Count, other.Rows.Count);
PrimitiveDataFrameColumn<long> mapIndices = new PrimitiveDataFrameColumn<long>("mapIndices", minLength);
for (long i = 0; i < minLength; i++)
{
Expand All @@ -76,9 +76,9 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right
for (int i = 0; i < Columns.Count; i++)
{
DataFrameColumn newColumn;
if (RowCount < other.RowCount)
if (Rows.Count < other.Rows.Count)
{
newColumn = Columns[i].Clone(numberOfNullsToAppend: other.RowCount - RowCount);
newColumn = Columns[i].Clone(numberOfNullsToAppend: other.Rows.Count - Rows.Count);
}
else
{
Expand All @@ -95,14 +95,14 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right
}
else if (joinAlgorithm == JoinAlgorithm.FullOuter)
{
long newRowCount = Math.Max(RowCount, other.RowCount);
long numberOfNulls = newRowCount - RowCount;
long newRowCount = Math.Max(Rows.Count, other.Rows.Count);
long numberOfNulls = newRowCount - Rows.Count;
for (int i = 0; i < Columns.Count; i++)
{
DataFrameColumn newColumn = Columns[i].Clone(numberOfNullsToAppend: numberOfNulls);
ret.Columns.Insert(ret.Columns.Count, newColumn);
}
numberOfNulls = newRowCount - other.RowCount;
numberOfNulls = newRowCount - other.Rows.Count;
for (int i = 0; i < other.Columns.Count; i++)
{
DataFrameColumn newColumn = other.Columns[i].Clone(numberOfNullsToAppend: numberOfNulls);
Expand All @@ -112,7 +112,7 @@ public DataFrame Join(DataFrame other, string leftSuffix = "_left", string right
}
else if (joinAlgorithm == JoinAlgorithm.Inner)
{
long newRowCount = Math.Min(RowCount, other.RowCount);
long newRowCount = Math.Min(Rows.Count, other.Rows.Count);
PrimitiveDataFrameColumn<long> mapIndices = new PrimitiveDataFrameColumn<long>("mapIndices", newRowCount);
for (long i = 0; i < newRowCount; i++)
{
Expand Down Expand Up @@ -242,8 +242,8 @@ public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string righ
else if (joinAlgorithm == JoinAlgorithm.Inner)
{
// Hash the column with the smaller RowCount
long leftRowCount = RowCount;
long rightRowCount = other.RowCount;
long leftRowCount = Rows.Count;
long rightRowCount = other.Rows.Count;
DataFrame longerDataFrame = leftRowCount <= rightRowCount ? other : this;
DataFrame shorterDataFrame = ReferenceEquals(longerDataFrame, this) ? other : this;
DataFrameColumn hashColumn = (leftRowCount <= rightRowCount) ? this[leftJoinColumn] : other[rightJoinColumn];
Expand Down
36 changes: 16 additions & 20 deletions src/Microsoft.Data.Analysis/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,26 @@ public enum DropNullOptions
public partial class DataFrame
{
private readonly DataFrameColumnCollection _columnCollection;
private readonly DataFrameRowCollection _rowCollection;
public DataFrame()
{
_columnCollection = new DataFrameColumnCollection(OnColumnsChanged);
_rowCollection = new DataFrameRowCollection(this);
}

public DataFrame(IList<DataFrameColumn> columns)
{
_columnCollection = new DataFrameColumnCollection(columns, OnColumnsChanged);
_rowCollection = new DataFrameRowCollection(this);
}

public long RowCount => _columnCollection.RowCount;

public DataFrameColumnCollection Columns => _columnCollection;

/// <summary>
/// Returns a <see cref="DataFrameRowCollection"/> that contains a view of the rows in this <see cref="DataFrame"/>
/// </summary>
public DataFrameRowCollection Rows => _rowCollection;

internal IReadOnlyList<string> GetColumnNames() => _columnCollection.GetColumnNames();

#region Operators
Expand All @@ -55,15 +61,6 @@ public DataFrame(IList<DataFrameColumn> columns)
set => _columnCollection[columnIndex][rowIndex] = value;
}

public IList<object> this[long rowIndex]
{
get
{
return _columnCollection.GetRow(rowIndex);
}
//TODO?: set?
}

/// <summary>
/// Returns a new DataFrame using the boolean values in <paramref name="filter"/>
/// </summary>
Expand Down Expand Up @@ -177,9 +174,9 @@ public DataFrame Head(int numberOfRows)
public DataFrame Tail(int numberOfRows)
{
PrimitiveDataFrameColumn<long> filter = new PrimitiveDataFrameColumn<long>("Filter", numberOfRows);
for (long i = RowCount - numberOfRows; i < RowCount; i++)
for (long i = Rows.Count - numberOfRows; i < Rows.Count; i++)
{
filter[i - (RowCount - numberOfRows)] = i;
filter[i - (Rows.Count - numberOfRows)] = i;
}
return Clone(filter);
}
Expand Down Expand Up @@ -328,7 +325,7 @@ public DataFrame Sample(int numberOfRows)
{
Random rand = new Random();
PrimitiveDataFrameColumn<long> indices = new PrimitiveDataFrameColumn<long>("Indices", numberOfRows);
int randMaxValue = (int)Math.Min(Int32.MaxValue, RowCount);
int randMaxValue = (int)Math.Min(Int32.MaxValue, Rows.Count);
for (long i = 0; i < numberOfRows; i++)
{
indices[i] = rand.Next(randMaxValue);
Expand Down Expand Up @@ -369,7 +366,7 @@ public DataFrame DropNulls(DropNullOptions options = DropNullOptions.Any)
PrimitiveDataFrameColumn<bool> filter = new PrimitiveDataFrameColumn<bool>("Filter");
if (options == DropNullOptions.Any)
{
filter.AppendMany(true, RowCount);
filter.AppendMany(true, Rows.Count);

for (int i = 0; i < Columns.Count; i++)
{
Expand All @@ -382,7 +379,7 @@ public DataFrame DropNulls(DropNullOptions options = DropNullOptions.Any)
}
else
{
filter.AppendMany(false, RowCount);
filter.AppendMany(false, Rows.Count);
for (int i = 0; i < Columns.Count; i++)
{
DataFrameColumn column = Columns[i];
Expand Down Expand Up @@ -540,7 +537,7 @@ public void Append(IEnumerable<KeyValuePair<string, object>> row)

foreach (DataFrameColumn column in Columns)
{
if (column.Length == RowCount)
if (column.Length == Rows.Count)
{
ResizeByOneAndAppend(column, null);
}
Expand Down Expand Up @@ -570,11 +567,10 @@ public override string ToString()
sb.Append(string.Format(Columns[i].Name.PadRight(longestColumnName)));
}
sb.AppendLine();
long numberOfRows = Math.Min(RowCount, 25);
long numberOfRows = Math.Min(Rows.Count, 25);
for (int i = 0; i < numberOfRows; i++)
{
IList<object> row = this[i];
foreach (object obj in row)
foreach (object obj in Rows[i])
{
sb.Append((obj ?? "null").ToString().PadRight(longestColumnName));
}
Expand Down
10 changes: 0 additions & 10 deletions src/Microsoft.Data.Analysis/DataFrameColumnCollection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,6 @@ internal IReadOnlyList<string> GetColumnNames()
return ret;
}

internal IList<object> GetRow(long rowIndex)
{
var ret = new List<object>();
for (int i = 0; i < Count; i++)
{
ret.Add(this[i][rowIndex]);
}
return ret;
}

public void SetColumnName(DataFrameColumn column, string newName)
{
string currentName = column.Name;
Expand Down
70 changes: 70 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrameRow.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;

namespace Microsoft.Data.Analysis
{
/// <summary>
/// A view over a single row of a <see cref="DataFrame"/>. Values are not copied: every read and
/// write goes straight through to the columns of the owning <see cref="DataFrame"/>.
/// </summary>
public class DataFrameRow : IEnumerable<object>
{
    private readonly DataFrame _dataFrame;
    private readonly long _rowIndex;

    internal DataFrameRow(DataFrame df, long rowIndex)
    {
        // Debug-only sanity check; release builds defer any range failure to the column indexer.
        Debug.Assert(df.Columns.RowCount > rowIndex);
        _rowIndex = rowIndex;
        _dataFrame = df;
    }

    /// <summary>
    /// Enumerates the values of this row, one per column, in column order.
    /// </summary>
    public IEnumerator<object> GetEnumerator()
    {
        foreach (DataFrameColumn currentColumn in _dataFrame.Columns)
        {
            object cell = currentColumn[_rowIndex];
            yield return cell;
        }
    }

    /// <summary>
    /// Gets or sets the value stored in the column at position <paramref name="index"/> for this row.
    /// </summary>
    /// <param name="index">The zero-based position of the column within the <see cref="DataFrame"/></param>
    /// <returns>The value held by that column at this row.</returns>
    public object this[int index]
    {
        get => _dataFrame.Columns[index][_rowIndex];
        set => _dataFrame.Columns[index][_rowIndex] = value;
    }

    /// <summary>
    /// Builds a space-separated listing of the values in this row; null values are rendered as "null".
    /// </summary>
    public override string ToString()
    {
        StringBuilder builder = new StringBuilder();
        foreach (object cell in this)
        {
            string text = cell?.ToString() ?? "null";
            builder.Append(text);
            // Every value, including the last one, is followed by a single space.
            builder.Append(" ");
        }
        return builder.ToString();
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
}
56 changes: 56 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrameRowCollection.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections;
using System.Collections.Generic;

namespace Microsoft.Data.Analysis
{
/// <summary>
/// Exposes the rows of a <see cref="DataFrame"/> as an enumerable, indexable collection of
/// <see cref="DataFrameRow"/> views.
/// </summary>
public class DataFrameRowCollection : IEnumerable<DataFrameRow>
{
    private readonly DataFrame _dataFrame;

    /// <summary>
    /// Creates a row collection bound to <paramref name="dataFrame"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="dataFrame"/> is null.</exception>
    internal DataFrameRowCollection(DataFrame dataFrame)
    {
        if (dataFrame is null)
        {
            throw new ArgumentNullException(nameof(dataFrame));
        }

        _dataFrame = dataFrame;
    }

    /// <summary>
    /// Gets a new <see cref="DataFrameRow"/> view for the row at <paramref name="index"/>
    /// </summary>
    /// <param name="index">The zero-based row index</param>
    public DataFrameRow this[long index] => new DataFrameRow(_dataFrame, index);

    /// <summary>
    /// Enumerates a <see cref="DataFrameRow"/> view for each row, in index order.
    /// </summary>
    public IEnumerator<DataFrameRow> GetEnumerator()
    {
        long position = 0;
        // Count is re-read on every pass, so the loop bound tracks the current row count.
        while (position < Count)
        {
            yield return new DataFrameRow(_dataFrame, position);
            position++;
        }
    }

    /// <summary>
    /// The number of rows in the underlying <see cref="DataFrame"/>, as reported by its column collection.
    /// </summary>
    public long Count
    {
        get { return _dataFrame.Columns.RowCount; }
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public void TestEmptyArrowColumns()
RecordBatch batch1 = new RecordBatch.Builder()
.Append("EmptyDataAndNullColumns", false, col => col.Int32(array => array.Clear())).Build();
DataFrame emptyDataFrame = DataFrame.FromArrowRecordBatch(batch1);
Assert.Equal(0, emptyDataFrame.RowCount);
Assert.Equal(0, emptyDataFrame.Rows.Count);
Assert.Equal(0, emptyDataFrame["EmptyDataAndNullColumns"].Length);
Assert.Equal(0, emptyDataFrame["EmptyDataAndNullColumns"].NullCount);
}
Expand Down
Loading

0 comments on commit 7cee9d9

Please sign in to comment.