-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improvements to the sort routine #5776
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -335,7 +335,7 @@ public DataFrame Sample(int numberOfRows) | |
|
||
int shuffleLowerLimit = 0; | ||
int shuffleUpperLimit = (int)Math.Min(Int32.MaxValue, Rows.Count); | ||
|
||
int[] shuffleArray = Enumerable.Range(0, shuffleUpperLimit).ToArray(); | ||
Random rand = new Random(); | ||
while (shuffleLowerLimit < numberOfRows) | ||
|
@@ -349,7 +349,7 @@ public DataFrame Sample(int numberOfRows) | |
ArraySegment<int> segment = new ArraySegment<int>(shuffleArray, 0, shuffleLowerLimit); | ||
|
||
PrimitiveDataFrameColumn<int> indices = new PrimitiveDataFrameColumn<int>("indices", segment); | ||
|
||
return Clone(indices); | ||
} | ||
|
||
|
@@ -623,12 +623,16 @@ private void OnColumnsChanged() | |
private DataFrame Sort(string columnName, bool isAscending) | ||
{ | ||
DataFrameColumn column = Columns[columnName]; | ||
DataFrameColumn sortIndices = column.GetAscendingSortIndices(); | ||
PrimitiveDataFrameColumn<long> sortIndices = column.GetAscendingSortIndices(out Int64DataFrameColumn nullIndices); | ||
for (long i = 0; i < nullIndices.Length; i++) | ||
{ | ||
sortIndices.Append(nullIndices[i]); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, no. Does seem like a good API to add though. |
||
} | ||
List<DataFrameColumn> newColumns = new List<DataFrameColumn>(Columns.Count); | ||
for (int i = 0; i < Columns.Count; i++) | ||
{ | ||
DataFrameColumn oldColumn = Columns[i]; | ||
DataFrameColumn newColumn = oldColumn.Clone(sortIndices, !isAscending, oldColumn.NullCount); | ||
DataFrameColumn newColumn = oldColumn.Clone(sortIndices, !isAscending); | ||
Debug.Assert(newColumn.NullCount == oldColumn.NullCount); | ||
newColumns.Add(newColumn); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -331,6 +331,15 @@ public virtual StringDataFrameColumn Info() | |
/// </summary> | ||
public virtual DataFrameColumn Description() => throw new NotImplementedException(); | ||
|
||
/// <summary> | ||
/// Returns the indices of non-null values that, when applied, result in this column being sorted in ascending order. Also returns the indices of null values in <paramref name="nullIndices"/>. | ||
/// </summary> | ||
/// <param name="nullIndices">Indices of values that are <see langword="null"/>.</param> | ||
internal virtual PrimitiveDataFrameColumn<long> GetAscendingSortIndices(out Int64DataFrameColumn nullIndices) => throw new NotImplementedException(); | ||
|
||
/// <summary> | ||
/// Returns the indices of non-null values that, when applied, result in this column being sorted in ascending order | ||
/// </summary> | ||
internal virtual PrimitiveDataFrameColumn<long> GetAscendingSortIndices() => throw new NotImplementedException(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do we do with the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup, they are just dropped. At the moment it's used for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It's There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Huh, my bad. For some reason I thought it was |
||
|
||
internal delegate long GetBufferSortIndex(int bufferIndex, int sortIndex); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This would cause
null
rows to be at the top for a descending sort. I have to think about if this is acceptable or if we want to change this behavior. Just calling it out hereThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For the moment, I think we can leave the null rows at the top, until we get feedback about it (if any). To move the null rows to the bottom, we'd need to thread the
nullIndices
column into theClone
API implementations, and I don't think it's worth it currently.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like Pandas puts them in the 'last' position by default, but you can change it with an option.