Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DataFrame] Implement rank #1991

Merged
merged 8 commits into from
May 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 46 additions & 3 deletions python/ray/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3137,9 +3137,52 @@ def radd(self, other, axis='columns', level=None, fill_value=None):

def rank(self, axis=0, method='average', numeric_only=None,
         na_option='keep', ascending=True, pct=False):
    """Compute numerical data ranks (1 through n) along axis.

    Equal values are assigned a rank that is the [method] of
    the ranks of those values.

    Args:
        axis (int): 0 or 'index' for row-wise,
            1 or 'columns' for column-wise
        method: {'average', 'min', 'max', 'first', 'dense'}
            Specifies which method to use for equal vals
        numeric_only (boolean)
            Include only float, int, boolean data.
        na_option: {'keep', 'top', 'bottom'}
            Specifies how to handle NA options
        ascending (boolean):
            Decides ranking order
        pct (boolean):
            Computes percentage ranking of data

    Returns:
        A new DataFrame
    """
    # Normalise 'index'/'columns' aliases to 0/1 up front; pandas raises
    # for anything that is not a valid DataFrame axis.
    axis = pd.DataFrame()._get_axis_number(axis)

    def rank_helper(df):
        # Applied to each partition: defer to pandas with the caller's
        # arguments (axis is already the normalised integer).
        return df.rank(axis=axis, method=method,
                       numeric_only=numeric_only,
                       na_option=na_option,
                       ascending=ascending, pct=pct)

    if axis == 1:
        # Ranking across each row: work on row partitions and label the
        # result with the numeric-dtype columns only.
        # NOTE(review): presumably this mirrors pandas dropping
        # non-numeric columns for row-wise ranking -- confirm.
        new_cols = self.dtypes[self.dtypes.apply(is_numeric_dtype)].index
        result = _map_partitions(rank_helper, self._row_partitions)
        return DataFrame(row_partitions=result,
                         columns=new_cols,
                         index=self.index)

    # axis == 0: ranking down each column, so every column partition is
    # ranked independently and the labels are unchanged.
    result = _map_partitions(rank_helper, self._col_partitions)
    return DataFrame(col_partitions=result,
                     columns=self.columns,
                     index=self.index)

def rdiv(self, other, axis='columns', level=None, fill_value=None):
return self._single_df_op_helper(
Expand Down
15 changes: 10 additions & 5 deletions python/ray/dataframe/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ def test_int_dataframe():
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_diff(ray_df, pandas_df)
test_rank(ray_df, pandas_df)

test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
Expand Down Expand Up @@ -392,6 +393,7 @@ def test_float_dataframe():
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_diff(ray_df, pandas_df)
test_rank(ray_df, pandas_df)

test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
Expand Down Expand Up @@ -560,6 +562,9 @@ def test_mixed_dtype_dataframe():
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)

# TODO Resolve once Pandas-20962 is resolved.
# test_rank(ray_df, pandas_df)

test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
test___getitem__(ray_df, pandas_df)
Expand Down Expand Up @@ -718,6 +723,7 @@ def test_nan_dataframe():
test_quantile(ray_df, pandas_df, .75)
test_describe(ray_df, pandas_df)
test_diff(ray_df, pandas_df)
test_rank(ray_df, pandas_df)

test_all(ray_df, pandas_df)
test_any(ray_df, pandas_df)
Expand Down Expand Up @@ -2377,11 +2383,10 @@ def test_radd():
test_inter_df_math_right_ops("radd")


def test_rank():
ray_df = create_test_dataframe()

with pytest.raises(NotImplementedError):
ray_df.rank()
@pytest.fixture
def test_rank(ray_df, pandas_df):
assert(ray_df_equals_pandas(ray_df.rank(), pandas_df.rank()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should add tests for axis=1

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

assert(ray_df_equals_pandas(ray_df.rank(axis=1), pandas_df.rank(axis=1)))


def test_rdiv():
Expand Down