Skip to content

[DataFrame] Fixes dropna subset bug #2018

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 10, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions python/ray/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,17 +806,22 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
if how is None and thresh is None:
raise TypeError('must specify how or thresh')

indices = None
if subset is not None:
subset = set(subset)

if axis == 1:
subset = [item for item in self.index if item in subset]
indices = self.index.get_indexer_for(subset)
check = indices == -1
if check.any():
raise KeyError(list(np.compress(check, subset)))
else:
subset = [item for item in self.columns if item in subset]
indices = self.columns.get_indexer_for(subset)
check = indices == -1
if check.any():
raise KeyError(list(np.compress(check, subset)))

def dropna_helper(df):
new_df = df.dropna(axis=axis, how=how, thresh=thresh,
subset=subset, inplace=False)
subset=indices, inplace=False)

if axis == 1:
new_index = new_df.columns
Expand Down
39 changes: 39 additions & 0 deletions python/ray/dataframe/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,10 +842,15 @@ def test_dense_nan_df():
[np.nan, np.nan, np.nan, 5]],
columns=list('ABCD'))

column_subsets = [list('AD'), list('BC'), list('CD')]
row_subsets = [[0, 1], [0, 1, 2], [2, 0]]

test_dropna(ray_df, pd_df)
test_dropna_inplace(ray_df, pd_df)
test_dropna_multiple_axes(ray_df, pd_df)
test_dropna_multiple_axes_inplace(ray_df, pd_df)
test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets)
test_dropna_subset_error(ray_df)


@pytest.fixture
Expand Down Expand Up @@ -1402,6 +1407,40 @@ def test_dropna_multiple_axes_inplace(ray_df, pd_df):
assert ray_df_equals_pandas(ray_df_copy, pd_df_copy)


@pytest.fixture
def test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets):
    """Check that ``dropna`` with a ``subset`` agrees with pandas.

    Declared as a fixture, but other tests invoke it directly as a helper
    (for example ``test_dense_nan_df``).

    Args:
        ray_df: The DataFrame under test.
        pd_df: The pandas DataFrame used as the reference.
        column_subsets: Iterable of column-label lists, each passed as
            ``subset`` with the default axis.
        row_subsets: Iterable of row-label lists, each passed as ``subset``
            with ``axis=1``.
    """
    modes = ('all', 'any')

    # Default axis: ``subset`` names the columns to inspect.
    for labels in column_subsets:
        for mode in modes:
            ray_result = ray_df.dropna(how=mode, subset=labels)
            pd_result = pd_df.dropna(how=mode, subset=labels)
            assert ray_df_equals_pandas(ray_result, pd_result)

    # axis=1: ``subset`` names the rows to inspect.
    for labels in row_subsets:
        for mode in modes:
            ray_result = ray_df.dropna(how=mode, axis=1, subset=labels)
            pd_result = pd_df.dropna(how=mode, axis=1, subset=labels)
            assert ray_df_equals_pandas(ray_result, pd_result)


@pytest.fixture
def test_dropna_subset_error(ray_df):
    """Check that ``dropna`` raises ``KeyError`` for the given bad subsets.

    Declared as a fixture, but other tests invoke it directly as a helper
    (for example ``test_dense_nan_df``, whose frame has columns ``ABCD``).

    Args:
        ray_df: The DataFrame under test.
    """
    # Each entry is one set of keyword arguments expected to raise KeyError.
    failing_calls = (
        {'subset': list('EF')},
        {'axis': 1, 'subset': [4, 5]},
    )

    for kwargs in failing_calls:
        with pytest.raises(KeyError):
            ray_df.dropna(**kwargs)


def test_duplicated():
ray_df = create_test_dataframe()

Expand Down