SNOW-1527038 Fix value_counts bug where the index is not sorted properly
sfc-gh-vbudati authored Jul 10, 2024
1 parent be0e0f3 commit 3b037e5
Showing 3 changed files with 67 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -104,6 +104,7 @@
- Fixed a bug in `DataFrame.rolling` and `Series.rolling` so `window=0` now throws `NotImplementedError` instead of `ValueError`
- Fixed a bug in `DataFrame` and `Series` with `dtype=np.uint64` resulting in precision errors
- Fixed bug where `values` is set to `index` when `index` and `columns` contain all columns in DataFrame during `pivot_table`.
- Fixed bug where `value_counts` did not order the result correctly when `sort=True`.

#### Improvements

@@ -10177,7 +10177,7 @@ def value_counts(
by=by,
agg_func={COUNT_LABEL: "count"},
axis=0,
groupby_kwargs={"dropna": dropna},
groupby_kwargs={"dropna": dropna, "sort": False},
agg_args=(),
agg_kwargs={},
)
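
The "sort": False added to groupby_kwargs keeps the groupby from ordering the result by the group keys; ordering is applied explicitly further down instead. A minimal plain-pandas sketch of the idea (illustration only, not the Snowpark query-compiler code; the series and names are made up):

import pandas as pd

s = pd.Series(["b", "a", "b", "a", "c"])
# sort=False keeps the groups in first-appearance order (b, a, c), not key order
counts = s.groupby(s, sort=False).size()
# value_counts(sort=True) must order by frequency, not by key; a stable sort keeps
# the tie ("b" and "a" both appear twice) in original relative order
result = counts.sort_values(ascending=False, kind="stable")
print(result)  # b    2, a    2, c    1
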
@@ -10196,13 +10196,19 @@
count_identifier = internal_frame.data_column_snowflake_quoted_identifiers[
0
]
internal_frame = internal_frame.ensure_row_position_column()

# When sort=True, sort by the frequency (count column);
# otherwise, respect the original order (use the original ordering columns)
ordered_dataframe = internal_frame.ordered_dataframe
if sort:
# Need to explicitly specify the row position identifier to enforce the original order.
ordered_dataframe = ordered_dataframe.sort(
OrderingColumn(count_identifier, ascending=ascending)
OrderingColumn(count_identifier, ascending=ascending),
OrderingColumn(
internal_frame.row_position_snowflake_quoted_identifier,
ascending=True,
),
)

return SnowflakeQueryCompiler(
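The hunk above is the ordering fix itself: when sort=True, the result is ordered primarily by the count column and secondarily by the original row position, so rows with equal counts keep their original relative order. A rough sketch of that two-key stable ordering in plain Python (hypothetical tuples standing in for the internal frame's columns, not the OrderedDataFrame API):

ascending = False
# (value, count, row_position) triples standing in for the frame's columns
rows = [("a", 2, 0), ("b", 3, 1), ("c", 2, 2)]
# Python's sort is stable: sorting by the secondary key first and the primary key
# second orders by count while breaking ties by original row position
rows.sort(key=lambda r: r[2])
rows.sort(key=lambda r: r[1], reverse=not ascending)
print(rows)  # [('b', 3, 1), ('a', 2, 0), ('c', 2, 2)]
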
58 changes: 58 additions & 0 deletions tests/integ/modin/series/test_value_counts.py
@@ -27,6 +27,47 @@
]


NATIVE_SERIES_TEST_DATA = [
native_pd.Series(
[1, 2, 3, 2, 3, 5, 6, 7, 8, 4, 4, 5, 6, 7, 1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 7]
),
native_pd.Series([1.1, 2.2, 1.0, 1, 1.1, 2.2, 1, 1, 1, 2, 2, 2, 2.2]),
native_pd.Series([1, 3, 1, 1, 1, 3, 1, 1, 1, 2, 2, 2, 3]),
native_pd.Series(
[True, False, True, False, True, False, True, False, True, True], dtype=bool
),
native_pd.Series(
[
"a",
"b",
"c",
"b",
"c",
"e",
"f",
"g",
"h",
"d",
"d",
"e",
"f",
"g",
"a",
"b",
"a",
"b",
"c",
"d",
"c",
"d",
"e",
"f",
"g",
]
),
]


@pytest.mark.parametrize("test_data", TEST_DATA)
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@@ -79,3 +120,20 @@ def test_value_counts_dropna(test_data, dropna):
def test_value_counts_bins():
with pytest.raises(NotImplementedError, match="bins argument is not yet supported"):
pd.Series([1, 2, 3, 4]).value_counts(bins=3)


@pytest.mark.parametrize("native_series", NATIVE_SERIES_TEST_DATA)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
@sql_count_checker(query_count=1)
def test_series_value_counts(native_series, normalize, sort, ascending, dropna):
snow_series = pd.Series(native_series)
eval_snowpark_pandas_result(
snow_series,
native_series,
lambda s: s.value_counts(
normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
),
)
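
The new test sweeps every combination of normalize, sort, ascending, and dropna against native pandas via eval_snowpark_pandas_result. For one of the series above, the tie case the fix targets looks roughly like this (plain pandas; the expected counts in the comment are an assumption worked out by hand):

import pandas as native_pd

s = native_pd.Series([1, 3, 1, 1, 1, 3, 1, 1, 1, 2, 2, 2, 3])
# counts: 1 -> 7, 3 -> 3, 2 -> 3; with normalize=True these become 7/13, 3/13, 3/13
print(s.value_counts(normalize=True, sort=True, ascending=False, dropna=True))
# the tie between 3 and 2 (both 3/13) is exactly the ordering case this commit fixes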
