SNOW-1527038 Fix value_counts bug where the index is not sorted properly
sfc-gh-vbudati authored Jul 10, 2024
1 parent be0e0f3 commit 3b037e5
Showing 3 changed files with 67 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -104,6 +104,7 @@
- Fixed a bug in `DataFrame.rolling` and `Series.rolling` so `window=0` now throws `NotImplementedError` instead of `ValueError`
- Fixed a bug in `DataFrame` and `Series` with `dtype=np.uint64` resulting in precision errors
- Fixed bug where `values` is set to `index` when `index` and `columns` contain all columns in DataFrame during `pivot_table`.
- Fixed bug where `value_counts` did not order the result correctly when `sort=True`.

#### Improvements

@@ -10177,7 +10177,7 @@ def value_counts(
by=by,
agg_func={COUNT_LABEL: "count"},
axis=0,
groupby_kwargs={"dropna": dropna},
groupby_kwargs={"dropna": dropna, "sort": False},
agg_args=(),
agg_kwargs={},
)
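
The "sort": False added to groupby_kwargs keeps the groupby from ordering the result by the group keys; ordering is applied explicitly further down instead. A minimal plain-pandas sketch of the idea (illustration only, not the Snowpark query-compiler code; the series and names are made up):

import pandas as pd

s = pd.Series(["b", "a", "b", "a", "c"])
# sort=False keeps the groups in first-appearance order (b, a, c), not key order
counts = s.groupby(s, sort=False).size()
# value_counts(sort=True) must order by frequency, not by key; a stable sort keeps
# the tie ("b" and "a" both appear twice) in original relative order
result = counts.sort_values(ascending=False, kind="stable")
print(result)  # b    2, a    2, c    1
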
@@ -10196,13 +10196,19 @@
count_identifier = internal_frame.data_column_snowflake_quoted_identifiers[
0
]
internal_frame = internal_frame.ensure_row_position_column()

# When sort=True, sort by the frequency (count column);
# otherwise, respect the original order (use the original ordering columns)
ordered_dataframe = internal_frame.ordered_dataframe
if sort:
# Need to explicitly specify the row position identifier to enforce the original order.
ordered_dataframe = ordered_dataframe.sort(
OrderingColumn(count_identifier, ascending=ascending)
OrderingColumn(count_identifier, ascending=ascending),
OrderingColumn(
internal_frame.row_position_snowflake_quoted_identifier,
ascending=True,
),
)

return SnowflakeQueryCompiler(
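The hunk above is the ordering fix itself: when sort=True, the result is ordered primarily by the count column and secondarily by the original row position, so rows with equal counts keep their original relative order. A rough sketch of that two-key stable ordering in plain Python (hypothetical tuples standing in for the internal frame's columns, not the OrderedDataFrame API):

ascending = False
# (value, count, row_position) triples standing in for the frame's columns
rows = [("a", 2, 0), ("b", 3, 1), ("c", 2, 2)]
# Python's sort is stable: sorting by the secondary key first and the primary key
# second orders by count while breaking ties by original row position
rows.sort(key=lambda r: r[2])
rows.sort(key=lambda r: r[1], reverse=not ascending)
print(rows)  # [('b', 3, 1), ('a', 2, 0), ('c', 2, 2)]
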
58 changes: 58 additions & 0 deletions tests/integ/modin/series/test_value_counts.py
@@ -27,6 +27,47 @@
]


NATIVE_SERIES_TEST_DATA = [
native_pd.Series(
[1, 2, 3, 2, 3, 5, 6, 7, 8, 4, 4, 5, 6, 7, 1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 7]
),
native_pd.Series([1.1, 2.2, 1.0, 1, 1.1, 2.2, 1, 1, 1, 2, 2, 2, 2.2]),
native_pd.Series([1, 3, 1, 1, 1, 3, 1, 1, 1, 2, 2, 2, 3]),
native_pd.Series(
[True, False, True, False, True, False, True, False, True, True], dtype=bool
),
native_pd.Series(
[
"a",
"b",
"c",
"b",
"c",
"e",
"f",
"g",
"h",
"d",
"d",
"e",
"f",
"g",
"a",
"b",
"a",
"b",
"c",
"d",
"c",
"d",
"e",
"f",
"g",
]
),
]


@pytest.mark.parametrize("test_data", TEST_DATA)
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@@ -79,3 +120,20 @@ def test_value_counts_dropna(test_data, dropna):
def test_value_counts_bins():
with pytest.raises(NotImplementedError, match="bins argument is not yet supported"):
pd.Series([1, 2, 3, 4]).value_counts(bins=3)


@pytest.mark.parametrize("native_series", NATIVE_SERIES_TEST_DATA)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
@sql_count_checker(query_count=1)
def test_series_value_counts(native_series, normalize, sort, ascending, dropna):
snow_series = pd.Series(native_series)
eval_snowpark_pandas_result(
snow_series,
native_series,
lambda s: s.value_counts(
normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
),
)
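
The new test sweeps every combination of normalize, sort, ascending, and dropna against native pandas via eval_snowpark_pandas_result. For one of the series above, the tie case the fix targets looks roughly like this (plain pandas; the expected counts in the comment are an assumption worked out by hand):

import pandas as native_pd

s = native_pd.Series([1, 3, 1, 1, 1, 3, 1, 1, 1, 2, 2, 2, 3])
# counts: 1 -> 7, 3 -> 3, 2 -> 3; with normalize=True these become 7/13, 3/13, 3/13
print(s.value_counts(normalize=True, sort=True, ascending=False, dropna=True))
# the tie between 3 and 2 (both 3/13) is exactly the ordering case this commit fixes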
