fix: fix value_counts column label for normalize=True (#245)

TrevorBergeron · web-flow · commit d3fa6f26931d · 2023-12-06T19:04:15.000Z
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:

- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
@@ -353,7 +353,9 @@ def value_counts(
                 )
             ]
         )
-    return block.select_column(count_id).with_column_labels(["count"])
+    return block.select_column(count_id).with_column_labels(
+        ["proportion" if normalize else "count"]
+    )
 
 
 def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block:
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -3453,6 +3453,8 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index):
     ],
 )
 def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna):
+    if pd.__version__.startswith("1."):
+        pytest.skip("pandas 1.x produces different column labels.")
     scalars_df, scalars_pandas_df = scalars_dfs
 
     bf_result = (
@@ -3464,10 +3466,6 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna):
         subset, normalize=normalize, ascending=ascending, dropna=dropna
     )
 
-    # Older pandas version may not have these values, bigframes tries to emulate 2.0+
-    pd_result.name = "count"
-    pd_result.index.names = bf_result.index.names
-
     pd.testing.assert_series_equal(
         bf_result, pd_result, check_dtype=False, check_index_type=False
     )
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
@@ -1940,23 +1940,23 @@ def test_cummax_int(scalars_df_index, scalars_pandas_df_index):
 
 
 def test_value_counts(scalars_dfs):
+    if pd.__version__.startswith("1."):
+        pytest.skip("pandas 1.x produces different column labels.")
     scalars_df, scalars_pandas_df = scalars_dfs
     col_name = "int64_too"
 
     bf_result = scalars_df[col_name].value_counts().to_pandas()
     pd_result = scalars_pandas_df[col_name].value_counts()
 
-    # Older pandas version may not have these values, bigframes tries to emulate 2.0+
-    pd_result.name = "count"
-    pd_result.index.name = col_name
-
     pd.testing.assert_series_equal(
         bf_result,
         pd_result,
     )
 
 
 def test_value_counts_w_cut(scalars_dfs):
+    if pd.__version__.startswith("1."):
+        pytest.skip("value_counts results different in pandas 1.x.")
     scalars_df, scalars_pandas_df = scalars_dfs
     col_name = "int64_col"
 
@@ -1965,9 +1965,6 @@ def test_value_counts_w_cut(scalars_dfs):
 
     bf_result = bf_cut.value_counts().to_pandas()
     pd_result = pd_cut.value_counts()
-    # Older pandas version may not have these values, bigframes tries to emulate 2.0+
-    pd_result.name = "count"
-    pd_result.index.name = col_name
     pd_result.index = pd_result.index.astype(pd.Int64Dtype())
 
     pd.testing.assert_series_equal(

Original file line number	Diff line number	Diff line change
`@@ -353,7 +353,9 @@ def value_counts(`
`353`	`353`	`)`
`354`	`354`	`]`
`355`	`355`	`)`
`356`		`- return block.select_column(count_id).with_column_labels(["count"])`
	`356`	`+ return block.select_column(count_id).with_column_labels(`
	`357`	`+ ["proportion" if normalize else "count"]`
	`358`	`+ )`
`357`	`359`
`358`	`360`
`359`	`361`	`def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block:`
Original file line number	Diff line number	Diff line change
`@@ -3453,6 +3453,8 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index):`
`3453`	`3453`	`],`
`3454`	`3454`	`)`
`3455`	`3455`	`def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna):`
	`3456`	`+ if pd.__version__.startswith("1."):`
	`3457`	`+ pytest.skip("pandas 1.x produces different column labels.")`
`3456`	`3458`	`scalars_df, scalars_pandas_df = scalars_dfs`
`3457`	`3459`
`3458`	`3460`	`bf_result = (`
`@@ -3464,10 +3466,6 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna):`
`3464`	`3466`	`subset, normalize=normalize, ascending=ascending, dropna=dropna`
`3465`	`3467`	`)`
`3466`	`3468`
`3467`		`- # Older pandas version may not have these values, bigframes tries to emulate 2.0+`
`3468`		`- pd_result.name = "count"`
`3469`		`- pd_result.index.names = bf_result.index.names`
`3470`		`-`
`3471`	`3469`	`pd.testing.assert_series_equal(`
`3472`	`3470`	`bf_result, pd_result, check_dtype=False, check_index_type=False`
`3473`	`3471`	`)`