Skip to content

Commit

Permalink
FIX-modin-project#6509: Fix 'reshuffling' in case of a string key
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
  • Loading branch information
dchigarev committed Aug 25, 2023
1 parent da385c9 commit 6434b3c
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 18 deletions.
15 changes: 15 additions & 0 deletions modin/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,3 +693,18 @@ def s3_resource(s3_base):
if not cli.list_buckets()["Buckets"]:
break
time.sleep(0.1)


@pytest.fixture
def modify_config(request):
    """Temporarily override configuration values while a test runs.

    The fixture is meant to be used with indirect parametrization:
    ``request.param`` must be a mapping whose keys are config objects
    exposing ``get()`` and ``put(value)`` and whose values are the settings
    to apply for the duration of the test.

    The previous value of every key that was touched is put back afterwards,
    including the case where applying one of the overrides raises part-way
    through setup.
    """
    values = request.param
    old_values = {}

    try:
        for key, value in values.items():
            # Remember the current value *before* overwriting it so that it
            # can be restored even if a later ``put`` fails.
            old_values[key] = key.get()
            key.put(value)

        yield  # waiting for the test to be completed
    finally:
        # restoring old parameters
        for key, value in old_values.items():
            key.put(value)
6 changes: 3 additions & 3 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2369,12 +2369,12 @@ def _apply_func_to_range_partitioning(
# simply combine all partitions and apply the sorting to the whole dataframe
return self.combine_and_apply(func=func)

if self.dtypes[key_column] == object:
if is_numeric_dtype(self.dtypes[key_column]):
method = "linear"
else:
# This means we are not sorting numbers, so we need our quantiles to not try
# arithmetic on the values.
method = "inverted_cdf"
else:
method = "linear"

shuffling_functions = build_sort_functions(
self,
Expand Down
17 changes: 17 additions & 0 deletions modin/pandas/test/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2787,3 +2787,20 @@ def perform(lib):
return getattr(grp, func)()

eval_general(pd, pandas, perform)


@pytest.mark.parametrize(
    "modify_config", [{ExperimentalGroupbyImpl: True}], indirect=True
)
def test_reshuffling_groupby_on_strings(modify_config):
    """Check ``groupby(...).mean()`` on a ``"string"``-dtype key column.

    Runs with ``ExperimentalGroupbyImpl`` set to ``True`` via the
    ``modify_config`` fixture.

    Reproducer from https://github.com/modin-project/modin/issues/6509
    """
    data = {"col1": ["a"] * 50 + ["b"] * 50, "col2": range(100)}
    key_dtype = {"col1": "string"}

    modin_df, pandas_df = create_test_dfs(data)

    # Cast the grouping column to the "string" dtype on both frames.
    modin_df = modin_df.astype(key_dtype)
    pandas_df = pandas_df.astype(key_dtype)

    modin_grp = modin_df.groupby("col1")
    pandas_grp = pandas_df.groupby("col1")

    eval_general(modin_grp, pandas_grp, lambda grp: grp.mean())
15 changes: 0 additions & 15 deletions modin/test/storage_formats/pandas/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,21 +117,6 @@ def construct_modin_df_by_scheme(pandas_df, partitioning_scheme):
return md_df


@pytest.fixture
def modify_config(request):
    """Temporarily override configuration values while a test runs.

    The fixture is meant to be used with indirect parametrization:
    ``request.param`` must be a mapping whose keys are config objects
    exposing ``get()`` and ``put(value)`` and whose values are the settings
    to apply for the duration of the test.

    The previous value of every key that was touched is put back afterwards,
    including the case where applying one of the overrides raises part-way
    through setup.
    """
    values = request.param
    old_values = {}

    try:
        for key, value in values.items():
            # Remember the current value *before* overwriting it so that it
            # can be restored even if a later ``put`` fails.
            old_values[key] = key.get()
            key.put(value)

        yield  # waiting for the test to be completed
    finally:
        # restoring old parameters
        for key, value in old_values.items():
            key.put(value)


def validate_partitions_cache(df):
"""Assert that the ``PandasDataframe`` shape caches correspond to the actual partition's shapes."""
row_lengths = df._row_lengths_cache
Expand Down

0 comments on commit 6434b3c

Please sign in to comment.