Skip to content

Commit c863b3d

Browse files
authored
FIX-#2453: Remove sorting indices for equal values in Series.value_counts (#2454)
Signed-off-by: Igoshev, Yaroslav <yaroslav.igoshev@intel.com>
1 parent 3f65c89 commit c863b3d

File tree

5 files changed

+33
-87
lines changed

5 files changed

+33
-87
lines changed

docs/supported_apis/series_supported.rst

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -474,10 +474,8 @@ the related section on `Defaulting to pandas`_.
474474
+-----------------------------+---------------------------------+----------------------------------------------------+
475475
| ``valid`` | D | |
476476
+-----------------------------+---------------------------------+----------------------------------------------------+
477-
| ``value_counts`` | Y | The indices of resulting object will be in |
478-
| | | descending (ascending, if ascending=True) order for|
479-
| | | equal values. |
480-
| | | In pandas indices are located in random order. |
477+
| ``value_counts`` | Y | The index order of the resulting object may differ |
478+
| | | from pandas. |
481479
+-----------------------------+---------------------------------+----------------------------------------------------+
482480
| ``values`` | Y | |
483481
+-----------------------------+---------------------------------+----------------------------------------------------+

docs/supported_apis/utilities_supported.rst

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,8 @@ default to pandas.
2121
+---------------------------+---------------------------------+----------------------------------------------------+
2222
| `pd.unique`_ | Y | |
2323
+---------------------------+---------------------------------+----------------------------------------------------+
24-
| ``pd.value_counts`` | Y | The indices of resulting object will be in |
25-
| | | descending (ascending, if ascending=True) order for|
26-
| | | equal values. |
27-
| | | In pandas indices are located in random order. |
24+
| ``pd.value_counts`` | Y | The index order of the resulting object may differ |
25+
| | | from pandas. |
2826
+---------------------------+---------------------------------+----------------------------------------------------+
2927
| `pd.cut`_ | D | |
3028
+---------------------------+---------------------------------+----------------------------------------------------+

modin/backends/pandas/query_compiler.py

Lines changed: 1 addition & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -750,58 +750,7 @@ def reduce_func(df, *args, **kwargs):
750750
if normalize:
751751
result = result / df.squeeze(axis=1).sum()
752752

753-
result = result.sort_values(ascending=ascending) if sort else result
754-
755-
# We want to sort both values and indices of the result object.
756-
# This function will sort indices for equal values.
757-
def sort_index_for_equal_values(result, ascending):
758-
"""
759-
Sort indices for equal values of result object.
760-
761-
Parameters
762-
----------
763-
result : pandas.Series or pandas.DataFrame with one column
764-
The object whose indices for equal values need to be sorted.
765-
ascending : boolean
766-
Sort in ascending (if it is True) or descending (if it is False) order.
767-
768-
Returns
769-
-------
770-
pandas.DataFrame
771-
A new DataFrame with sorted indices.
772-
"""
773-
is_range = False
774-
is_end = False
775-
i = 0
776-
new_index = np.empty(len(result), dtype=type(result.index))
777-
while i < len(result):
778-
j = i
779-
if i < len(result) - 1:
780-
while result[result.index[i]] == result[result.index[i + 1]]:
781-
i += 1
782-
if is_range is False:
783-
is_range = True
784-
if i == len(result) - 1:
785-
is_end = True
786-
break
787-
if is_range:
788-
k = j
789-
for val in sorted(
790-
result.index[j : i + 1], reverse=not ascending
791-
):
792-
new_index[k] = val
793-
k += 1
794-
if is_end:
795-
break
796-
is_range = False
797-
else:
798-
new_index[j] = result.index[j]
799-
i += 1
800-
return pandas.DataFrame(
801-
result, index=new_index, columns=["__reduced__"]
802-
)
803-
804-
return sort_index_for_equal_values(result, ascending)
753+
return result.sort_values(ascending=ascending) if sort else result
805754

806755
return MapReduceFunction.register(
807756
map_func, reduce_func, axis=0, preserve_index=False

modin/pandas/test/test_general.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -352,23 +352,29 @@ def sort_index_for_equal_values(result, ascending):
352352
else:
353353
new_index[j] = result.index[j]
354354
i += 1
355-
return pandas.Series(result, index=new_index)
355+
return type(result)(result, index=new_index)
356356

357-
# We sort indices for pandas result because of issue #1650
357+
# We sort indices for both Modin and pandas results because of issue #1650
358358
values = np.array([3, 1, 2, 3, 4, np.nan])
359-
modin_result = pd.value_counts(values, normalize=normalize, ascending=False)
359+
modin_result = sort_index_for_equal_values(
360+
pd.value_counts(values, normalize=normalize, ascending=False), False
361+
)
360362
pandas_result = sort_index_for_equal_values(
361363
pandas.value_counts(values, normalize=normalize, ascending=False), False
362364
)
363365
df_equals(modin_result, pandas_result)
364366

365-
modin_result = pd.value_counts(values, bins=bins, ascending=False)
367+
modin_result = sort_index_for_equal_values(
368+
pd.value_counts(values, bins=bins, ascending=False), False
369+
)
366370
pandas_result = sort_index_for_equal_values(
367371
pandas.value_counts(values, bins=bins, ascending=False), False
368372
)
369373
df_equals(modin_result, pandas_result)
370374

371-
modin_result = pd.value_counts(values, dropna=dropna, ascending=True)
375+
modin_result = sort_index_for_equal_values(
376+
pd.value_counts(values, dropna=dropna, ascending=True), True
377+
)
372378
pandas_result = sort_index_for_equal_values(
373379
pandas.value_counts(values, dropna=dropna, ascending=True), True
374380
)

modin/pandas/test/test_series.py

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from pandas.core.base import SpecificationError
2222
import sys
2323

24-
from modin.utils import to_pandas, get_current_backend
24+
from modin.utils import to_pandas
2525
from .utils import (
2626
random_state,
2727
RAND_LOW,
@@ -3376,33 +3376,28 @@ def sort_index_for_equal_values(result, ascending):
33763376
i += 1
33773377
return type(result)(result, index=new_index)
33783378

3379-
# We sort indices for pandas result because of issue #1650
3379+
# We sort indices for both Modin and pandas results because of issue #1650
33803380
modin_series, pandas_series = create_test_series(test_data_values[0])
3381-
modin_result = modin_series.value_counts(normalize=normalize, ascending=False)
3382-
3383-
if get_current_backend() == "BaseOnPython":
3384-
modin_result = sort_index_for_equal_values(modin_result, ascending=False)
33853381

3382+
modin_result = sort_index_for_equal_values(
3383+
modin_series.value_counts(normalize=normalize, ascending=False), False
3384+
)
33863385
pandas_result = sort_index_for_equal_values(
33873386
pandas_series.value_counts(normalize=normalize, ascending=False), False
33883387
)
33893388
df_equals(modin_result, pandas_result)
33903389

3391-
modin_result = modin_series.value_counts(bins=bins, ascending=False)
3392-
3393-
if get_current_backend() == "BaseOnPython":
3394-
modin_result = sort_index_for_equal_values(modin_result, ascending=False)
3395-
3390+
modin_result = sort_index_for_equal_values(
3391+
modin_series.value_counts(bins=bins, ascending=False), False
3392+
)
33963393
pandas_result = sort_index_for_equal_values(
33973394
pandas_series.value_counts(bins=bins, ascending=False), False
33983395
)
33993396
df_equals(modin_result, pandas_result)
34003397

3401-
modin_result = modin_series.value_counts(dropna=dropna, ascending=True)
3402-
3403-
if get_current_backend() == "BaseOnPython":
3404-
modin_result = sort_index_for_equal_values(modin_result, ascending=True)
3405-
3398+
modin_result = sort_index_for_equal_values(
3399+
modin_series.value_counts(dropna=dropna, ascending=True), True
3400+
)
34063401
pandas_result = sort_index_for_equal_values(
34073402
pandas_series.value_counts(dropna=dropna, ascending=True), True
34083403
)
@@ -3412,20 +3407,20 @@ def sort_index_for_equal_values(result, ascending):
34123407
arr = np.random.rand(2 ** 6)
34133408
arr[::10] = np.nan
34143409
modin_series, pandas_series = create_test_series(arr)
3415-
modin_result = modin_series.value_counts(dropna=False, ascending=True)
3410+
modin_result = sort_index_for_equal_values(
3411+
modin_series.value_counts(dropna=False, ascending=True), True
3412+
)
34163413
pandas_result = sort_index_for_equal_values(
34173414
pandas_series.value_counts(dropna=False, ascending=True), True
34183415
)
3419-
if get_current_backend() == "BaseOnPython":
3420-
modin_result = sort_index_for_equal_values(modin_result, ascending=True)
34213416
df_equals(modin_result, pandas_result)
34223417

3423-
modin_result = modin_series.value_counts(dropna=False, ascending=False)
3418+
modin_result = sort_index_for_equal_values(
3419+
modin_series.value_counts(dropna=False, ascending=False), False
3420+
)
34243421
pandas_result = sort_index_for_equal_values(
34253422
pandas_series.value_counts(dropna=False, ascending=False), False
34263423
)
3427-
if get_current_backend() == "BaseOnPython":
3428-
modin_result = sort_index_for_equal_values(modin_result, ascending=False)
34293424
df_equals(modin_result, pandas_result)
34303425

34313426

0 commit comments

Comments
 (0)