Skip to content

Commit 787d2b1

Browse files
authored
FIX-#2313: improved handling non-numeric types at 'mean' when 'axis=1' (#2535)
Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
1 parent 1a8cd0a commit 787d2b1

File tree

3 files changed: 23 additions and 24 deletions.

asv_bench/benchmarks/benchmarks.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,3 +260,6 @@ def time_nunique(self, impl, data_type, data_size, axis):
260 260

261 261
def time_apply(self, impl, data_type, data_size, axis):
262 262
self.df.apply(lambda df: df.sum(), axis=axis)
263+
264+
def time_mean(self, impl, data_type, data_size, axis):
265+
self.df.mean(axis=axis)

modin/backends/pandas/query_compiler.py

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -677,33 +677,31 @@ def mean(self, axis, **kwargs):
677 677

678 678
skipna = kwargs.get("skipna", True)
679 679

680-
def map_apply_fn(ser, **kwargs):
681-
try:
682-
sum_result = ser.sum(skipna=skipna)
683-
count_result = ser.count()
684-
except TypeError:
685-
return None
686-
else:
687-
return (sum_result, count_result)
688-
689-
def reduce_apply_fn(ser, **kwargs):
690-
sum_result = ser.apply(lambda x: x[0]).sum(skipna=skipna)
691-
count_result = ser.apply(lambda x: x[1]).sum(skipna=skipna)
692-
return sum_result / count_result
680+
# TODO-FIX: this function may work incorrectly with user-defined "numeric" values.
681+
# Since `count(numeric_only=True)` discards all unknown "numeric" types, we can get incorrect
682+
# divisor inside the reduce function.
683+
def map_fn(df, **kwargs):
684+
result = pandas.DataFrame(
685+
{
686+
"sum": df.sum(axis=axis, skipna=skipna),
687+
"count": df.count(axis=axis, numeric_only=True),
688+
}
689+
)
690+
return result if axis else result.T
693 691

694 692
def reduce_fn(df, **kwargs):
695-
df.dropna(axis=1, inplace=True, how="any")
696-
return build_applyier(reduce_apply_fn, axis=axis)(df)
697-
698-
def build_applyier(func, **applyier_kwargs):
699-
def applyier(df, **kwargs):
700-
result = df.apply(func, **applyier_kwargs)
701-
return result.set_axis(df.axes[axis ^ 1], axis=0)
693+
sum_cols = df["sum"] if axis else df.loc["sum"]
694+
count_cols = df["count"] if axis else df.loc["count"]
702 695

703-
return applyier
696+
if not isinstance(sum_cols, pandas.Series):
697+
# If we got `NaN` as the result of the sum in any axis partition,
698+
# then we must consider the whole sum as `NaN`, so setting `skipna=False`
699+
sum_cols = sum_cols.sum(axis=axis, skipna=False)
700+
count_cols = count_cols.sum(axis=axis, skipna=False)
701+
return sum_cols / count_cols
704 702

705 703
return MapReduceFunction.register(
706-
build_applyier(map_apply_fn, axis=axis, result_type="reduce"),
704+
map_fn,
707 705
reduce_fn,
708 706
preserve_index=(kwargs.get("numeric_only") is not None),
709 707
)(self, axis=axis, **kwargs)

modin/pandas/test/dataframe/test_reduction.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,6 @@ def test_sum_single_column(data):
363 363
"numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys)
364 364
)
365 365
def test_reduction_specific(fn, numeric_only, axis):
366-
if fn == "mean" and axis == 1:
367-
pytest.skip("See issue #2313 for details")
368 366
eval_general(
369 367
*create_test_dfs(test_data_diff_dtype),
370 368
lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis),

0 commit comments

Comments
 (0)