FIX-#2313: improved handling non-numeric types at 'mean' when 'axis=1' (#2535)

dchigarev · web-flow · commit 787d2b1cb40d · 2020-12-15T18:32:54.000+03:00
Signed-off-by: Dmitry Chigarev &lt;dmitry.chigarev@intel.com&gt;
diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py
@@ -260,3 +260,6 @@ def time_nunique(self, impl, data_type, data_size, axis):
 
     def time_apply(self, impl, data_type, data_size, axis):
         self.df.apply(lambda df: df.sum(), axis=axis)
+    
+    def time_mean(self, impl, data_type, data_size, axis):
+        self.df.mean(axis=axis)
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
@@ -677,33 +677,31 @@ def mean(self, axis, **kwargs):
 
         skipna = kwargs.get("skipna", True)
 
-        def map_apply_fn(ser, **kwargs):
-            try:
-                sum_result = ser.sum(skipna=skipna)
-                count_result = ser.count()
-            except TypeError:
-                return None
-            else:
-                return (sum_result, count_result)
-
-        def reduce_apply_fn(ser, **kwargs):
-            sum_result = ser.apply(lambda x: x[0]).sum(skipna=skipna)
-            count_result = ser.apply(lambda x: x[1]).sum(skipna=skipna)
-            return sum_result / count_result
+        # TODO-FIX: this function may work incorrectly with user-defined "numeric" values.
+        # Since `count(numeric_only=True)` discards all unknown "numeric" types, we can get incorrect
+        # divisor inside the reduce function.
+        def map_fn(df, **kwargs):
+            result = pandas.DataFrame(
+                {
+                    "sum": df.sum(axis=axis, skipna=skipna),
+                    "count": df.count(axis=axis, numeric_only=True),
+                }
+            )
+            return result if axis else result.T
 
         def reduce_fn(df, **kwargs):
-            df.dropna(axis=1, inplace=True, how="any")
-            return build_applyier(reduce_apply_fn, axis=axis)(df)
-
-        def build_applyier(func, **applyier_kwargs):
-            def applyier(df, **kwargs):
-                result = df.apply(func, **applyier_kwargs)
-                return result.set_axis(df.axes[axis ^ 1], axis=0)
+            sum_cols = df["sum"] if axis else df.loc["sum"]
+            count_cols = df["count"] if axis else df.loc["count"]
 
-            return applyier
+            if not isinstance(sum_cols, pandas.Series):
+                # If we got `NaN` as the result of the sum in any axis partition,
+                # then we must consider the whole sum as `NaN`, so setting `skipna=False`
+                sum_cols = sum_cols.sum(axis=axis, skipna=False)
+                count_cols = count_cols.sum(axis=axis, skipna=False)
+            return sum_cols / count_cols
 
         return MapReduceFunction.register(
-            build_applyier(map_apply_fn, axis=axis, result_type="reduce"),
+            map_fn,
             reduce_fn,
             preserve_index=(kwargs.get("numeric_only") is not None),
         )(self, axis=axis, **kwargs)
diff --git a/modin/pandas/test/dataframe/test_reduction.py b/modin/pandas/test/dataframe/test_reduction.py
@@ -363,8 +363,6 @@ def test_sum_single_column(data):
     "numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys)
 )
 def test_reduction_specific(fn, numeric_only, axis):
-    if fn == "mean" and axis == 1:
-        pytest.skip("See issue #2313 for details")
     eval_general(
         *create_test_dfs(test_data_diff_dtype),
         lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis),

Original file line number	Diff line number	Diff line change
`@@ -363,8 +363,6 @@ def test_sum_single_column(data):`
`363`	`363`	`"numeric_only", bool_arg_values, ids=arg_keys("numeric_only", bool_arg_keys)`
`364`	`364`	`)`
`365`	`365`	`def test_reduction_specific(fn, numeric_only, axis):`
`366`		`- if fn == "mean" and axis == 1:`
`367`		`- pytest.skip("See issue #2313 for details")`
`368`	`366`	`eval_general(`
`369`	`367`	`*create_test_dfs(test_data_diff_dtype),`
`370`	`368`	`lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis),`