FIX-#1953: Fix computing of reduced indices for reduction operations

Signed-off-by: Igoshev, Yaroslav <yaroslav.igoshev@intel.com>
modin-project · Sep 2, 2020 · 8fec8a3 · 8fec8a3
1 parent ff6ff0d
commit 8fec8a3
Show file tree

Hide file tree

Showing 7 changed files with 800 additions and 231 deletions.
diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py
@@ -78,9 +78,9 @@ def __init__(
             )
         self._column_widths_cache = column_widths
         self._dtypes = dtypes
-        self._filter_empties()
         if validate_axes is not False:
             self._validate_internal_indices(mode=validate_axes)
+        self._filter_empties()
 
     @property
     def _row_lengths(self):
@@ -284,6 +284,11 @@ def _validate_axis_equality(self, axis: int, force: bool = False):
         is_lenghts_matches = len(self.axes[axis]) == len(internal_axis)
         if not is_equals:
             if force:
+                if not is_lenghts_matches:
+                    if axis:
+                        self._column_widths_cache = None
+                    else:
+                        self._row_lengths_cache = None
                 new_axis = self.axes[axis] if is_lenghts_matches else internal_axis
                 self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches)
             else:
@@ -336,9 +341,9 @@ def _validate_internal_indices(self, mode=None, **kwargs):
         args = args_dict.get(mode, args_dict["custom"])
 
         if args.get("validate_index", True):
-            self._validate_axis_equality(axis=0)
+            self._validate_axis_equality(axis=0, force=args.get("force"))
         if args.get("validate_columns", True):
-            self._validate_axis_equality(axis=1)
+            self._validate_axis_equality(axis=1, force=args.get("force"))
 
     def _apply_index_objs(self, axis=None):
         """Lazily applies the index object (Index or Columns) to the partitions.
@@ -1000,13 +1005,19 @@ def _compute_map_reduce_metadata(self, axis, new_parts):
         )
 
     def _fold_reduce(self, axis, func):
-        """Applies map that reduce Manager to series but require knowledge of full axis.
+        """
+        Apply a function that reduces the Manager to a series but needs the full axis.
 
-        Args:
-            func: Function to reduce the Manager by. This function takes in a Manager.
-            axis: axis to apply the function to.
+        Parameters
+        ----------
+        axis : 0 or 1
+            The axis to apply the function to (0 - index, 1 - columns).
+        func : callable
+            The function to reduce the Manager by. This function takes in a Manager.
 
-        Return:
+        Returns
+        -------
+        BasePandasFrame
             Pandas series containing the reduced data.
         """
         func = self._build_mapreduce_func(axis, func)

diff --git a/modin/pandas/base.py b/modin/pandas/base.py
@@ -1655,29 +1655,6 @@ def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
             )
         )
 
-    def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
-        """Computes median across the DataFrame.
-
-        Args:
-            axis (int): The axis to take the median on.
-            skipna (bool): True to skip NA values, false otherwise.
-
-        Returns:
-            The median of the DataFrame. (Pandas series)
-        """
-        axis = self._get_axis_number(axis) if axis is not None else 0
-        if numeric_only is not None and not numeric_only:
-            self._validate_dtypes(numeric_only=True)
-        return self._reduce_dimension(
-            self._query_compiler.median(
-                axis=axis,
-                skipna=skipna,
-                level=level,
-                numeric_only=numeric_only,
-                **kwargs,
-            )
-        )
-
     def memory_usage(self, index=True, deep=False):
         """Returns the memory usage of each column in bytes
 
@@ -1862,52 +1839,6 @@ def pow(self, other, axis="columns", level=None, fill_value=None):
             "pow", other, axis=axis, level=level, fill_value=fill_value
         )
 
-    def prod(
-        self,
-        axis=None,
-        skipna=None,
-        level=None,
-        numeric_only=None,
-        min_count=0,
-        **kwargs,
-    ):
-        """Return the product of the values for the requested axis
-
-        Args:
-            axis : {index (0), columns (1)}
-            skipna : boolean, default True
-            level : int or level name, default None
-            numeric_only : boolean, default None
-            min_count : int, default 0
-
-        Returns:
-            prod : Series or DataFrame (if level specified)
-        """
-        axis = self._get_axis_number(axis) if axis is not None else 0
-        data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
-        if min_count > 1:
-            return data._reduce_dimension(
-                query_compiler=data._query_compiler.prod_min_count(
-                    axis=axis,
-                    skipna=skipna,
-                    level=level,
-                    numeric_only=numeric_only,
-                    min_count=min_count,
-                    **kwargs,
-                )
-            )
-        return data._reduce_dimension(
-            data._query_compiler.prod(
-                axis=axis,
-                skipna=skipna,
-                level=level,
-                numeric_only=numeric_only,
-                min_count=min_count,
-                **kwargs,
-            )
-        )
-
-    product = prod
     radd = add
 
     def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
@@ -2733,32 +2664,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         else:
             return self.tshift(periods, freq)
 
-    def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
-        """Return unbiased skew over requested axis Normalized by N-1
-
-        Args:
-            axis : {index (0), columns (1)}
-            skipna : boolean, default True
-            Exclude NA/null values when computing the result.
-            level : int or level name, default None
-            numeric_only : boolean, default None
-
-        Returns:
-            skew : Series or DataFrame (if level specified)
-        """
-        axis = self._get_axis_number(axis) if axis is not None else 0
-        if numeric_only is not None and not numeric_only:
-            self._validate_dtypes(numeric_only=True)
-        return self._reduce_dimension(
-            self._query_compiler.skew(
-                axis=axis,
-                skipna=skipna,
-                level=level,
-                numeric_only=numeric_only,
-                **kwargs,
-            )
-        )
-
     def sort_index(
         self,
         axis=0,
@@ -2842,33 +2747,6 @@ def sort_values(
             )
         return self._create_or_update_from_compiler(result, inplace)
 
-    def std(
-        self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
-    ):
-        """Computes standard deviation across the DataFrame.
-
-        Args:
-            axis (int): The axis to take the std on.
-            skipna (bool): True to skip NA values, false otherwise.
-            ddof (int): degrees of freedom
-
-        Returns:
-            The std of the DataFrame (Pandas Series)
-        """
-        axis = self._get_axis_number(axis) if axis is not None else 0
-        if numeric_only is not None and not numeric_only:
-            self._validate_dtypes(numeric_only=True)
-        return self._reduce_dimension(
-            self._query_compiler.std(
-                axis=axis,
-                skipna=skipna,
-                level=level,
-                ddof=ddof,
-                numeric_only=numeric_only,
-                **kwargs,
-            )
-        )
-
     def sub(self, other, axis="columns", level=None, fill_value=None):
         """Subtract a DataFrame/Series/scalar from this DataFrame.
 
@@ -2887,50 +2765,6 @@ def sub(self, other, axis="columns", level=None, fill_value=None):
 
     subtract = sub
 
-    def sum(
-        self,
-        axis=None,
-        skipna=None,
-        level=None,
-        numeric_only=None,
-        min_count=0,
-        **kwargs,
-    ):
-        """Perform a sum across the DataFrame.
-
-        Args:
-            axis (int): The axis to sum on.
-            skipna (bool): True to skip NA values, false otherwise.
-
-        Returns:
-            The sum of the DataFrame.
-        """
-        axis = self._get_axis_number(axis) if axis is not None else 0
-        data = self._validate_dtypes_sum_prod_mean(
-            axis, numeric_only, ignore_axis=False
-        )
-        if min_count > 1:
-            return data._reduce_dimension(
-                query_compiler=data._query_compiler.sum_min_count(
-                    axis=axis,
-                    skipna=skipna,
-                    level=level,
-                    numeric_only=numeric_only,
-                    min_count=min_count,
-                    **kwargs,
-                )
-            )
-        return data._reduce_dimension(
-            data._query_compiler.sum(
-                axis=axis,
-                skipna=skipna,
-                level=level,
-                numeric_only=numeric_only,
-                min_count=min_count,
-                **kwargs,
-            )
-        )
-
     def swapaxes(self, axis1, axis2, copy=True):
         axis1 = self._get_axis_number(axis1)
         axis2 = self._get_axis_number(axis2)
@@ -3333,33 +3167,6 @@ def tz_localize(
         )
         return self.set_axis(labels=new_labels, axis=axis, inplace=not copy)
 
-    def var(
-        self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
-    ):
-        """Computes variance across the DataFrame.
-
-        Args:
-            axis (int): The axis to take the variance on.
-            skipna (bool): True to skip NA values, false otherwise.
-            ddof (int): degrees of freedom
-
-        Returns:
-            The variance of the DataFrame.
-        """
-        axis = self._get_axis_number(axis) if axis is not None else 0
-        if numeric_only is not None and not numeric_only:
-            self._validate_dtypes(numeric_only=True)
-        return self._reduce_dimension(
-            self._query_compiler.var(
-                axis=axis,
-                skipna=skipna,
-                level=level,
-                ddof=ddof,
-                numeric_only=numeric_only,
-                **kwargs,
-            )
-        )
-
     def __abs__(self):
         """Creates a modified DataFrame by taking the absolute value.