Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX-#1953: Fix computing of reduced indices for reduction operation #1960

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 19 additions & 8 deletions modin/engines/base/frame/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,9 @@ def __init__(
)
self._column_widths_cache = column_widths
self._dtypes = dtypes
self._filter_empties()
if validate_axes is not False:
self._validate_internal_indices(mode=validate_axes)
self._filter_empties()

@property
def _row_lengths(self):
Expand Down Expand Up @@ -284,6 +284,11 @@ def _validate_axis_equality(self, axis: int, force: bool = False):
is_lenghts_matches = len(self.axes[axis]) == len(internal_axis)
if not is_equals:
if force:
if not is_lenghts_matches:
if axis:
self._column_widths_cache = None
else:
self._row_lengths_cache = None
new_axis = self.axes[axis] if is_lenghts_matches else internal_axis
self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches)
else:
Expand Down Expand Up @@ -336,9 +341,9 @@ def _validate_internal_indices(self, mode=None, **kwargs):
args = args_dict.get(mode, args_dict["custom"])

if args.get("validate_index", True):
self._validate_axis_equality(axis=0)
self._validate_axis_equality(axis=0, force=args.get("force"))
if args.get("validate_columns", True):
self._validate_axis_equality(axis=1)
self._validate_axis_equality(axis=1, force=args.get("force"))

def _apply_index_objs(self, axis=None):
"""Lazily applies the index object (Index or Columns) to the partitions.
Expand Down Expand Up @@ -1000,13 +1005,19 @@ def _compute_map_reduce_metadata(self, axis, new_parts):
)

def _fold_reduce(self, axis, func):
"""Applies map that reduce Manager to series but require knowledge of full axis.
"""
Apply function that reduce Manager to series but require knowledge of full axis.
Args:
func: Function to reduce the Manager by. This function takes in a Manager.
axis: axis to apply the function to.
Parameters
----------
axis : 0 or 1
The axis to apply the function to (0 - index, 1 - columns).
func : callable
The function to reduce the Manager by. This function takes in a Manager.
Return:
Returns
-------
BasePandasFrame
Pandas series containing the reduced data.
"""
func = self._build_mapreduce_func(axis, func)
Expand Down
193 changes: 0 additions & 193 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1655,29 +1655,6 @@ def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
)
)

def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
    """Compute the median over the requested axis.

    Args:
        axis: Axis to reduce along. ``None`` is treated as 0; any other
            value is resolved through ``self._get_axis_number``.
        skipna: Forwarded unchanged to the query compiler.
        level: Forwarded unchanged to the query compiler.
        numeric_only: Forwarded unchanged to the query compiler. When it is
            explicitly falsy (i.e. not ``None``), ``self._validate_dtypes``
            is called with ``numeric_only=True`` before reducing.
        **kwargs: Extra keyword arguments forwarded to the query compiler.

    Returns:
        The result of ``self._reduce_dimension`` applied to the query
        compiler's ``median`` output.
    """
    axis_number = 0 if axis is None else self._get_axis_number(axis)
    # Only an explicit falsy ``numeric_only`` triggers the dtype check;
    # ``None`` (the default) skips it.
    if not (numeric_only is None or numeric_only):
        self._validate_dtypes(numeric_only=True)
    finish = self._reduce_dimension
    reduced_compiler = self._query_compiler.median(
        axis=axis_number,
        skipna=skipna,
        level=level,
        numeric_only=numeric_only,
        **kwargs,
    )
    return finish(reduced_compiler)

def memory_usage(self, index=True, deep=False):
"""Returns the memory usage of each column in bytes
Expand Down Expand Up @@ -1862,52 +1839,6 @@ def pow(self, other, axis="columns", level=None, fill_value=None):
"pow", other, axis=axis, level=level, fill_value=fill_value
)

def prod(
    self,
    axis=None,
    skipna=None,
    level=None,
    numeric_only=None,
    min_count=0,
    **kwargs,
):
    """Return the product of the values over the requested axis.

    Args:
        axis: Axis to reduce along. ``None`` is treated as 0; any other
            value is resolved through ``self._get_axis_number``.
        skipna: Forwarded unchanged to the query compiler.
        level: Forwarded unchanged to the query compiler.
        numeric_only: Passed to ``self._validate_dtypes_sum_prod_mean`` and
            forwarded to the query compiler.
        min_count: Forwarded to the query compiler. Values greater than 1
            select the compiler's ``prod_min_count`` instead of ``prod``.
        **kwargs: Extra keyword arguments forwarded to the query compiler.

    Returns:
        The result of ``_reduce_dimension`` called on the object returned by
        ``self._validate_dtypes_sum_prod_mean``.
    """
    axis_number = 0 if axis is None else self._get_axis_number(axis)
    # The validation helper returns the object that is actually reduced.
    validated = self._validate_dtypes_sum_prod_mean(
        axis_number, numeric_only, ignore_axis=True
    )
    forwarded = dict(
        axis=axis_number,
        skipna=skipna,
        level=level,
        numeric_only=numeric_only,
        min_count=min_count,
        **kwargs,
    )
    finish = validated._reduce_dimension
    compiler = validated._query_compiler
    if min_count > 1:
        return finish(query_compiler=compiler.prod_min_count(**forwarded))
    return finish(compiler.prod(**forwarded))

product = prod
radd = add

def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
Expand Down Expand Up @@ -2733,32 +2664,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
else:
return self.tshift(periods, freq)

def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
    """Return the skew over the requested axis.

    Args:
        axis: Axis to reduce along. ``None`` is treated as 0; any other
            value is resolved through ``self._get_axis_number``.
        skipna: Forwarded unchanged to the query compiler.
        level: Forwarded unchanged to the query compiler.
        numeric_only: Forwarded unchanged to the query compiler. When it is
            explicitly falsy (i.e. not ``None``), ``self._validate_dtypes``
            is called with ``numeric_only=True`` before reducing.
        **kwargs: Extra keyword arguments forwarded to the query compiler.

    Returns:
        The result of ``self._reduce_dimension`` applied to the query
        compiler's ``skew`` output.
    """
    if axis is None:
        axis_number = 0
    else:
        axis_number = self._get_axis_number(axis)

    must_validate = numeric_only is not None and not numeric_only
    if must_validate:
        self._validate_dtypes(numeric_only=True)

    finish = self._reduce_dimension
    skewed = self._query_compiler.skew(
        axis=axis_number,
        skipna=skipna,
        level=level,
        numeric_only=numeric_only,
        **kwargs,
    )
    return finish(skewed)

def sort_index(
self,
axis=0,
Expand Down Expand Up @@ -2842,33 +2747,6 @@ def sort_values(
)
return self._create_or_update_from_compiler(result, inplace)

def std(
    self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
    """Compute the standard deviation over the requested axis.

    Args:
        axis: Axis to reduce along. ``None`` is treated as 0; any other
            value is resolved through ``self._get_axis_number``.
        skipna: Forwarded unchanged to the query compiler.
        level: Forwarded unchanged to the query compiler.
        ddof: Degrees of freedom, forwarded unchanged to the query compiler.
        numeric_only: Forwarded unchanged to the query compiler. When it is
            explicitly falsy (i.e. not ``None``), ``self._validate_dtypes``
            is called with ``numeric_only=True`` before reducing.
        **kwargs: Extra keyword arguments forwarded to the query compiler.

    Returns:
        The result of ``self._reduce_dimension`` applied to the query
        compiler's ``std`` output.
    """
    axis_number = 0 if axis is None else self._get_axis_number(axis)
    # ``None`` leaves dtype validation off; an explicit falsy value enables it.
    if not (numeric_only is None or numeric_only):
        self._validate_dtypes(numeric_only=True)
    finish = self._reduce_dimension
    deviation = self._query_compiler.std(
        axis=axis_number,
        skipna=skipna,
        level=level,
        ddof=ddof,
        numeric_only=numeric_only,
        **kwargs,
    )
    return finish(deviation)

def sub(self, other, axis="columns", level=None, fill_value=None):
"""Subtract a DataFrame/Series/scalar from this DataFrame.
Expand All @@ -2887,50 +2765,6 @@ def sub(self, other, axis="columns", level=None, fill_value=None):

subtract = sub

def sum(
    self,
    axis=None,
    skipna=None,
    level=None,
    numeric_only=None,
    min_count=0,
    **kwargs,
):
    """Return the sum of the values over the requested axis.

    Args:
        axis: Axis to reduce along. ``None`` is treated as 0; any other
            value is resolved through ``self._get_axis_number``.
        skipna: Forwarded unchanged to the query compiler.
        level: Forwarded unchanged to the query compiler.
        numeric_only: Passed to ``self._validate_dtypes_sum_prod_mean`` and
            forwarded to the query compiler.
        min_count: Forwarded to the query compiler. Values greater than 1
            select the compiler's ``sum_min_count`` instead of ``sum``.
        **kwargs: Extra keyword arguments forwarded to the query compiler.

    Returns:
        The result of ``_reduce_dimension`` called on the object returned by
        ``self._validate_dtypes_sum_prod_mean``.
    """
    axis_number = 0 if axis is None else self._get_axis_number(axis)
    # The validation helper returns the object that is actually reduced.
    validated = self._validate_dtypes_sum_prod_mean(
        axis_number, numeric_only, ignore_axis=False
    )
    forwarded = dict(
        axis=axis_number,
        skipna=skipna,
        level=level,
        numeric_only=numeric_only,
        min_count=min_count,
        **kwargs,
    )
    finish = validated._reduce_dimension
    compiler = validated._query_compiler
    if min_count > 1:
        return finish(query_compiler=compiler.sum_min_count(**forwarded))
    return finish(compiler.sum(**forwarded))

def swapaxes(self, axis1, axis2, copy=True):
axis1 = self._get_axis_number(axis1)
axis2 = self._get_axis_number(axis2)
Expand Down Expand Up @@ -3333,33 +3167,6 @@ def tz_localize(
)
return self.set_axis(labels=new_labels, axis=axis, inplace=not copy)

def var(
    self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
    """Compute the variance over the requested axis.

    Args:
        axis: Axis to reduce along. ``None`` is treated as 0; any other
            value is resolved through ``self._get_axis_number``.
        skipna: Forwarded unchanged to the query compiler.
        level: Forwarded unchanged to the query compiler.
        ddof: Degrees of freedom, forwarded unchanged to the query compiler.
        numeric_only: Forwarded unchanged to the query compiler. When it is
            explicitly falsy (i.e. not ``None``), ``self._validate_dtypes``
            is called with ``numeric_only=True`` before reducing.
        **kwargs: Extra keyword arguments forwarded to the query compiler.

    Returns:
        The result of ``self._reduce_dimension`` applied to the query
        compiler's ``var`` output.
    """
    if axis is None:
        axis_number = 0
    else:
        axis_number = self._get_axis_number(axis)

    must_validate = numeric_only is not None and not numeric_only
    if must_validate:
        self._validate_dtypes(numeric_only=True)

    finish = self._reduce_dimension
    variance = self._query_compiler.var(
        axis=axis_number,
        skipna=skipna,
        level=level,
        ddof=ddof,
        numeric_only=numeric_only,
        **kwargs,
    )
    return finish(variance)

def __abs__(self):
"""Creates a modified DataFrame by taking the absolute value.
Expand Down
Loading