diff --git a/modin/engines/base/frame/data.py b/modin/engines/base/frame/data.py index b6bfdec1801..01ae29917da 100644 --- a/modin/engines/base/frame/data.py +++ b/modin/engines/base/frame/data.py @@ -78,9 +78,9 @@ def __init__( ) self._column_widths_cache = column_widths self._dtypes = dtypes - self._filter_empties() if validate_axes is not False: self._validate_internal_indices(mode=validate_axes) + self._filter_empties() @property def _row_lengths(self): @@ -284,6 +284,11 @@ def _validate_axis_equality(self, axis: int, force: bool = False): is_lenghts_matches = len(self.axes[axis]) == len(internal_axis) if not is_equals: if force: + if not is_lenghts_matches: + if axis: + self._column_widths_cache = None + else: + self._row_lengths_cache = None new_axis = self.axes[axis] if is_lenghts_matches else internal_axis self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches) else: @@ -336,9 +341,9 @@ def _validate_internal_indices(self, mode=None, **kwargs): args = args_dict.get(mode, args_dict["custom"]) if args.get("validate_index", True): - self._validate_axis_equality(axis=0) + self._validate_axis_equality(axis=0, force=args.get("force")) if args.get("validate_columns", True): - self._validate_axis_equality(axis=1) + self._validate_axis_equality(axis=1, force=args.get("force")) def _apply_index_objs(self, axis=None): """Lazily applies the index object (Index or Columns) to the partitions. @@ -1000,13 +1005,19 @@ def _compute_map_reduce_metadata(self, axis, new_parts): ) def _fold_reduce(self, axis, func): - """Applies map that reduce Manager to series but require knowledge of full axis. + """ + Apply function that reduce Manager to series but require knowledge of full axis. - Args: - func: Function to reduce the Manager by. This function takes in a Manager. - axis: axis to apply the function to. + Parameters + ---------- + axis : 0 or 1 + The axis to apply the function to (0 - index, 1 - columns). + func : callable + The function to reduce the Manager by. This function takes in a Manager. - Return: + Returns + ------- + BasePandasFrame Pandas series containing the reduced data. """ func = self._build_mapreduce_func(axis, func) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 71450b75182..4bf2f4c0c12 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1655,29 +1655,6 @@ def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): ) ) - def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - """Computes median across the DataFrame. - - Args: - axis (int): The axis to take the median on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The median of the DataFrame. 
(Pandas series) - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - return self._reduce_dimension( - self._query_compiler.median( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - ) - def memory_usage(self, index=True, deep=False): """Returns the memory usage of each column in bytes @@ -1862,52 +1839,6 @@ def pow(self, other, axis="columns", level=None, fill_value=None): "pow", other, axis=axis, level=level, fill_value=fill_value ) - def prod( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """Return the product of the values for the requested axis - - Args: - axis : {index (0), columns (1)} - skipna : boolean, default True - level : int or level name, default None - numeric_only : boolean, default None - min_count : int, default 0 - - Returns: - prod : Series or DataFrame (if level specified) - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True) - if min_count > 1: - return data._reduce_dimension( - query_compiler=data._query_compiler.prod_min_count( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - ) - return data._reduce_dimension( - data._query_compiler.prod( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - ) - - product = prod radd = add def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): @@ -2733,32 +2664,6 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): else: return self.tshift(periods, freq) - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - """Return unbiased skew over requested axis Normalized by N-1 - - Args: - axis : {index (0), columns (1)} - skipna : boolean, default True - Exclude NA/null values when computing the result. - level : int or level name, default None - numeric_only : boolean, default None - - Returns: - skew : Series or DataFrame (if level specified) - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - return self._reduce_dimension( - self._query_compiler.skew( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - ) - def sort_index( self, axis=0, @@ -2842,33 +2747,6 @@ def sort_values( ) return self._create_or_update_from_compiler(result, inplace) - def std( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - """Computes standard deviation across the DataFrame. - - Args: - axis (int): The axis to take the std on. - skipna (bool): True to skip NA values, false otherwise. - ddof (int): degrees of freedom - - Returns: - The std of the DataFrame (Pandas Series) - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - return self._reduce_dimension( - self._query_compiler.std( - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - ) - def sub(self, other, axis="columns", level=None, fill_value=None): """Subtract a DataFrame/Series/scalar from this DataFrame. 
@@ -2887,50 +2765,6 @@ def sub(self, other, axis="columns", level=None, fill_value=None): subtract = sub - def sum( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """Perform a sum across the DataFrame. - - Args: - axis (int): The axis to sum on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The sum of the DataFrame. - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - data = self._validate_dtypes_sum_prod_mean( - axis, numeric_only, ignore_axis=False - ) - if min_count > 1: - return data._reduce_dimension( - query_compiler=data._query_compiler.sum_min_count( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - ) - return data._reduce_dimension( - data._query_compiler.sum( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - ) - def swapaxes(self, axis1, axis2, copy=True): axis1 = self._get_axis_number(axis1) axis2 = self._get_axis_number(axis2) @@ -3333,33 +3167,6 @@ def tz_localize( ) return self.set_axis(labels=new_labels, axis=axis, inplace=not copy) - def var( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - """Computes variance across the DataFrame. - - Args: - axis (int): The axis to take the variance on. - skipna (bool): True to skip NA values, false otherwise. - ddof (int): degrees of freedom - - Returns: - The variance of the DataFrame. - """ - axis = self._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - return self._reduce_dimension( - self._query_compiler.var( - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - ) - def __abs__(self): """Creates a modified DataFrame by taking the absolute value. diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 9e9bfa384b8..1d71d36003f 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1376,6 +1376,53 @@ def lt(self, other, axis="columns", level=None): "lt", other, axis=axis, level=level, broadcast=isinstance(other, Series) ) + def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """ + Return the median of the values for the requested axis. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. 
+ + Returns + ------- + Series or DataFrame (if level specified) + The median of the values for the requested axis + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.median( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.median( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + def melt( self, id_vars=None, @@ -1820,6 +1867,32 @@ def prod( min_count=0, **kwargs, ): + """ + Return the product of the values for the requested axis. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + min_count : int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or DataFrame (if level specified) + The product of the values for the requested axis. + """ axis = self._get_axis_number(axis) axis_to_apply = self.columns if axis else self.index if ( @@ -1831,9 +1904,22 @@ def prod( return Series( [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object") ) + + data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True) + if level is not None: + return data.__constructor__( + query_compiler=data._query_compiler.prod_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) if min_count > 1: - return self._reduce_dimension( - query_compiler=self._query_compiler.prod_min_count( + return data._reduce_dimension( + data._query_compiler.prod_min_count( axis=axis, skipna=skipna, level=level, @@ -1842,13 +1928,15 @@ def prod( **kwargs, ) ) - return super(DataFrame, self).prod( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + return data._reduce_dimension( + data._query_compiler.prod( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) ) product = prod @@ -2167,6 +2255,53 @@ def set_index( if not inplace: return frame + def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """ + Return unbiased skew over requested axis. Normalized by N-1 + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + skipna : boolean, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), + count along a particular level, collapsing into a Series. + numeric_only : boolean, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. 
+ **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or DataFrame (if level specified) + Unbiased skew over requested axis. + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.skew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.skew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + @property def sparse(self): return self._default_to_pandas(pandas.DataFrame.sparse) @@ -2182,6 +2317,62 @@ def squeeze(self, axis=None): else: return self.copy() + def std( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return sample standard deviation over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0), columns (1)} + The axis to take the std on. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or DataFrame (if level specified) + The sample standard deviation. + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.std( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.std( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + def stack(self, level=-1, dropna=True): """ Stack the prescribed level(s) from columns to index. @@ -2244,6 +2435,32 @@ def sum( min_count=0, **kwargs, ): + """ + Return the sum of the values for the requested axis. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + min_count : int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. 
+ + Returns + ------- + Series or DataFrame (if level specified) + The sum of the values for the requested axis + """ axis = self._get_axis_number(axis) axis_to_apply = self.columns if axis else self.index if ( @@ -2255,13 +2472,41 @@ def sum( return Series( [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object") ) - return super(DataFrame, self).sum( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + + data = self._validate_dtypes_sum_prod_mean( + axis, numeric_only, ignore_axis=False + ) + if level is not None: + return data.__constructor__( + query_compiler=data._query_compiler.sum_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + if min_count > 1: + return data._reduce_dimension( + data._query_compiler.sum_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + return data._reduce_dimension( + data._query_compiler.sum( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) ) def _to_datetime(self, **kwargs): @@ -2488,6 +2733,62 @@ def update( ) self._update_inplace(new_query_compiler=query_compiler) + def var( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return unbiased variance over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument + + Parameters + ---------- + axis : {index (0), columns (1)} + The axis to take the variance on. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a Series. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or DataFrame (if level specified) + The unbiased variance. + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.var( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.var( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + def where( self, cond, diff --git a/modin/pandas/series.py b/modin/pandas/series.py index b9c41f60454..65551ab381c 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -990,6 +990,53 @@ def arg(s): ) ) + def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """ + Return the median of the values for the requested axis. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. 
+ level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a scalar. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + The median of the values for the requested axis + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.median( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.median( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + def memory_usage(self, index=True, deep=False): if index: result = self._reduce_dimension( @@ -1109,6 +1156,109 @@ def unstack(self, level=-1, fill_value=None): return result.droplevel(0, axis=1) if result.columns.nlevels > 1 else result + def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """ + Return unbiased skew over requested axis. Normalized by N-1 + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + skipna : boolean, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), + count along a particular level, collapsing into a scalar. + numeric_only : boolean, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + Unbiased skew over requested axis. + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.skew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.skew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + ) + + def std( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return sample standard deviation over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0)} + The axis to take the std on. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a scalar. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. 
+ + Returns + ------- + scalar or Series (if level specified) + The sample standard deviation. + """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.std( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.std( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + @property def plot( self, @@ -1154,17 +1304,69 @@ def prod( min_count=0, **kwargs, ): + """ + Return the product of the values for the requested axis. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a scalar. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + min_count : int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + The product of the values for the requested axis. + """ axis = self._get_axis_number(axis) new_index = self.columns if axis else self.index if min_count > len(new_index): return np.nan - return super(Series, self).prod( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + + data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True) + if level is not None: + return data.__constructor__( + query_compiler=data._query_compiler.prod_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + if min_count > 1: + return data._reduce_dimension( + data._query_compiler.prod_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + return data._reduce_dimension( + data._query_compiler.prod( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) ) product = prod @@ -1458,17 +1660,71 @@ def sum( min_count=0, **kwargs, ): + """ + Return the sum of the values for the requested axis. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a scalar. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + min_count : int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. 
+ + Returns + ------- + scalar or Series (if level specified) + The sum of the values for the requested axis + """ axis = self._get_axis_number(axis) new_index = self.columns if axis else self.index if min_count > len(new_index): return np.nan - return super(Series, self).sum( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + + data = self._validate_dtypes_sum_prod_mean( + axis, numeric_only, ignore_axis=False + ) + if level is not None: + return data.__constructor__( + query_compiler=data._query_compiler.sum_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + if min_count > 1: + return data._reduce_dimension( + data._query_compiler.sum_min_count( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + ) + return data._reduce_dimension( + data._query_compiler.sum( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) ) def swaplevel(self, i=-2, j=-1, copy=True): @@ -1659,6 +1915,62 @@ def value_counts( ) ) + def var( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """ + Return unbiased variance over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument + + Parameters + ---------- + axis : {index (0)} + The axis to take the variance on. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result will be NA. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a particular level, + collapsing into a scalar. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to use everything, + then use only numeric data. Not implemented for Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + The unbiased variance. 
+ """ + axis = self._get_axis_number(axis) + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + if level is not None: + return self.__constructor__( + query_compiler=self._query_compiler.var( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + return self._reduce_dimension( + self._query_compiler.var( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + ) + def view(self, dtype=None): return self.__constructor__( query_compiler=self._query_compiler.series_view(dtype=dtype) diff --git a/modin/pandas/test/dataframe/test_reduction.py b/modin/pandas/test/dataframe/test_reduction.py index dc3587a58c0..85d1246fd29 100644 --- a/modin/pandas/test/dataframe/test_reduction.py +++ b/modin/pandas/test/dataframe/test_reduction.py @@ -279,6 +279,18 @@ def test_prod( ), ) + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.prod(level=0) + pandas_result = pandas_df.prod(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize( "numeric_only", @@ -315,6 +327,18 @@ def test_sum(data, axis, skipna, is_transposed): ), ) + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.sum(level=0) + pandas_result = pandas_df.sum(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize( "numeric_only", diff --git a/modin/pandas/test/dataframe/test_window.py b/modin/pandas/test/dataframe/test_window.py index 621b776def7..e228606a5ae 100644 --- a/modin/pandas/test/dataframe/test_window.py +++ b/modin/pandas/test/dataframe/test_window.py @@ -570,6 +570,18 @@ def test_median(request, data, axis, skipna, numeric_only): ) df_equals(modin_result, pandas_result) + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.median(level=0) + pandas_result = pandas_df.median(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @@ -782,6 +794,18 @@ def test_skew(request, data, axis, skipna, numeric_only): ) df_equals(modin_result, pandas_result) + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.skew(level=0) + pandas_result = pandas_df.skew(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @@ -824,6 +848,18 @@ def test_std(request, data, 
axis, skipna, numeric_only, ddof): ) df_equals(modin_result, pandas_result) + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.std(level=0) + pandas_result = pandas_df.std(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_values(data): @@ -872,3 +908,15 @@ def test_var(request, data, axis, skipna, numeric_only, ddof): modin_result = modin_df.T.var( axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof ) + + # test for issue #1953 + arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] + modin_df = pd.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + pandas_df = pandas.DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays + ) + modin_result = modin_df.var(level=0) + pandas_result = pandas_df.var(level=0) + df_equals(modin_result, pandas_result) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 95fa93ec505..8695d7004b3 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -1971,6 +1971,17 @@ def test_median(data, skipna): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.median(level=0) + pandas_result = pandas_series.median(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("index", [True, False], ids=["True", "False"]) @@ -2200,6 +2211,17 @@ def test_prod(data, axis, skipna, numeric_only, min_count, operation): min_count=min_count, ) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.prod(level=0) + pandas_result = pandas_series.prod(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("q", quantiles_values, ids=quantiles_keys) @@ -2660,6 +2682,17 @@ def test_skew(data, skipna): modin_series, pandas_series = create_test_series(data) df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.skew(level=0) + pandas_result = pandas_series.skew(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("index", ["default", "ndarray"]) @@ 
-2778,6 +2811,17 @@ def test_std(request, data, skipna, ddof): modin_result = modin_series.std(skipna=skipna, ddof=ddof) df_equals(modin_result, pandas_result) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.std(level=0) + pandas_result = pandas_series.std(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_sub(data): @@ -2826,6 +2870,17 @@ def test_sum(data, axis, skipna, numeric_only, min_count): min_count=min_count, ) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.sum(level=0) + pandas_result = pandas_series.sum(level=0) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis1", [0, 1, "columns", "index"]) @@ -3204,6 +3259,17 @@ def test_var(data, skipna, ddof): modin_result = modin_series.var(skipna=skipna, ddof=ddof) df_equals(modin_result, pandas_result) + # test for issue #1953 + arrays = [ + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["1", "2", "3", "4", "5", "6", "7", "8", "9"], + ] + modin_series = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + pandas_series = pandas.Series([3, 3, 3, 3, 3, 3, 3, 3, 3], index=arrays) + modin_result = modin_series.var(level=0) + pandas_result = pandas_series.var(level=0) + df_equals(modin_result, pandas_result) + def test_view(): modin_series = pd.Series([-2, -1, 0, 1, 2], dtype="int8")
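Not part of the patch above — a minimal usage sketch of what these changes enable, mirroring the new tests for issue #1953. It assumes Modin is installed with a working execution engine and a pandas/Modin version in which the reductions still accept the `level` argument; the printed comparison is illustrative only and stands in for the test suite's `df_equals` helper.

# Illustrative sketch, not part of the diff: level-aware reductions on a MultiIndex.
import pandas
import modin.pandas as pd

arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]]
data = [[1, 2, 3, 4]] * 4

modin_df = pd.DataFrame(data, index=arrays)       # two-level MultiIndex, as in the new tests
pandas_df = pandas.DataFrame(data, index=arrays)

# With `level` given, the rewritten DataFrame.sum/prod/median/skew/std/var build the
# result directly from the query compiler via __constructor__(query_compiler=...)
# instead of collapsing through _reduce_dimension, so the per-level aggregation keeps
# its frame shape and should match pandas. (skew is omitted from this loop only
# because two identical rows per level would make it NaN for this toy data.)
for name in ("sum", "prod", "median", "std", "var"):
    modin_result = getattr(modin_df, name)(level=0)
    pandas_result = getattr(pandas_df, name)(level=0)
    print(name)
    print(modin_result)       # expected to match pandas_result row for row

# Series reductions take the same path: a level-aware call returns a Series keyed by
# the chosen level instead of a scalar.
modin_series = pd.Series([3, 3, 3, 3], index=arrays)
print(modin_series.sum(level=0))   # expected: 6 for level "1" and 6 for level "2", as in pandas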