Skip to content

Commit

Permalink
PERF: Remove columnarize [upstream] (modin-project#23)
Browse files: browse the repository at this point in the history
  • Loading branch information
mvashishtha authored Mar 27, 2023
1 parent 20ca149 commit abf176c
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 71 deletions.
21 changes: 0 additions & 21 deletions modin/core/storage_formats/base/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,27 +1065,6 @@ def transpose(self, *args, **kwargs): # noqa: PR02
self, *args, **kwargs
)

def columnarize(self):
    """
    Return a column-oriented QueryCompiler for a Series-like object.

    This method should be called for QueryCompilers representing a Series
    object, i.e. ``self.is_series_like()`` should be True.

    The frame is returned untouched when ``self._shape_hint`` is already
    ``"column"``. Otherwise it is transposed when it does not have exactly
    one column, or when it has exactly one row and that row is labelled
    ``MODIN_UNNAMED_SERIES_LABEL``.

    Returns
    -------
    BaseQueryCompiler
        Transposed new QueryCompiler or self.
    """
    # The shape hint lets us skip inspecting the axes entirely.
    if self._shape_hint == "column":
        return self

    # Anything other than exactly one column is treated as row-oriented.
    if len(self.columns) != 1:
        return self.transpose()

    # A 1x1 frame is ambiguous; the unnamed-series label on its single row
    # marks it as a row that still has to be flipped into a column.
    if len(self.index) == 1 and self.index[0] == MODIN_UNNAMED_SERIES_LABEL:
        return self.transpose()

    return self

def is_series_like(self):
"""
Check whether this QueryCompiler can represent ``modin.pandas.Series`` object.
Expand Down
63 changes: 29 additions & 34 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,10 +769,10 @@ def all(
"level > 0 or level < -1 only valid with MultiIndex"
)
return self.groupby(level=level, axis=axis, sort=False).all(**kwargs)
return self._reduce_dimension(
self._query_compiler.all(
compiler = self._query_compiler.all(
axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs
)
return self._reduce_dimension(compiler.transpose() if axis == 0 else compiler
)
else:
if bool_only:
Expand All @@ -790,7 +790,7 @@ def all(
skipna=skipna,
level=level,
**kwargs,
)
).transpose()
)
if isinstance(result, BasePandasDataset):
return result.all(
Expand Down Expand Up @@ -832,10 +832,10 @@ def any(
"level > 0 or level < -1 only valid with MultiIndex"
)
return self.groupby(level=level, axis=axis, sort=False).any(**kwargs)
return self._reduce_dimension(
self._query_compiler.any(
axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs
)
compiler = self._query_compiler.any(
axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs
)
return self._reduce_dimension(compiler.transpose() if axis == 0 else compiler
)
else:
if bool_only:
Expand All @@ -851,7 +851,7 @@ def any(
skipna=skipna,
level=level,
**kwargs,
)
).transpose()
)
if isinstance(result, BasePandasDataset):
return result.any(
Expand Down Expand Up @@ -1172,11 +1172,10 @@ def count(self, axis=0, level=None, numeric_only=False): # noqa: PR01, RT01, D2
if not frame._query_compiler.has_multiindex(axis=axis):
raise TypeError("Can only count levels on hierarchical columns.")
return frame.groupby(level=level, axis=axis, sort=True).count()
return frame._reduce_dimension(
frame._query_compiler.count(
compiler = frame._query_compiler.count(
axis=axis, level=level, numeric_only=numeric_only
)
)
return frame._reduce_dimension(compiler.transpose() if axis == 0 else compiler)

def cummax(self, axis=None, skipna=True, *args, **kwargs): # noqa: PR01, RT01, D200
"""
Expand Down Expand Up @@ -1805,16 +1804,14 @@ def kurt(
if numeric_only is None or numeric_only
else self
)

return self._reduce_dimension(
data._query_compiler.kurt(
compiler = data._query_compiler.kurt(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
**kwargs,
)
)
return self._reduce_dimension(compiler.transpose() if axis == 0 else compiler)

kurtosis = kurt

Expand Down Expand Up @@ -1912,15 +1909,14 @@ def max(
validate_bool_kwarg(skipna, "skipna", none_allowed=False)
axis = self._get_axis_number(axis)
data = self._validate_dtypes_min_max(axis, numeric_only)
return data._reduce_dimension(
data._query_compiler.max(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
**kwargs,
)
)
compiler = data._query_compiler.max(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
**kwargs,
)
return data._reduce_dimension(compiler.transpose() if axis == 0 else compiler)

def _stat_operation(
self,
Expand Down Expand Up @@ -1986,14 +1982,14 @@ def _stat_operation(
numeric_only=numeric_only,
**kwargs,
)
return self._reduce_dimension(result_qc)
return self._reduce_dimension(result_qc.transpose() if axis == 0 else result_qc)

def memory_usage(self, index=True, deep=False): # noqa: PR01, RT01, D200
"""
Return the memory usage of the `BasePandasDataset`.
"""
return self._reduce_dimension(
self._query_compiler.memory_usage(index=index, deep=deep)
self._query_compiler.memory_usage(index=index, deep=deep).transpose()
)

def min(
Expand All @@ -2010,15 +2006,14 @@ def min(
validate_bool_kwarg(skipna, "skipna", none_allowed=False)
axis = self._get_axis_number(axis)
data = self._validate_dtypes_min_max(axis, numeric_only)
return data._reduce_dimension(
data._query_compiler.min(
compiler = data._query_compiler.min(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
**kwargs,
)
)
return data._reduce_dimension(compiler.transpose() if axis == 0 else compiler)

def mod(
self, other, axis="columns", level=None, fill_value=None
Expand All @@ -2035,10 +2030,11 @@ def mode(self, axis=0, numeric_only=False, dropna=True): # noqa: PR01, RT01, D2
Get the mode(s) of each element along the selected axis.
"""
axis = self._get_axis_number(axis)
return self.__constructor__(
query_compiler=self._query_compiler.mode(
compiler = self._query_compiler.mode(
axis=axis, numeric_only=numeric_only, dropna=dropna
)
return self.__constructor__(
query_compiler=compiler.transpose() if axis == 0 else compiler
)

def mul(
Expand Down Expand Up @@ -2072,9 +2068,8 @@ def nunique(self, axis=0, dropna=True): # noqa: PR01, RT01, D200
Return number of unique elements in the `BasePandasDataset`.
"""
axis = self._get_axis_number(axis)
return self._reduce_dimension(
self._query_compiler.nunique(axis=axis, dropna=dropna)
)
compiler = self._query_compiler.nunique(axis=axis, dropna=dropna)
return self._reduce_dimension(compiler.transpose() if self._get_axis_number(axis) == 0 else compiler)

def pct_change(
self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs
Expand Down
27 changes: 12 additions & 15 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2055,27 +2055,24 @@ def sum(
return self.groupby(level=level, axis=axis, sort=False).sum(
numeric_only=numeric_only, min_count=min_count
)
if min_count > 1:
return data._reduce_dimension(
data._query_compiler.sum_min_count(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
min_count=min_count,
**kwargs,
)
)
return data._reduce_dimension(
data._query_compiler.sum(
sum_result = (data._query_compiler.sum_min_count(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
min_count=min_count,
**kwargs,
)
)
) if min_count > 1 else data._query_compiler.sum(
axis=axis,
skipna=skipna,
level=level,
numeric_only=numeric_only,
min_count=min_count,
**kwargs,
))
if axis == 0:
sum_result = sum_result.transpose()
return data._reduce_dimension(sum_result)

def to_feather(self, path, **kwargs): # pragma: no cover # noqa: PR01, RT01, D200
"""
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def __init__(
)
)
)._query_compiler
self._query_compiler = query_compiler.columnarize()
self._query_compiler = query_compiler
if name is not None:
self.name = name

Expand Down

0 comments on commit abf176c

Please sign in to comment.