Skip to content

Commit

Permalink
FIX-modin-project#6793: use 'pandas_dtype' instead of 'np.dtype' for …
Browse files Browse the repository at this point in the history
…some more places in Modin code (modin-project#6794)

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev authored Mar 28, 2024
1 parent 4c1e448 commit 478b86c
Show file tree
Hide file tree
Showing 13 changed files with 68 additions and 39 deletions.
24 changes: 17 additions & 7 deletions modin/core/dataframe/algebra/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ def maybe_compute_dtypes_common_cast(
# belong to the intersection, these will be NaN columns in the result
mismatch_columns = columns_first ^ columns_second
elif isinstance(second, dict):
dtypes_second = {key: np.dtype(type(value)) for key, value in second.items()}
dtypes_second = {
key: pandas.api.types.pandas_dtype(type(value))
for key, value in second.items()
}
columns_first = set(first.columns)
columns_second = set(second.keys())
common_columns = columns_first.intersection(columns_second)
Expand All @@ -90,16 +93,21 @@ def maybe_compute_dtypes_common_cast(
else:
if isinstance(second, (list, tuple)):
second_dtypes_list = (
[np.dtype(type(value)) for value in second]
[pandas.api.types.pandas_dtype(type(value)) for value in second]
if axis == 1
# Here we've been given a column so it has only one dtype,
# Infering the dtype using `np.array`, TODO: maybe there's more efficient way?
else [np.array(second).dtype] * len(dtypes_first)
)
elif is_scalar(second) or isinstance(second, np.ndarray):
second_dtypes_list = [
getattr(second, "dtype", np.dtype(type(second)))
] * len(dtypes_first)
try:
dtype = getattr(second, "dtype", None) or pandas.api.types.pandas_dtype(
type(second)
)
except TypeError:
# For example, dtype '<class 'datetime.datetime'>' not understood
dtype = pandas.Series(second).dtype
second_dtypes_list = [dtype] * len(dtypes_first)
else:
raise NotImplementedError(
f"Can't compute common type for {type(first)} and {type(second)}."
Expand All @@ -117,7 +125,7 @@ def maybe_compute_dtypes_common_cast(
mismatch_columns = []

# If at least one column doesn't match, the result of the non matching column would be nan.
nan_dtype = np.dtype(type(np.nan))
nan_dtype = pandas.api.types.pandas_dtype(type(np.nan))
dtypes = None
if func is not None:
try:
Expand Down Expand Up @@ -242,7 +250,9 @@ def try_compute_new_dtypes(

try:
if infer_dtypes == "bool" or is_bool_dtype(result_dtype):
dtypes = maybe_build_dtypes_series(first, second, dtype=np.dtype(bool))
dtypes = maybe_build_dtypes_series(
first, second, dtype=pandas.api.types.pandas_dtype(bool)
)
elif infer_dtypes == "common_cast":
dtypes = maybe_compute_dtypes_common_cast(
first, second, axis=axis, func=None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def pandas_dtype_to_arrow_c(dtype: Union[np.dtype, pandas.CategoricalDtype]) ->
"""
if isinstance(dtype, pandas.CategoricalDtype):
return ArrowCTypes.INT64
elif dtype == np.dtype("O"):
elif dtype == pandas.api.types.pandas_dtype("O"):
return ArrowCTypes.STRING

format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
Expand Down
22 changes: 13 additions & 9 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from pandas.core.dtypes.common import is_dtype_equal, is_list_like, is_numeric_dtype
from pandas.core.indexes.api import Index, RangeIndex

from modin.config import IsRayCluster, NPartitions
from modin.config import Engine, IsRayCluster, NPartitions
from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe
from modin.core.dataframe.base.dataframe.utils import Axis, JoinType
from modin.core.dataframe.pandas.dataframe.utils import (
Expand Down Expand Up @@ -1647,13 +1647,13 @@ def astype(self, col_dtypes, errors: str = "raise"):
if new_dtypes is None:
new_dtypes = self_dtypes.copy()
# Update the new dtype series to the proper pandas dtype
try:
new_dtype = np.dtype(dtype)
except TypeError:
new_dtype = dtype
new_dtype = pandas.api.types.pandas_dtype(dtype)
if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"):
# FIXME: https://github.com/dask/distributed/issues/8585
_ = dtype._materialize_categories()

# We cannot infer without computing the dtype if
if isinstance(new_dtype, str) and new_dtype == "category":
if isinstance(new_dtype, pandas.CategoricalDtype):
new_dtypes[column] = LazyProxyCategoricalDtype._build_proxy(
# Actual parent will substitute `None` at `.set_dtypes_cache`
parent=None,
Expand Down Expand Up @@ -2187,7 +2187,8 @@ def map(
# Materializing lazy columns in order to build dtype's index
new_columns = new_columns.get(return_lengths=False)
dtypes = pandas.Series(
[np.dtype(dtypes)] * len(new_columns), index=new_columns
[pandas.api.types.pandas_dtype(dtypes)] * len(new_columns),
index=new_columns,
)
return self.__constructor__(
new_partitions,
Expand Down Expand Up @@ -3425,14 +3426,17 @@ def broadcast_apply_full_axis(
else:
if new_columns is None:
kw["dtypes"] = ModinDtypes(
DtypesDescriptor(remaining_dtype=np.dtype(dtypes))
DtypesDescriptor(
remaining_dtype=pandas.api.types.pandas_dtype(dtypes)
)
)
else:
kw["dtypes"] = (
pandas.Series(dtypes, index=new_columns)
if is_list_like(dtypes)
else pandas.Series(
[np.dtype(dtypes)] * len(new_columns), index=new_columns
[pandas.api.types.pandas_dtype(dtypes)] * len(new_columns),
index=new_columns,
)
)

Expand Down
9 changes: 6 additions & 3 deletions modin/core/dataframe/pandas/metadata/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ def _merge_dtypes(
# meaning that we shouldn't try computing a new dtype for this column,
# so marking it as 'unknown'
i: (
np.dtype(float)
pandas.api.types.pandas_dtype(float)
if val._know_all_names and val._remaining_dtype is None
else "unknown"
)
Expand All @@ -531,7 +531,7 @@ def _merge_dtypes(
def combine_dtypes(row):
if (row == "unknown").any():
return "unknown"
row = row.fillna(np.dtype("float"))
row = row.fillna(pandas.api.types.pandas_dtype("float"))
return find_common_type(list(row.values))

dtypes = dtypes_matrix.apply(combine_dtypes, axis=1)
Expand Down Expand Up @@ -1238,6 +1238,9 @@ def extract_dtype(value):
elif hasattr(value, "dtypes"):
return value.dtypes
elif is_scalar(value):
return np.dtype(type(value))
if value is None:
# previous type was object instead of 'float64'
return pandas.api.types.pandas_dtype(value)
return pandas.api.types.pandas_dtype(type(value))
else:
return np.array(value).dtype
6 changes: 4 additions & 2 deletions modin/core/storage_formats/pandas/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,17 @@ def corr_method(
qc._modin_frame.copy_columns_cache(),
)
new_dtypes = pandas.Series(
np.repeat(np.dtype("float"), len(new_columns)), index=new_columns
np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)),
index=new_columns,
)
elif numeric_only and qc._modin_frame.has_materialized_dtypes:
old_dtypes = qc._modin_frame.dtypes

new_columns = old_dtypes[old_dtypes.map(is_numeric_dtype)].index
new_index = new_columns.copy()
new_dtypes = pandas.Series(
np.repeat(np.dtype("float"), len(new_columns)), index=new_columns
np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)),
index=new_columns,
)
else:
new_index, new_columns, new_dtypes = None, None, None
Expand Down
4 changes: 2 additions & 2 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2734,7 +2734,7 @@ def _set_item(df, row_loc): # pragma: no cover
# we would like to convert it and get its proper internal dtype
item_type = item.to_numpy().dtype
else:
item_type = np.dtype(type(item))
item_type = pandas.api.types.pandas_dtype(type(item))

if isinstance(old_dtypes, pandas.Series):
new_dtypes[col_loc] = [
Expand Down Expand Up @@ -3062,7 +3062,7 @@ def _compute_duplicated(df): # pragma: no cover
hashed_modin_frame = self._modin_frame.reduce(
axis=1,
function=_compute_hash,
dtypes=np.dtype("O"),
dtypes=pandas.api.types.pandas_dtype("O"),
)
else:
hashed_modin_frame = self._modin_frame
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ def is_supported_arrow_type(dtype: pa.lib.DataType) -> bool:
return False
try:
pandas_dtype = dtype.to_pandas_dtype()
return pandas_dtype != np.dtype("O")
return pandas_dtype != pandas.api.types.pandas_dtype("O")
except NotImplementedError:
return False

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

import datetime

import numpy as np
import pandas
import pytest
from pandas.core.dtypes.common import is_datetime64_any_dtype, is_object_dtype
Expand Down Expand Up @@ -108,7 +107,7 @@ def align_datetime_dtypes(*dfs):
# datetime.time is considered to be an 'object' dtype in pandas that's why
# we have to explicitly check the values type in the column
elif (
dtype == np.dtype("O")
dtype == pandas.api.types.pandas_dtype("O")
and col not in time_cols
# HDK has difficulties with empty frames, so explicitly skip them
# https://github.com/modin-project/modin/issues/3428
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -915,5 +915,5 @@ def dtypes(self):

def _check_int_or_float(op, dtypes): # noqa: GL08
for t in dtypes:
if t.char not in _SUPPORTED_NUM_TYPE_CODES:
if not isinstance(t, np.dtype) or t.char not in _SUPPORTED_NUM_TYPE_CODES:
raise NotImplementedError(f"Operation '{op}' on type '{t.name}'")
4 changes: 2 additions & 2 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1864,7 +1864,7 @@ def idxmax(self, axis=0, skipna=True, numeric_only=False): # noqa: PR01, RT01,
"""
Return index of first occurrence of maximum over requested axis.
"""
if not all(d != np.dtype("O") for d in self._get_dtypes()):
if not all(d != pandas.api.types.pandas_dtype("O") for d in self._get_dtypes()):
raise TypeError("reduce operation 'argmax' not allowed for this dtype")
axis = self._get_axis_number(axis)
return self._reduce_dimension(
Expand All @@ -1877,7 +1877,7 @@ def idxmin(self, axis=0, skipna=True, numeric_only=False): # noqa: PR01, RT01,
"""
Return index of first occurrence of minimum over requested axis.
"""
if not all(d != np.dtype("O") for d in self._get_dtypes()):
if not all(d != pandas.api.types.pandas_dtype("O") for d in self._get_dtypes()):
raise TypeError("reduce operation 'argmin' not allowed for this dtype")
axis = self._get_axis_number(axis)
return self._reduce_dimension(
Expand Down
21 changes: 14 additions & 7 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1590,7 +1590,9 @@ def prod(
):
new_index = self.columns if not axis else self.index
return Series(
[np.nan] * len(new_index), index=new_index, dtype=np.dtype("object")
[np.nan] * len(new_index),
index=new_index,
dtype=pandas.api.types.pandas_dtype("object"),
)

data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
Expand Down Expand Up @@ -2107,7 +2109,9 @@ def sum(
):
new_index = self.columns if not axis else self.index
return Series(
[np.nan] * len(new_index), index=new_index, dtype=np.dtype("object")
[np.nan] * len(new_index),
index=new_index,
dtype=pandas.api.types.pandas_dtype("object"),
)

data = self._validate_dtypes_sum_prod_mean(
Expand Down Expand Up @@ -2991,8 +2995,8 @@ def _validate_dtypes_min_max(self, axis, numeric_only):
):
# check if there are columns with dtypes datetime or timedelta
if all(
dtype != np.dtype("datetime64[ns]")
and dtype != np.dtype("timedelta64[ns]")
dtype != pandas.api.types.pandas_dtype("datetime64[ns]")
and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]")
for dtype in self.dtypes
):
raise TypeError("Cannot compare Numeric and Non-Numeric Types")
Expand Down Expand Up @@ -3024,7 +3028,10 @@ def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False):
if (
not axis
and numeric_only is False
and any(dtype == np.dtype("datetime64[ns]") for dtype in self.dtypes)
and any(
dtype == pandas.api.types.pandas_dtype("datetime64[ns]")
for dtype in self.dtypes
)
):
raise TypeError("Cannot add Timestamp Types")

Expand All @@ -3042,8 +3049,8 @@ def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False):
):
# check if there are columns with dtypes datetime or timedelta
if all(
dtype != np.dtype("datetime64[ns]")
and dtype != np.dtype("timedelta64[ns]")
dtype != pandas.api.types.pandas_dtype("datetime64[ns]")
and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]")
for dtype in self.dtypes
):
raise TypeError("Cannot operate on Numeric and Non-Numeric Types")
Expand Down
5 changes: 5 additions & 0 deletions modin/pandas/test/dataframe/test_map_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,11 @@ def test_astype():
expected_df_casted = expected_df.astype(str)
df_equals(modin_df_casted, expected_df_casted)

# pandas nullable dtype
modin_df_casted = modin_df.astype("Float64")
expected_df_casted = expected_df.astype("Float64")
df_equals(modin_df_casted, expected_df_casted)

modin_df_casted = modin_df.astype("category")
expected_df_casted = expected_df.astype("category")
df_equals(modin_df_casted, expected_df_casted)
Expand Down
3 changes: 1 addition & 2 deletions modin/pandas/test/dataframe/test_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import pytest

import modin.pandas as pd
from modin.config import Engine, NPartitions, StorageFormat
from modin.config import NPartitions, StorageFormat
from modin.pandas.test.utils import (
arg_keys,
assert_dtypes_equal,
Expand Down Expand Up @@ -316,7 +316,6 @@ def test_sum(data, axis, skipna, is_transposed, request):
df_equals(modin_result, pandas_result)


@pytest.mark.skipif(Engine.get() == "Native", reason="Fails on HDK")
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_dtype_consistency(dtype):
# test for issue #6781
Expand Down

0 comments on commit 478b86c

Please sign in to comment.