Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX-#6793: use pandas.api.types.pandas_dtype instead of np.dtype for some more places #6794

Merged
merged 1 commit into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
FIX-#6793: use 'pandas_dtype' instead of 'np.dtype' for some more pla…
…ces in Modin code

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev committed Mar 26, 2024
commit 8b154c1207228bb9af3fd5fa3b860bf06410a680
24 changes: 17 additions & 7 deletions modin/core/dataframe/algebra/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ def maybe_compute_dtypes_common_cast(
# belong to the intersection, these will be NaN columns in the result
mismatch_columns = columns_first ^ columns_second
elif isinstance(second, dict):
dtypes_second = {key: np.dtype(type(value)) for key, value in second.items()}
dtypes_second = {
key: pandas.api.types.pandas_dtype(type(value))
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
for key, value in second.items()
}
columns_first = set(first.columns)
columns_second = set(second.keys())
common_columns = columns_first.intersection(columns_second)
Expand All @@ -90,16 +93,21 @@ def maybe_compute_dtypes_common_cast(
else:
if isinstance(second, (list, tuple)):
second_dtypes_list = (
[np.dtype(type(value)) for value in second]
[pandas.api.types.pandas_dtype(type(value)) for value in second]
if axis == 1
# Here we've been given a column so it has only one dtype,
# Infering the dtype using `np.array`, TODO: maybe there's more efficient way?
else [np.array(second).dtype] * len(dtypes_first)
)
elif is_scalar(second) or isinstance(second, np.ndarray):
second_dtypes_list = [
getattr(second, "dtype", np.dtype(type(second)))
] * len(dtypes_first)
try:
dtype = getattr(second, "dtype", None) or pandas.api.types.pandas_dtype(
type(second)
)
except TypeError:
# For example, dtype '<class 'datetime.datetime'>' not understood
dtype = pandas.Series(second).dtype
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
Fixed Show fixed Hide fixed
second_dtypes_list = [dtype] * len(dtypes_first)
else:
raise NotImplementedError(
f"Can't compute common type for {type(first)} and {type(second)}."
Expand All @@ -117,7 +125,7 @@ def maybe_compute_dtypes_common_cast(
mismatch_columns = []

# If at least one column doesn't match, the result of the non matching column would be nan.
nan_dtype = np.dtype(type(np.nan))
nan_dtype = pandas.api.types.pandas_dtype(type(np.nan))
dtypes = None
if func is not None:
try:
Expand Down Expand Up @@ -242,7 +250,9 @@ def try_compute_new_dtypes(

try:
if infer_dtypes == "bool" or is_bool_dtype(result_dtype):
dtypes = maybe_build_dtypes_series(first, second, dtype=np.dtype(bool))
dtypes = maybe_build_dtypes_series(
first, second, dtype=pandas.api.types.pandas_dtype(bool)
)
elif infer_dtypes == "common_cast":
dtypes = maybe_compute_dtypes_common_cast(
first, second, axis=axis, func=None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def pandas_dtype_to_arrow_c(dtype: Union[np.dtype, pandas.CategoricalDtype]) ->
"""
if isinstance(dtype, pandas.CategoricalDtype):
return ArrowCTypes.INT64
elif dtype == np.dtype("O"):
elif dtype == pandas.api.types.pandas_dtype("O"):
return ArrowCTypes.STRING

format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
Expand Down
22 changes: 13 additions & 9 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from pandas.core.dtypes.common import is_dtype_equal, is_list_like, is_numeric_dtype
from pandas.core.indexes.api import Index, RangeIndex

from modin.config import IsRayCluster, NPartitions
from modin.config import Engine, IsRayCluster, NPartitions
from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe
from modin.core.dataframe.base.dataframe.utils import Axis, JoinType
from modin.core.dataframe.pandas.dataframe.utils import (
Expand Down Expand Up @@ -1647,13 +1647,13 @@ def astype(self, col_dtypes, errors: str = "raise"):
if new_dtypes is None:
new_dtypes = self_dtypes.copy()
# Update the new dtype series to the proper pandas dtype
try:
new_dtype = np.dtype(dtype)
except TypeError:
new_dtype = dtype
new_dtype = pandas.api.types.pandas_dtype(dtype)
if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"):
# FIXME: https://github.com/dask/distributed/issues/8585
_ = dtype._materialize_categories()
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case dtype cannot be serialized by Dask.

Fixed Show fixed Hide fixed

# We cannot infer without computing the dtype if
if isinstance(new_dtype, str) and new_dtype == "category":
if isinstance(new_dtype, pandas.CategoricalDtype):
new_dtypes[column] = LazyProxyCategoricalDtype._build_proxy(
# Actual parent will substitute `None` at `.set_dtypes_cache`
parent=None,
Expand Down Expand Up @@ -2187,7 +2187,8 @@ def map(
# Materializing lazy columns in order to build dtype's index
new_columns = new_columns.get(return_lengths=False)
dtypes = pandas.Series(
[np.dtype(dtypes)] * len(new_columns), index=new_columns
[pandas.api.types.pandas_dtype(dtypes)] * len(new_columns),
index=new_columns,
)
return self.__constructor__(
new_partitions,
Expand Down Expand Up @@ -3425,14 +3426,17 @@ def broadcast_apply_full_axis(
else:
if new_columns is None:
kw["dtypes"] = ModinDtypes(
DtypesDescriptor(remaining_dtype=np.dtype(dtypes))
DtypesDescriptor(
remaining_dtype=pandas.api.types.pandas_dtype(dtypes)
)
)
else:
kw["dtypes"] = (
pandas.Series(dtypes, index=new_columns)
if is_list_like(dtypes)
else pandas.Series(
[np.dtype(dtypes)] * len(new_columns), index=new_columns
[pandas.api.types.pandas_dtype(dtypes)] * len(new_columns),
index=new_columns,
)
)

Expand Down
9 changes: 6 additions & 3 deletions modin/core/dataframe/pandas/metadata/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ def _merge_dtypes(
# meaning that we shouldn't try computing a new dtype for this column,
# so marking it as 'unknown'
i: (
np.dtype(float)
pandas.api.types.pandas_dtype(float)
if val._know_all_names and val._remaining_dtype is None
else "unknown"
)
Expand All @@ -531,7 +531,7 @@ def _merge_dtypes(
def combine_dtypes(row):
if (row == "unknown").any():
return "unknown"
row = row.fillna(np.dtype("float"))
row = row.fillna(pandas.api.types.pandas_dtype("float"))
return find_common_type(list(row.values))

dtypes = dtypes_matrix.apply(combine_dtypes, axis=1)
Expand Down Expand Up @@ -1238,6 +1238,9 @@ def extract_dtype(value):
elif hasattr(value, "dtypes"):
return value.dtypes
elif is_scalar(value):
return np.dtype(type(value))
if value is None:
# previous type was object instead of 'float64'
return pandas.api.types.pandas_dtype(value)
return pandas.api.types.pandas_dtype(type(value))
else:
return np.array(value).dtype
6 changes: 4 additions & 2 deletions modin/core/storage_formats/pandas/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,17 @@ def corr_method(
qc._modin_frame.copy_columns_cache(),
)
new_dtypes = pandas.Series(
np.repeat(np.dtype("float"), len(new_columns)), index=new_columns
np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)),
index=new_columns,
)
elif numeric_only and qc._modin_frame.has_materialized_dtypes:
old_dtypes = qc._modin_frame.dtypes

new_columns = old_dtypes[old_dtypes.map(is_numeric_dtype)].index
new_index = new_columns.copy()
new_dtypes = pandas.Series(
np.repeat(np.dtype("float"), len(new_columns)), index=new_columns
np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)),
index=new_columns,
)
else:
new_index, new_columns, new_dtypes = None, None, None
Expand Down
4 changes: 2 additions & 2 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2734,7 +2734,7 @@ def _set_item(df, row_loc): # pragma: no cover
# we would like to convert it and get its proper internal dtype
item_type = item.to_numpy().dtype
else:
item_type = np.dtype(type(item))
item_type = pandas.api.types.pandas_dtype(type(item))

if isinstance(old_dtypes, pandas.Series):
new_dtypes[col_loc] = [
Expand Down Expand Up @@ -3062,7 +3062,7 @@ def _compute_duplicated(df): # pragma: no cover
hashed_modin_frame = self._modin_frame.reduce(
axis=1,
function=_compute_hash,
dtypes=np.dtype("O"),
dtypes=pandas.api.types.pandas_dtype("O"),
)
else:
hashed_modin_frame = self._modin_frame
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ def is_supported_arrow_type(dtype: pa.lib.DataType) -> bool:
return False
try:
pandas_dtype = dtype.to_pandas_dtype()
return pandas_dtype != np.dtype("O")
return pandas_dtype != pandas.api.types.pandas_dtype("O")
except NotImplementedError:
return False

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

import datetime

import numpy as np
import pandas
import pytest
from pandas.core.dtypes.common import is_datetime64_any_dtype, is_object_dtype
Expand Down Expand Up @@ -108,7 +107,7 @@ def align_datetime_dtypes(*dfs):
# datetime.time is considered to be an 'object' dtype in pandas that's why
# we have to explicitly check the values type in the column
elif (
dtype == np.dtype("O")
dtype == pandas.api.types.pandas_dtype("O")
and col not in time_cols
# HDK has difficulties with empty frames, so explicitly skip them
# https://github.com/modin-project/modin/issues/3428
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -915,5 +915,5 @@ def dtypes(self):

def _check_int_or_float(op, dtypes): # noqa: GL08
for t in dtypes:
if t.char not in _SUPPORTED_NUM_TYPE_CODES:
if not isinstance(t, np.dtype) or t.char not in _SUPPORTED_NUM_TYPE_CODES:
raise NotImplementedError(f"Operation '{op}' on type '{t.name}'")
4 changes: 2 additions & 2 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1864,7 +1864,7 @@ def idxmax(self, axis=0, skipna=True, numeric_only=False): # noqa: PR01, RT01,
"""
Return index of first occurrence of maximum over requested axis.
"""
if not all(d != np.dtype("O") for d in self._get_dtypes()):
if not all(d != pandas.api.types.pandas_dtype("O") for d in self._get_dtypes()):
raise TypeError("reduce operation 'argmax' not allowed for this dtype")
axis = self._get_axis_number(axis)
return self._reduce_dimension(
Expand All @@ -1877,7 +1877,7 @@ def idxmin(self, axis=0, skipna=True, numeric_only=False): # noqa: PR01, RT01,
"""
Return index of first occurrence of minimum over requested axis.
"""
if not all(d != np.dtype("O") for d in self._get_dtypes()):
if not all(d != pandas.api.types.pandas_dtype("O") for d in self._get_dtypes()):
raise TypeError("reduce operation 'argmin' not allowed for this dtype")
axis = self._get_axis_number(axis)
return self._reduce_dimension(
Expand Down
21 changes: 14 additions & 7 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1590,7 +1590,9 @@ def prod(
):
new_index = self.columns if not axis else self.index
return Series(
[np.nan] * len(new_index), index=new_index, dtype=np.dtype("object")
[np.nan] * len(new_index),
index=new_index,
dtype=pandas.api.types.pandas_dtype("object"),
)

data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
Expand Down Expand Up @@ -2107,7 +2109,9 @@ def sum(
):
new_index = self.columns if not axis else self.index
return Series(
[np.nan] * len(new_index), index=new_index, dtype=np.dtype("object")
[np.nan] * len(new_index),
index=new_index,
dtype=pandas.api.types.pandas_dtype("object"),
)

data = self._validate_dtypes_sum_prod_mean(
Expand Down Expand Up @@ -2991,8 +2995,8 @@ def _validate_dtypes_min_max(self, axis, numeric_only):
):
# check if there are columns with dtypes datetime or timedelta
if all(
dtype != np.dtype("datetime64[ns]")
and dtype != np.dtype("timedelta64[ns]")
dtype != pandas.api.types.pandas_dtype("datetime64[ns]")
and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]")
for dtype in self.dtypes
):
raise TypeError("Cannot compare Numeric and Non-Numeric Types")
Expand Down Expand Up @@ -3024,7 +3028,10 @@ def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False):
if (
not axis
and numeric_only is False
and any(dtype == np.dtype("datetime64[ns]") for dtype in self.dtypes)
and any(
dtype == pandas.api.types.pandas_dtype("datetime64[ns]")
for dtype in self.dtypes
)
):
raise TypeError("Cannot add Timestamp Types")

Expand All @@ -3042,8 +3049,8 @@ def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False):
):
# check if there are columns with dtypes datetime or timedelta
if all(
dtype != np.dtype("datetime64[ns]")
and dtype != np.dtype("timedelta64[ns]")
dtype != pandas.api.types.pandas_dtype("datetime64[ns]")
and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]")
for dtype in self.dtypes
):
raise TypeError("Cannot operate on Numeric and Non-Numeric Types")
Expand Down
5 changes: 5 additions & 0 deletions modin/pandas/test/dataframe/test_map_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,11 @@ def test_astype():
expected_df_casted = expected_df.astype(str)
df_equals(modin_df_casted, expected_df_casted)

# pandas nullable dtype
modin_df_casted = modin_df.astype("Float64")
expected_df_casted = expected_df.astype("Float64")
df_equals(modin_df_casted, expected_df_casted)

modin_df_casted = modin_df.astype("category")
expected_df_casted = expected_df.astype("category")
df_equals(modin_df_casted, expected_df_casted)
Expand Down
3 changes: 1 addition & 2 deletions modin/pandas/test/dataframe/test_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import pytest

import modin.pandas as pd
from modin.config import Engine, NPartitions, StorageFormat
from modin.config import NPartitions, StorageFormat
from modin.pandas.test.utils import (
arg_keys,
assert_dtypes_equal,
Expand Down Expand Up @@ -316,7 +316,6 @@ def test_sum(data, axis, skipna, is_transposed, request):
df_equals(modin_result, pandas_result)


@pytest.mark.skipif(Engine.get() == "Native", reason="Fails on HDK")
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_dtype_consistency(dtype):
# test for issue #6781
Expand Down
Loading