FIX-#6793: use 'pandas_dtype' instead of 'np.dtype' for some more pla…

…ces in Modin code Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
modin-project · dchigarev · Mar 28, 2024 · Dec 2, 2023 · Mar 26, 2024 · 8b154c1207228bb9af3fd5fa3b860bf06410a680
commit 8b154c1207228bb9af3fd5fa3b860bf06410a680
@@ -80,7 +80,10 @@ def maybe_compute_dtypes_common_cast(
         # belong to the intersection, these will be NaN columns in the result
         mismatch_columns = columns_first ^ columns_second
     elif isinstance(second, dict):
-        dtypes_second = {key: np.dtype(type(value)) for key, value in second.items()}
+        dtypes_second = {
+            key: pandas.api.types.pandas_dtype(type(value))
+            for key, value in second.items()
+        }
         columns_first = set(first.columns)
         columns_second = set(second.keys())
         common_columns = columns_first.intersection(columns_second)
@@ -90,16 +93,21 @@ def maybe_compute_dtypes_common_cast(
     else:
         if isinstance(second, (list, tuple)):
             second_dtypes_list = (
-                [np.dtype(type(value)) for value in second]
+                [pandas.api.types.pandas_dtype(type(value)) for value in second]
                 if axis == 1
                 # Here we've been given a column so it has only one dtype,
                 # Infering the dtype using `np.array`, TODO: maybe there's more efficient way?
                 else [np.array(second).dtype] * len(dtypes_first)
             )
         elif is_scalar(second) or isinstance(second, np.ndarray):
-            second_dtypes_list = [
-                getattr(second, "dtype", np.dtype(type(second)))
-            ] * len(dtypes_first)
+            try:
+                dtype = getattr(second, "dtype", None) or pandas.api.types.pandas_dtype(
+                    type(second)
+                )
+            except TypeError:
+                # For example, dtype '<class 'datetime.datetime'>' not understood
+                dtype = pandas.Series(second).dtype
+            second_dtypes_list = [dtype] * len(dtypes_first)
         else:
             raise NotImplementedError(
                 f"Can't compute common type for {type(first)} and {type(second)}."
@@ -117,7 +125,7 @@ def maybe_compute_dtypes_common_cast(
         mismatch_columns = []
 
     # If at least one column doesn't match, the result of the non matching column would be nan.
-    nan_dtype = np.dtype(type(np.nan))
+    nan_dtype = pandas.api.types.pandas_dtype(type(np.nan))
     dtypes = None
     if func is not None:
         try:
@@ -242,7 +250,9 @@ def try_compute_new_dtypes(
 
     try:
         if infer_dtypes == "bool" or is_bool_dtype(result_dtype):
-            dtypes = maybe_build_dtypes_series(first, second, dtype=np.dtype(bool))
+            dtypes = maybe_build_dtypes_series(
+                first, second, dtype=pandas.api.types.pandas_dtype(bool)
+            )
         elif infer_dtypes == "common_cast":
             dtypes = maybe_compute_dtypes_common_cast(
                 first, second, axis=axis, func=None

@@ -151,7 +151,7 @@ def pandas_dtype_to_arrow_c(dtype: Union[np.dtype, pandas.CategoricalDtype]) ->
     """
     if isinstance(dtype, pandas.CategoricalDtype):
         return ArrowCTypes.INT64
-    elif dtype == np.dtype("O"):
+    elif dtype == pandas.api.types.pandas_dtype("O"):
         return ArrowCTypes.STRING
 
     format_str = getattr(ArrowCTypes, dtype.name.upper(), None)

@@ -28,7 +28,7 @@
 from pandas.core.dtypes.common import is_dtype_equal, is_list_like, is_numeric_dtype
 from pandas.core.indexes.api import Index, RangeIndex
 
-from modin.config import IsRayCluster, NPartitions
+from modin.config import Engine, IsRayCluster, NPartitions
 from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe
 from modin.core.dataframe.base.dataframe.utils import Axis, JoinType
 from modin.core.dataframe.pandas.dataframe.utils import (
@@ -1647,13 +1647,13 @@ def astype(self, col_dtypes, errors: str = "raise"):
                 if new_dtypes is None:
                     new_dtypes = self_dtypes.copy()
                 # Update the new dtype series to the proper pandas dtype
-                try:
-                    new_dtype = np.dtype(dtype)
-                except TypeError:
-                    new_dtype = dtype
+                new_dtype = pandas.api.types.pandas_dtype(dtype)
+                if Engine.get() == "Dask" and hasattr(dtype, "_is_materialized"):
+                    # FIXME: https://github.com/dask/distributed/issues/8585
+                    _ = dtype._materialize_categories()
 
                 # We cannot infer without computing the dtype if
-                if isinstance(new_dtype, str) and new_dtype == "category":
+                if isinstance(new_dtype, pandas.CategoricalDtype):
                     new_dtypes[column] = LazyProxyCategoricalDtype._build_proxy(
                         # Actual parent will substitute `None` at `.set_dtypes_cache`
                         parent=None,
@@ -2187,7 +2187,8 @@ def map(
                 # Materializing lazy columns in order to build dtype's index
                 new_columns = new_columns.get(return_lengths=False)
             dtypes = pandas.Series(
-                [np.dtype(dtypes)] * len(new_columns), index=new_columns
+                [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns),
+                index=new_columns,
             )
         return self.__constructor__(
             new_partitions,
@@ -3425,14 +3426,17 @@ def broadcast_apply_full_axis(
             else:
                 if new_columns is None:
                     kw["dtypes"] = ModinDtypes(
-                        DtypesDescriptor(remaining_dtype=np.dtype(dtypes))
+                        DtypesDescriptor(
+                            remaining_dtype=pandas.api.types.pandas_dtype(dtypes)
+                        )
                     )
                 else:
                     kw["dtypes"] = (
                         pandas.Series(dtypes, index=new_columns)
                         if is_list_like(dtypes)
                         else pandas.Series(
-                            [np.dtype(dtypes)] * len(new_columns), index=new_columns
+                            [pandas.api.types.pandas_dtype(dtypes)] * len(new_columns),
+                            index=new_columns,
                         )
                     )
 

@@ -504,7 +504,7 @@ def _merge_dtypes(
                         # meaning that we shouldn't try computing a new dtype for this column,
                         # so marking it as 'unknown'
                         i: (
-                            np.dtype(float)
+                            pandas.api.types.pandas_dtype(float)
                             if val._know_all_names and val._remaining_dtype is None
                             else "unknown"
                         )
@@ -531,7 +531,7 @@ def _merge_dtypes(
         def combine_dtypes(row):
             if (row == "unknown").any():
                 return "unknown"
-            row = row.fillna(np.dtype("float"))
+            row = row.fillna(pandas.api.types.pandas_dtype("float"))
             return find_common_type(list(row.values))
 
         dtypes = dtypes_matrix.apply(combine_dtypes, axis=1)
@@ -1238,6 +1238,9 @@ def extract_dtype(value):
     elif hasattr(value, "dtypes"):
         return value.dtypes
     elif is_scalar(value):
-        return np.dtype(type(value))
+        if value is None:
+            # previous type was object instead of 'float64'
+            return pandas.api.types.pandas_dtype(value)
+        return pandas.api.types.pandas_dtype(type(value))
     else:
         return np.array(value).dtype
@@ -65,15 +65,17 @@ def corr_method(
                     qc._modin_frame.copy_columns_cache(),
                 )
                 new_dtypes = pandas.Series(
-                    np.repeat(np.dtype("float"), len(new_columns)), index=new_columns
+                    np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)),
+                    index=new_columns,
                 )
             elif numeric_only and qc._modin_frame.has_materialized_dtypes:
                 old_dtypes = qc._modin_frame.dtypes
 
                 new_columns = old_dtypes[old_dtypes.map(is_numeric_dtype)].index
                 new_index = new_columns.copy()
                 new_dtypes = pandas.Series(
-                    np.repeat(np.dtype("float"), len(new_columns)), index=new_columns
+                    np.repeat(pandas.api.types.pandas_dtype("float"), len(new_columns)),
+                    index=new_columns,
                 )
             else:
                 new_index, new_columns, new_dtypes = None, None, None

@@ -2734,7 +2734,7 @@ def _set_item(df, row_loc):  # pragma: no cover
                 # we would like to convert it and get its proper internal dtype
                 item_type = item.to_numpy().dtype
             else:
-                item_type = np.dtype(type(item))
+                item_type = pandas.api.types.pandas_dtype(type(item))
 
             if isinstance(old_dtypes, pandas.Series):
                 new_dtypes[col_loc] = [
@@ -3062,7 +3062,7 @@ def _compute_duplicated(df):  # pragma: no cover
             hashed_modin_frame = self._modin_frame.reduce(
                 axis=1,
                 function=_compute_hash,
-                dtypes=np.dtype("O"),
+                dtypes=pandas.api.types.pandas_dtype("O"),
             )
         else:
             hashed_modin_frame = self._modin_frame

@@ -554,7 +554,7 @@ def is_supported_arrow_type(dtype: pa.lib.DataType) -> bool:
         return False
     try:
         pandas_dtype = dtype.to_pandas_dtype()
-        return pandas_dtype != np.dtype("O")
+        return pandas_dtype != pandas.api.types.pandas_dtype("O")
     except NotImplementedError:
         return False
 

@@ -13,7 +13,6 @@
 
 import datetime
 
-import numpy as np
 import pandas
 import pytest
 from pandas.core.dtypes.common import is_datetime64_any_dtype, is_object_dtype
@@ -108,7 +107,7 @@ def align_datetime_dtypes(*dfs):
             # datetime.time is considered to be an 'object' dtype in pandas that's why
             # we have to explicitly check the values type in the column
             elif (
-                dtype == np.dtype("O")
+                dtype == pandas.api.types.pandas_dtype("O")
                 and col not in time_cols
                 # HDK has difficulties with empty frames, so explicitly skip them
                 # https://github.com/modin-project/modin/issues/3428

@@ -915,5 +915,5 @@ def dtypes(self):
 
 def _check_int_or_float(op, dtypes):  # noqa: GL08
     for t in dtypes:
-        if t.char not in _SUPPORTED_NUM_TYPE_CODES:
+        if not isinstance(t, np.dtype) or t.char not in _SUPPORTED_NUM_TYPE_CODES:
             raise NotImplementedError(f"Operation '{op}' on type '{t.name}'")
@@ -1864,7 +1864,7 @@ def idxmax(self, axis=0, skipna=True, numeric_only=False):  # noqa: PR01, RT01,
         """
         Return index of first occurrence of maximum over requested axis.
         """
-        if not all(d != np.dtype("O") for d in self._get_dtypes()):
+        if not all(d != pandas.api.types.pandas_dtype("O") for d in self._get_dtypes()):
             raise TypeError("reduce operation 'argmax' not allowed for this dtype")
         axis = self._get_axis_number(axis)
         return self._reduce_dimension(
@@ -1877,7 +1877,7 @@ def idxmin(self, axis=0, skipna=True, numeric_only=False):  # noqa: PR01, RT01,
         """
         Return index of first occurrence of minimum over requested axis.
         """
-        if not all(d != np.dtype("O") for d in self._get_dtypes()):
+        if not all(d != pandas.api.types.pandas_dtype("O") for d in self._get_dtypes()):
             raise TypeError("reduce operation 'argmin' not allowed for this dtype")
         axis = self._get_axis_number(axis)
         return self._reduce_dimension(

@@ -1590,7 +1590,9 @@ def prod(
         ):
             new_index = self.columns if not axis else self.index
             return Series(
-                [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object")
+                [np.nan] * len(new_index),
+                index=new_index,
+                dtype=pandas.api.types.pandas_dtype("object"),
             )
 
         data = self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True)
@@ -2107,7 +2109,9 @@ def sum(
         ):
             new_index = self.columns if not axis else self.index
             return Series(
-                [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object")
+                [np.nan] * len(new_index),
+                index=new_index,
+                dtype=pandas.api.types.pandas_dtype("object"),
             )
 
         data = self._validate_dtypes_sum_prod_mean(
@@ -2991,8 +2995,8 @@ def _validate_dtypes_min_max(self, axis, numeric_only):
         ):
             # check if there are columns with dtypes datetime or timedelta
             if all(
-                dtype != np.dtype("datetime64[ns]")
-                and dtype != np.dtype("timedelta64[ns]")
+                dtype != pandas.api.types.pandas_dtype("datetime64[ns]")
+                and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]")
                 for dtype in self.dtypes
             ):
                 raise TypeError("Cannot compare Numeric and Non-Numeric Types")
@@ -3024,7 +3028,10 @@ def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False):
         if (
             not axis
             and numeric_only is False
-            and any(dtype == np.dtype("datetime64[ns]") for dtype in self.dtypes)
+            and any(
+                dtype == pandas.api.types.pandas_dtype("datetime64[ns]")
+                for dtype in self.dtypes
+            )
         ):
             raise TypeError("Cannot add Timestamp Types")
 
@@ -3042,8 +3049,8 @@ def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False):
         ):
             # check if there are columns with dtypes datetime or timedelta
             if all(
-                dtype != np.dtype("datetime64[ns]")
-                and dtype != np.dtype("timedelta64[ns]")
+                dtype != pandas.api.types.pandas_dtype("datetime64[ns]")
+                and dtype != pandas.api.types.pandas_dtype("timedelta64[ns]")
                 for dtype in self.dtypes
             ):
                 raise TypeError("Cannot operate on Numeric and Non-Numeric Types")

@@ -409,6 +409,11 @@ def test_astype():
     expected_df_casted = expected_df.astype(str)
     df_equals(modin_df_casted, expected_df_casted)
 
+    # pandas nullable dtype
+    modin_df_casted = modin_df.astype("Float64")
+    expected_df_casted = expected_df.astype("Float64")
+    df_equals(modin_df_casted, expected_df_casted)
+
     modin_df_casted = modin_df.astype("category")
     expected_df_casted = expected_df.astype("category")
     df_equals(modin_df_casted, expected_df_casted)

@@ -17,7 +17,7 @@
 import pytest
 
 import modin.pandas as pd
-from modin.config import Engine, NPartitions, StorageFormat
+from modin.config import NPartitions, StorageFormat
 from modin.pandas.test.utils import (
     arg_keys,
     assert_dtypes_equal,
@@ -316,7 +316,6 @@ def test_sum(data, axis, skipna, is_transposed, request):
     df_equals(modin_result, pandas_result)
 
 
-@pytest.mark.skipif(Engine.get() == "Native", reason="Fails on HDK")
 @pytest.mark.parametrize("dtype", ["int64", "Int64"])
 def test_dtype_consistency(dtype):
     # test for issue #6781