Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ pip install -U khiops
Other installation methods are documented at the [Khiops website][khiops-install].

### Requirements
- [Python][python] (>=3.8)
- [Pandas][pandas] (>=0.25.3)
- [Scikit-Learn][sklearn] (>=0.22.2)
- [Python][python] (>=3.10)
- [Pandas][pandas] (>=2.3.3)
- [Scikit-Learn][sklearn] (>=1.7.2)

[pandas]: https://pandas.pydata.org
[sklearn]: https://scikit-learn.org/stable
Expand Down
2 changes: 1 addition & 1 deletion doc/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ ipykernel>=6.9.1
nbconvert==6.4.4
nbformat==5.3.0
numpydoc>=1.5.0
pandas>=0.25.3,<=2.3.3
pandas>=2.3.3,<4.0.0
scikit-learn>=1.7.2,<1.9.0
sphinx-copybutton>=0.5.0
98 changes: 68 additions & 30 deletions khiops/sklearn/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

import numpy as np
import pandas as pd
import sklearn
from scipy import sparse as sp
from sklearn.utils import check_array
from sklearn.utils.validation import column_or_1d
Expand All @@ -33,6 +32,13 @@
# pylint --disable=all --enable=invalid-names dataset.py
# pylint: disable=invalid-name

# Set a special pandas option to force the new string data type (`StringDtype`)
# even on pandas 2.x, which is still required to support Python 3.10.
# This new string data type no longer maps to a NumPy data type.
# Hence, code assuming NumPy type compatibility will break unless
# this string data type is handled separately.
pd.options.future.infer_string = True


def check_dataset_spec(ds_spec):
"""Checks that a dataset spec is valid
Expand Down Expand Up @@ -393,16 +399,11 @@ def write_internal_data_table(dataframe, file_path_or_stream):


def _column_or_1d_with_dtype(y, dtype=None):
# 'dtype' has been introduced on `column_or_1d' since Scikit-learn 1.2;
if sklearn.__version__ < "1.2":
if pd.api.types.is_string_dtype(dtype) and y.isin(["True", "False"]).all():
warnings.warn(
"'y' stores strings restricted to 'True'/'False' values: "
"The predict method may return a bool vector."
)
return column_or_1d(y, warn=True)
else:
return column_or_1d(y, warn=True, dtype=dtype)
"""Checks that the data is of the provided `dtype`.
If a problem is detected, a warning is printed or an error is raised;
otherwise the pandas object is transformed into a numpy.ndarray.
"""
return column_or_1d(y, warn=True, dtype=dtype)


class Dataset:
Expand Down Expand Up @@ -607,16 +608,54 @@ def _init_target_column(self, y):
# pandas.Series, pandas.DataFrame or numpy.ndarray
else:
if hasattr(y, "dtype"):
if not isinstance(y, np.ndarray):
# Since pandas 3.0, numeric and boolean values in an array
# that contains a carriage return are wrongly inferred as
# `object` dtype instead of `int64` and `bool` respectively.
# Forcing pandas to `infer_objects` fixes the error.
if pd.api.types.is_object_dtype(y):
y = y.infer_objects()

# Since pandas 3.0 (and even in 2.0 if the option is activated)
# a new `StringDtype` is used to handle strings.
# It no longer matches any data type recognized by numpy.
# An issue was created on scikit-learn
# https://github.com/scikit-learn/scikit-learn/issues/33383
# Until it is fixed, 'y' is not checked by
# `_column_or_1d_with_dtype` when pandas dtype is `StringDtype`.

if isinstance(y.dtype, pd.CategoricalDtype):
y_checked = _column_or_1d_with_dtype(
y, dtype=y.dtype.categories.dtype
y,
dtype=(
y.dtype.categories.dtype
if not pd.api.types.is_string_dtype(
y.dtype.categories.dtype
)
else None
),
)
else:
y_checked = _column_or_1d_with_dtype(y, dtype=y.dtype)
y_checked = _column_or_1d_with_dtype(
y,
dtype=(
y.dtype
if not pd.api.types.is_string_dtype(y.dtype)
else None
),
)
elif hasattr(y, "dtypes"):
if isinstance(y.dtypes.iloc[0], pd.CategoricalDtype):
y_checked = _column_or_1d_with_dtype(
y, dtype=y.dtypes.iloc[0].categories.dtype
y,
dtype=(
y.dtypes.iloc[0].categories.dtype
if not pd.api.types.is_string_dtype(
y.dtypes.iloc[0].categories.dtype
)
else None
),
)
else:
y_checked = _column_or_1d_with_dtype(y)
Expand Down Expand Up @@ -965,21 +1004,16 @@ def __init__(self, name, dataframe, key=None):

# Initialize feature columns and verify their types
self.column_ids = self.data_source.columns.values
if not np.issubdtype(self.column_ids.dtype, np.integer):
if np.issubdtype(self.column_ids.dtype, object):
for i, column_id in enumerate(self.column_ids):
if not isinstance(column_id, str):
raise TypeError(
f"Dataframe column ids must be either all integers or "
f"all strings. Column id at index {i} ('{column_id}') is"
f" of type '{type(column_id).__name__}'"
)
else:
raise TypeError(
f"Dataframe column ids must be either all integers or "
f"all strings. The column index has dtype "
f"'{self.column_ids.dtype}'"
)
# Ensure the feature columns are either all string
# or all numeric but not a mix of both.
if not pd.api.types.is_numeric_dtype(
self.column_ids
) and not pd.api.types.is_string_dtype(self.column_ids):
raise TypeError(
"Dataframe column ids must be either all integers or "
"all strings. Columns have the following mixed types: "
f"{sorted(set([type(cid).__name__ for cid in self.column_ids]))}."
)

# Initialize Khiops types
self.khiops_types = {}
Expand All @@ -988,7 +1022,11 @@ def __init__(self, name, dataframe, key=None):
column_numpy_type = column.dtype
column_max_size = None
if isinstance(column_numpy_type, pd.StringDtype):
column_max_size = column.str.len().max()
# If any value is missing in the column,
# column.str.len() is typed as float64 instead of int64.
# Until this is fixed upstream, the type is forced to nullable Int64.
# cf https://github.com/pandas-dev/pandas/issues/51948
column_max_size = column.str.len().astype(pd.Int64Dtype()).max()
self.khiops_types[column_id] = get_khiops_type(
column_numpy_type, column_max_size
)
Expand Down
4 changes: 2 additions & 2 deletions khiops/sklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2021,7 +2021,7 @@ def predict_proba(self, X):
y_probas, (pd.DataFrame, np.ndarray)
), "y_probas is not a Pandas DataFrame nor Numpy array"
y_probas = y_probas.reindex(
self._sorted_prob_variable_names(), axis=1, copy=False
self._sorted_prob_variable_names(), axis=1
).to_numpy(copy=False)

assert isinstance(y_probas, (str, np.ndarray)), "Expected str or np.ndarray"
Expand Down Expand Up @@ -2265,7 +2265,7 @@ def predict(self, X):

# Transform to np.ndarray
if isinstance(y_pred, pd.DataFrame):
y_pred = y_pred.astype("float64", copy=False).to_numpy(copy=False).ravel()
y_pred = y_pred.astype("float64").to_numpy(copy=False).ravel()

assert isinstance(y_pred, (str, np.ndarray)), "Expected str or np.array"
return y_pred
Expand Down
2 changes: 1 addition & 1 deletion packaging/conda/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ requirements:
run:
- python
- khiops-core =11.0.0
- pandas >=0.25.3,<=2.3.3
- pandas >=2.3.3,<4.0.0
- scikit-learn>=1.7.2,<1.9.0
run_constrained:
# do not necessary use the latest version
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ classifiers = [
requires-python = ">=3.10"
dependencies = [
# do not use the latest versions, to avoid undesired breaking changes
"pandas>=0.25.3,<=2.3.3",
"pandas>=2.3.3,<4.0.0",
"scikit-learn>=1.7.2,<1.9.0",
]

Expand Down
7 changes: 7 additions & 0 deletions tests/test_dataset_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,13 @@ def test_out_file_from_dataframe_monotable(self):
ref_table = spec["main_table"][0]
ref_table["class"] = y

# Since pandas 3.0 the default precision for parsing a datetime
# is now microseconds (us) instead of nanoseconds (ns)
# unless enough precision is given.
# Unfortunately only the changelog states this, not the docstring.
# To avoid any comparison error in the tests,
# we need to set the required precision (ns) on the datetime column.
ref_table["Date"] = ref_table["Date"].astype("datetime64[ns]")
# Check that the dataframes are equal
assert_frame_equal(
ref_table.sort_values(by="User_ID").reset_index(drop=True),
Expand Down
2 changes: 1 addition & 1 deletion tests/test_dataset_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,6 @@ def test_pandas_table_column_ids_must_all_be_int_or_str(self):
output_error_msg = str(context.exception)
expected_msg = (
"Dataframe column ids must be either all integers or all "
"strings. Column id at index 0 ('1') is of type 'int'"
"strings. Columns have the following mixed types: ['int', 'str']."
)
self.assertEqual(output_error_msg, expected_msg)
Loading