chore: improving sklearn compatibility via parametrize_with_checks #660

Merged
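For context, the pattern this PR leans on is scikit-learn's `parametrize_with_checks`. A minimal sketch of how such a test is usually wired up (the estimator list and test name below are illustrative, not the actual suite added in this PR, and the `RandomRegressor` import path is assumed):

```python
from sklearn.utils.estimator_checks import parametrize_with_checks

from sklego.dummy import RandomRegressor  # import path assumed for illustration


# Each (estimator, check) pair becomes its own pytest case, so a failure points at a
# specific scikit-learn compatibility rule instead of one monolithic assertion.
@parametrize_with_checks([RandomRegressor()])
def test_sklearn_compatible_estimator(estimator, check):
    check(estimator)
```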
4 changes: 2 additions & 2 deletions .github/workflows/schedule-dependencies.yml
@@ -24,7 +24,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install wheel
- python -m pip install ${{ matrix.pre-release-dependencies }} -e ".[test-dep]"
+ python -m pip install ${{ matrix.pre-release-dependencies }} -e ".[test]"
python -m pip freeze
- name: Test with pytest
run: pytest -n auto --disable-warnings --cov=sklego -m "not cvxpy and not formulaic and not umap"
@@ -47,7 +47,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install wheel
- python -m pip install ${{ matrix.pre-release-dependencies }} -e ".[test-dep,${{ matrix.extra }}]"
+ python -m pip install ${{ matrix.pre-release-dependencies }} -e ".[test,${{ matrix.extra }}]"
python -m pip freeze
- name: Test with pytest
run: pytest -n auto --disable-warnings --cov=sklego -m "${{ matrix.extra }}"
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -45,9 +45,9 @@ issue-tracker = "https://github.com/koaning/scikit-lego/issues"
documentation = "https://koaning.github.io/scikit-lego/"

[project.optional-dependencies]
- cvxpy = ["cmake", "osqp", "cvxpy>=1.1.8", "numpy<2.0"]
+ cvxpy = ["cmake", "osqp", "cvxpy>=1.1.8"]
formulaic = ["formulaic>=0.6.0"]
- umap = ["umap-learn>=0.4.6", "numpy<2.0"]
+ umap = ["umap-learn>=0.4.6"]
FBruzzesi (Collaborator, Author): That's the good news! They are both working with numpy 2.0rc.


all = ["scikit-lego[cvxpy,formulaic,umap]"]

@@ -60,15 +60,15 @@ docs = [
"mkdocstrings-python>=1.7.3",
]

- test-dep = [
+ test = [
"pytest>=6.2.5",
"pytest-xdist>=1.34.0",
"pytest-cov>=2.6.1",
"pytest-mock>=1.6.3",
]

test-all = [
- "scikit-lego[all,test-dep]",
+ "scikit-lego[all,test]",
]

utils = [
10 changes: 3 additions & 7 deletions sklego/decomposition/pca_reconstruction.py
@@ -4,7 +4,7 @@
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted


- class PCAOutlierDetection(BaseEstimator, OutlierMixin):
+ class PCAOutlierDetection(OutlierMixin, BaseEstimator):
"""`PCAOutlierDetection` is an outlier detector based on the reconstruction error from PCA.

If the difference between original and reconstructed data is larger than the `threshold`, the point is
@@ -93,13 +93,9 @@ def fit(self, X, y=None):
)
self.pca_.fit(X, y)
self.offset_ = -self.threshold
- return self
-
- def transform(self, X):
FBruzzesi (Collaborator, Author), May 9, 2024: This method was throwing off the checks, as they dynamically look for method names.

- """Transform the data using the underlying PCA method."""
- X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
- check_is_fitted(self, ["pca_", "offset_"])
- return self.pca_.transform(X)
+ self.n_features_in_ = X.shape[1]
+ return self

def difference(self, X):
"""Return the calculated difference between original and reconstructed data. Row by row.
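The review note above about checks that "dynamically look for method names" is the key to this change: the common tests pick which contract to enforce by probing the estimator's methods, so an outlier detector that also exposes `transform` gets held to the transformer contract too. A simplified sketch of that selection idea (not scikit-learn's actual code; the check names are real, the logic is illustrative):

```python
# Illustrative only: scikit-learn's estimator_checks module selects checks in a more
# elaborate way, but the duck typing shown here is the gist of it.
def relevant_checks(estimator):
    checks = ["check_estimators_dtypes", "check_fit2d_1sample"]
    if hasattr(estimator, "transform"):
        checks.append("check_transformer_general")   # transformer contract applies
    if hasattr(estimator, "fit_predict"):
        checks.append("check_outliers_fit_predict")  # outlier-detector contract applies
    return checks
```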
33 changes: 25 additions & 8 deletions sklego/decomposition/umap_reconstruction.py
@@ -5,8 +5,11 @@

umap = NotInstalledPackage("umap-learn")

+ from numbers import Integral

import numpy as np
from sklearn.base import BaseEstimator, OutlierMixin
+ from sklearn.utils._param_validation import Interval
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted


@@ -44,6 +47,12 @@ class UMAPOutlierDetection(BaseEstimator, OutlierMixin):
The offset used for the decision function.
"""

+ _parameter_constraints: dict = {
+ "n_components": [
+ Interval(Integral, 1, None, closed="left"),
+ ]
+ }

def __init__(
self,
n_components=2,
@@ -85,8 +94,9 @@ def fit(self, X, y=None):
- If `threshold` is `None`.
"""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
- if self.n_components < 2:
- raise ValueError("Number of components must be at least two.")
+ if y is not None:
+ y = check_array(y, estimator=self, ensure_2d=False)

if not self.threshold:
raise ValueError("The `threshold` value cannot be `None`.")

@@ -99,14 +109,9 @@
)
self.umap_.fit(X, y)
self.offset_ = -self.threshold
+ self.n_features_in_ = X.shape[1]
return self

- def transform(self, X):
- """Transform the data using the underlying UMAP method."""
- X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
- check_is_fitted(self, ["umap_", "offset_"])
- return self.umap_.transform(X)

def difference(self, X):
"""Return the calculated difference between original and reconstructed data. Row by row.

@@ -148,3 +153,15 @@ def predict(self, X):
result = np.ones(X.shape[0])
result[self.difference(X) > self.threshold] = -1
return result.astype(int)

+ def decision_function(self, X):
+ """Calculate the decision function for the data as the difference between `threshold` and the `.difference(X)`
+ (which is the difference between original data and reconstructed data)."""
+ return self.threshold - self.difference(X)
+
+ def score_samples(self, X):
+ """Calculate the score for the samples"""
+ return -self.difference(X)
+
+ def _more_tags(self):
+ return {"non_deterministic": True}
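A quick consistency check on the scoring methods added above (a standalone sketch with made-up numbers, not library code): because `offset_ = -threshold`, the new `decision_function` and `score_samples` respect scikit-learn's usual outlier-detector relationship `decision_function(X) == score_samples(X) - offset_`, and `predict` returns -1 exactly where the decision function is negative.

```python
import numpy as np

threshold = 0.15
difference = np.array([0.05, 0.30])          # hypothetical reconstruction errors

offset_ = -threshold
score_samples = -difference                  # higher means more "normal"
decision_function = threshold - difference   # positive -> inlier, negative -> outlier
predict = np.where(difference > threshold, -1, 1)

assert np.allclose(decision_function, score_samples - offset_)
assert np.array_equal(predict, np.where(decision_function < 0, -1, 1))
```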
3 changes: 3 additions & 0 deletions sklego/dummy.py
@@ -124,3 +124,6 @@ def allowed_strategies(self):
DeprecationWarning,
)
return self._ALLOWED_STRATEGIES

+ def _more_tags(self):
+ return {"poor_score": True, "non_deterministic": True}
FBruzzesi (Collaborator, Author): I love the poor_score tag 😂😂
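For readers unfamiliar with estimator tags, a hedged sketch of what a `_more_tags` declaration buys (the estimator below is hypothetical): `poor_score` tells the common checks not to insist on a decent score for this estimator, and `non_deterministic` relaxes the reproducibility assertions.

```python
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin


class MeanOfTargetRegressor(RegressorMixin, BaseEstimator):  # hypothetical example
    """Always predicts the training mean of y; accuracy is not the point here."""

    def fit(self, X, y):
        self.mean_ = float(np.mean(y))
        self.n_features_in_ = np.asarray(X).shape[1]
        return self

    def predict(self, X):
        return np.full(np.asarray(X).shape[0], self.mean_)

    def _more_tags(self):
        # Merged into the default estimator tags and read by the common checks.
        return {"poor_score": True, "non_deterministic": True}
```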

4 changes: 3 additions & 1 deletion sklego/feature_selection/mrmr.py
@@ -130,6 +130,8 @@ class MaximumRelevanceMinimumRedundancy(SelectorMixin, BaseEstimator):
```
"""

+ _required_parameters = ["k"]

def __init__(self, k, *, relevance_func="f", redundancy_func="p", kind="auto"):
self.k = k
self.relevance_func = relevance_func
@@ -199,7 +201,7 @@ def fit(self, X, y):

k parameter is not integer type or is < n_features_in (X.shape[1]) or < 1
"""
- X, y = check_X_y(X, y)
+ X, y = check_X_y(X, y, dtype="numeric", y_numeric=True)
self._y_dtype = y.dtype

relevance = self._get_relevance
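Because `k` has no default, the selector cannot be constructed without arguments; `_required_parameters` is the hint scikit-learn's check machinery reads for that, and in practice the common checks are driven from an instance built with an explicit `k`. A short sketch under that assumption (import path assumed):

```python
from sklearn.utils.estimator_checks import parametrize_with_checks

from sklego.feature_selection import MaximumRelevanceMinimumRedundancy  # path assumed


# The checks receive a ready-made instance, so the required `k` is already supplied.
@parametrize_with_checks([MaximumRelevanceMinimumRedundancy(k=1)])
def test_mrmr_against_common_checks(estimator, check):
    check(estimator)
```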
22 changes: 14 additions & 8 deletions sklego/linear_model.py
@@ -84,6 +84,7 @@ def fit(self, X, y):
raise ValueError(f"Param `sigma` must be >= 0, got: {self.sigma}")
self.X_ = X
self.y_ = y
+ self.n_features_in_ = X.shape[1]
return self

def _calc_wts(self, x_i):
@@ -491,14 +492,19 @@ def fit(self, X, y):
raise ValueError(f"penalty should be either 'l1' or 'none', got {self.penalty}")

self.sensitive_col_idx_ = self.sensitive_cols

if isinstance(X, pd.DataFrame):
self.sensitive_col_idx_ = [i for i, name in enumerate(X.columns) if name in self.sensitive_cols]
X, y = check_X_y(X, y, accept_large_sparse=False)

sensitive = X[:, self.sensitive_col_idx_]
if not self.train_sensitive_cols:
X = np.delete(X, self.sensitive_col_idx_, axis=1)

X = self._add_intercept(X)
+ self.n_features_in_ = (
+ X.shape[1] - self.fit_intercept
+ ) # + (1 - int(self.train_sensitive_cols)) * len(self.sensitive_col_idx_)
FBruzzesi (Collaborator, Author): This is not necessarily true.


column_or_1d(y)
label_encoder = LabelEncoder().fit(y)
@@ -1017,17 +1023,16 @@ def __init__(

def _get_objective(self, X, y, sample_weight):
def imbalanced_loss(params):
- return 0.5 * np.mean(
- sample_weight
- * np.where(X @ params > y, self.overestimation_punishment_factor, 1)
- * np.square(y - X @ params)
+ return 0.5 * np.average(
+ np.where(X @ params > y, self.overestimation_punishment_factor, 1) * np.square(y - X @ params),
+ weights=sample_weight,
) + self._regularized_loss(params)

def grad_imbalanced_loss(params):
return (
-(sample_weight * np.where(X @ params > y, self.overestimation_punishment_factor, 1) * (y - X @ params))
@ X
- / X.shape[0]
+ / sample_weight.sum()
) + self._regularized_grad_loss(params)

return imbalanced_loss, grad_imbalanced_loss
@@ -1128,15 +1133,16 @@ def __init__(

def _get_objective(self, X, y, sample_weight):
def quantile_loss(params):
- return np.mean(
- sample_weight * np.where(X @ params < y, self.quantile, 1 - self.quantile) * np.abs(y - X @ params)
+ return np.average(
+ np.where(X @ params < y, self.quantile, 1 - self.quantile) * np.abs(y - X @ params),
+ weights=sample_weight,
) + self._regularized_loss(params)

def grad_quantile_loss(params):
return (
-(sample_weight * np.where(X @ params < y, self.quantile, 1 - self.quantile) * np.sign(y - X @ params))
@ X
- / X.shape[0]
+ / sample_weight.sum()
Comment on lines +1148 to +1157

FBruzzesi (Collaborator, Author), May 10, 2024: Hey @Garve, sorry to drag you into this mess. No need for you to read all the changes. TL;DR: in this PR I added tests using scikit-learn's parametrize_with_checks to check for (better) compatibility. As some of the tests were failing, I took a closer look and adjusted the formulas to take sample_weight into account. However, we still have tests failing for just one of the solvers. I wonder if you have any clue or insight on why this is the case.

koaning (Owner): If it's really just one solver I would just not sweat it. We should document it for sure, but we can always patch that later.

FBruzzesi (Collaborator, Author), May 15, 2024: Apparently, for Ubuntu and Windows the 'TNC' solver is failing as well, but only for quantile=0.3 and fit_intercept=False, not in the other cases (runs). I am skipping that as well and improving the warning in the class description.

) + self._regularized_grad_loss(params)

return quantile_loss, grad_quantile_loss
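All of the loss rewrites above follow the same reasoning, summarised in this small standalone check: `np.average(v, weights=w)` equals `(w * v).sum() / w.sum()`, which collapses to the old `np.mean(w * v)` only when every weight is 1, and that is also why the matching gradients now divide by `sample_weight.sum()` rather than `X.shape[0]`.

```python
import numpy as np

rng = np.random.default_rng(0)
residual_sq = rng.normal(size=5) ** 2
w = np.array([1.0, 2.0, 0.5, 1.0, 3.0])

# np.average normalizes by the sum of the weights ...
assert np.isclose(np.average(residual_sq, weights=w), (w * residual_sq).sum() / w.sum())

# ... so it only matches the previous formulation when all weights are one.
ones = np.ones_like(w)
assert np.isclose(np.average(residual_sq, weights=ones), np.mean(ones * residual_sq))
assert not np.isclose(np.average(residual_sq, weights=w), np.mean(w * residual_sq))
```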
20 changes: 5 additions & 15 deletions sklego/meta/_grouped_utils.py
@@ -6,23 +6,22 @@
from sklearn.utils import check_array
from sklearn.utils.validation import _ensure_no_complex_data

- from sklego.common import as_list


def _split_groups_and_values(
X, groups, name="", min_value_cols=1, check_X=True, **kwargs
) -> Tuple[pd.DataFrame, np.ndarray]:
_data_format_checks(X, name=name)
- _shape_check(X, min_value_cols)
+ check_array(X, ensure_min_features=min_value_cols, dtype=None)

print(groups)
try:
if isinstance(X, pd.DataFrame):
- X_group = X.loc[:, as_list(groups)]
+ X_group = X.loc[:, groups]
X_value = X.drop(columns=groups).values
else:
- X_group = pd.DataFrame(X[:, as_list(groups)])
+ X_group = pd.DataFrame(X[:, groups])
pos_indexes = range(X.shape[1])
- X_value = np.delete(X, [pos_indexes[g] for g in as_list(groups)], axis=1)
+ X_value = np.delete(X, [pos_indexes[g] for g in groups], axis=1)
except (KeyError, IndexError):
raise ValueError(f"Could not drop groups {groups} from columns of X")

@@ -41,15 +40,6 @@ def _data_format_checks(X, name):
raise ValueError(f"The estimator {name} does not work on sparse matrices")


- def _shape_check(X, min_value_cols):
- if min_value_cols > 1:
- if X.ndim == 1 or X.shape[1] < 2:
- raise ValueError(f"0 feature(s) (shape={X.shape}) while a minimum of {min_value_cols} is required.")
- else:
- if X.ndim == 2 and X.shape[1] < 1:
- raise ValueError(f"0 feature(s) (shape={X.shape}) while a minimum of {min_value_cols} is required.")


def _check_grouping_columns(X_group, **kwargs) -> pd.DataFrame:
"""Do basic checks on grouping columns"""
# Do regular checks on numeric columns
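The deleted `_shape_check` helper is covered by `check_array` itself, which is what the new call relies on. A standalone sketch of that behaviour with toy data:

```python
import numpy as np
from sklearn.utils import check_array

X = np.ones((5, 2))
check_array(X, ensure_min_features=1, dtype=None)  # passes

try:
    # No value columns left at all: sklearn raises its own descriptive ValueError.
    check_array(X[:, :0], ensure_min_features=1, dtype=None)
except ValueError as exc:
    print(exc)
```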
15 changes: 9 additions & 6 deletions sklego/meta/_shrinkage_utils.py
@@ -1,3 +1,5 @@
+ from functools import partial

import numpy as np
from sklearn.utils.validation import check_is_fitted

@@ -47,6 +49,11 @@ def relative_shrinkage(group_sizes) -> np.ndarray:
return np.asarray(group_sizes)


+ def no_shrinkage_function(x, n):
+ # n = len(self.fitted_levels_[-1])
+ return np.pad([1], (len(x) - 1, n - len(x)), "constant", constant_values=(0))


def min_n_obs_shrinkage(group_sizes, min_n_obs: int) -> np.ndarray:
"""Use only the smallest group with a certain amount of observations.

@@ -117,7 +124,7 @@ def _set_shrinkage_function(self):

elif self.shrinkage is None:
"""Instead of keeping two different behaviors for shrinkage and non-shrinkage cases, this conditional block
- maps no shrinkage to a constant shrinkage function, wit all the weight on the grouped passed,
+ maps no shrinkage to a constant shrinkage function, with all the weight on the grouped passed,
independently from the level sizes, as expected from the other shrinkage functions (*).
This allows the rest of the code to be agnostic to the shrinkage function, and the shrinkage factors.

@@ -150,11 +157,7 @@ def _set_shrinkage_function(self):
}
"""

- def no_shrinkage_function(x):
- n = len(self.fitted_levels_[-1])
- return np.pad([1], (len(x) - 1, n - len(x)), "constant", constant_values=(0))

- shrinkage_function_ = no_shrinkage_function
+ shrinkage_function_ = partial(no_shrinkage_function, n=self.n_fitted_levels_)

else:
raise ValueError(
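The module-level `no_shrinkage_function` is a one-hot constructor: with `n` set to the total number of fitted levels, it places all the weight at position `len(x) - 1`, the most specific level present in `x`. A standalone sketch with hypothetical level lists:

```python
from functools import partial

import numpy as np


def no_shrinkage_function(x, n):
    return np.pad([1], (len(x) - 1, n - len(x)), "constant", constant_values=(0))


shrink = partial(no_shrinkage_function, n=3)  # e.g. three fitted levels in total
print(shrink([("a",)]))                               # [1 0 0]
print(shrink([("a",), ("a", "b")]))                   # [0 1 0]
print(shrink([("a",), ("a", "b"), ("a", "b", "c")]))  # [0 0 1]
```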
19 changes: 12 additions & 7 deletions sklego/meta/confusion_balancer.py
@@ -1,3 +1,4 @@
+ from sklearn import clone
from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
@@ -6,7 +7,7 @@
from sklego.base import ProbabilisticClassifier


- class ConfusionBalancer(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
+ class ConfusionBalancer(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):
r"""The `ConfusionBalancer` estimator attempts to give it's child estimator a more balanced output by learning from
the confusion matrix during training.

@@ -33,6 +34,8 @@ class that the underlying model gives. We use these probabilities to attempt a m
The confusion matrix used for the correction.
"""

+ _required_parameters = ["estimator"]

def __init__(self, estimator, alpha: float = 0.5, cfm_smooth=0):
self.estimator = estimator
self.alpha = alpha
@@ -65,10 +68,11 @@ def fit(self, X, y):
raise ValueError(
"The ConfusionBalancer meta model only works on classification models with .predict_proba."
)
- self.estimator.fit(X, y)
+ self.estimator_ = clone(self.estimator).fit(X, y)
self.classes_ = unique_labels(y)
- cfm = confusion_matrix(y, self.estimator.predict(X)).T + self.cfm_smooth
+ cfm = confusion_matrix(y, self.estimator_.predict(X)).T + self.cfm_smooth
self.cfm_ = cfm / cfm.sum(axis=1).reshape(-1, 1)
+ self.n_features_in_ = X.shape[1]
return self

def predict_proba(self, X):
@@ -85,8 +89,9 @@ def predict_proba(self, X):
array-like of shape (n_samples, n_classes)
The predicted values.
"""
- X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
- preds = self.estimator.predict_proba(X)
+ check_is_fitted(self, ["cfm_", "classes_", "estimator_"])
+ X = check_array(X, dtype=FLOAT_DTYPES)
+ preds = self.estimator_.predict_proba(X)
return (1 - self.alpha) * preds + self.alpha * preds @ self.cfm_

def predict(self, X):
@@ -102,6 +107,6 @@ def predict(self, X):
array-like of shape (n_samples,)
The predicted values.
"""
- check_is_fitted(self, ["cfm_", "classes_"])
- X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+ check_is_fitted(self, ["cfm_", "classes_", "estimator_"])
+ X = check_array(X, dtype=FLOAT_DTYPES)
return self.classes_[self.predict_proba(X).argmax(axis=1)]
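The switch to `clone(self.estimator).fit(...)` follows the scikit-learn convention that `fit` must not modify anything passed to `__init__`; the fitted copy lives in the trailing-underscore attribute instead. A small standalone sketch of that behaviour:

```python
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

inner = LogisticRegression()
fitted = clone(inner).fit([[0.0], [1.0]], [0, 1])

assert not hasattr(inner, "coef_")  # the object handed in stays untouched
assert hasattr(fitted, "coef_")     # the clone carries the fitted state
```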