chore: improving sklearn compatibility via parametrize_with_checks (#660)
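For readers unfamiliar with the test harness this PR adds: parametrize_with_checks turns scikit-learn's common estimator checks into parametrized pytest cases, one per (estimator, check) pair. A minimal sketch of the usual wiring — the import path for PCAOutlierDetection is assumed, and the actual test added in this PR may differ:

    from sklearn.utils.estimator_checks import parametrize_with_checks
    from sklego.decomposition import PCAOutlierDetection  # assumed import path

    @parametrize_with_checks([PCAOutlierDetection()])
    def test_sklearn_compatible_estimator(estimator, check):
        # each common check runs as its own pytest case
        check(estimator)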
File 1

@@ -4,7 +4,7 @@
 from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted


-class PCAOutlierDetection(BaseEstimator, OutlierMixin):
+class PCAOutlierDetection(OutlierMixin, BaseEstimator):
     """`PCAOutlierDetection` is an outlier detector based on the reconstruction error from PCA.

     If the difference between original and reconstructed data is larger than the `threshold`, the point is
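Context for the reordered bases, not stated in the PR itself: scikit-learn's convention is mixins first and BaseEstimator last, so mixin-provided behavior wins in the method resolution order. A toy check of that ordering:

    from sklearn.base import BaseEstimator, OutlierMixin

    class Detector(OutlierMixin, BaseEstimator):  # mixins first, BaseEstimator last
        def fit(self, X, y=None):
            return self

    # OutlierMixin precedes BaseEstimator in the MRO, so e.g. its fit_predict is used
    print([cls.__name__ for cls in Detector.__mro__])
    # ['Detector', 'OutlierMixin', 'BaseEstimator', 'object']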
@@ -93,13 +93,9 @@ def fit(self, X, y=None):
         )
         self.pca_.fit(X, y)
         self.offset_ = -self.threshold
-        return self
-
-    def transform(self, X):
-        """Transform the data using the underlying PCA method."""
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
-        check_is_fitted(self, ["pca_", "offset_"])
-        return self.pca_.transform(X)
+        self.n_features_in_ = X.shape[1]
+        return self

     def difference(self, X):
         """Return the calculated difference between original and reconstructed data. Row by row.

Review comment (on the removed transform method): This method was throwing off the checks, as they dynamically look for method names.
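Why a public transform method "throws off" the checks (my paraphrase): the check suite decides which groups of checks apply by probing for method names. The selector below is a hypothetical stand-in for that duck typing, not scikit-learn's actual code:

    def select_check_groups(estimator):
        # hypothetical selector mimicking the duck typing in the common checks
        groups = ["common"]
        if hasattr(estimator, "transform"):
            groups.append("transformer")  # a stray transform pulls in transformer checks
        if hasattr(estimator, "predict"):
            groups.append("predictor")
        return groups

    class Detector:
        def fit(self, X, y=None):
            return self

        def predict(self, X):
            return X

    class DetectorWithTransform(Detector):
        def transform(self, X):
            return X

    print(select_check_groups(Detector()))               # ['common', 'predictor']
    print(select_check_groups(DetectorWithTransform()))  # ['common', 'transformer', 'predictor']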
File 2

@@ -124,3 +124,6 @@ def allowed_strategies(self):
             DeprecationWarning,
         )
         return self._ALLOWED_STRATEGIES
+
+    def _more_tags(self):
+        return {"poor_score": True, "non_deterministic": True}

Review comment: I love the […]
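For context (my illustration, not PR code): _more_tags is the pre-1.6 scikit-learn hook for declaring estimator tags, and poor_score / non_deterministic relax the corresponding common checks. A sketch of how the tags surface, assuming a scikit-learn version where BaseEstimator still exposes the private _get_tags accessor; the estimator here is hypothetical:

    from sklearn.base import BaseEstimator, RegressorMixin

    class NoisyRegressor(RegressorMixin, BaseEstimator):
        # hypothetical estimator whose predictions are intentionally random
        def _more_tags(self):
            return {"poor_score": True, "non_deterministic": True}

    tags = NoisyRegressor()._get_tags()  # merges _more_tags over the defaults
    print(tags["poor_score"], tags["non_deterministic"])  # True True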
File 3

@@ -84,6 +84,7 @@ def fit(self, X, y):
             raise ValueError(f"Param `sigma` must be >= 0, got: {self.sigma}")
         self.X_ = X
         self.y_ = y
+        self.n_features_in_ = X.shape[1]
         return self

     def _calc_wts(self, x_i):
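Background for the recurring n_features_in_ additions in this PR: the common checks require a fitted estimator to record the number of input features under that exact attribute name. A minimal hypothetical estimator showing the convention:

    import numpy as np
    from sklearn.base import BaseEstimator, RegressorMixin

    class MeanRegressor(RegressorMixin, BaseEstimator):
        # hypothetical estimator, used only to show the n_features_in_ convention
        def fit(self, X, y):
            X = np.asarray(X)
            self.mean_ = float(np.mean(y))
            self.n_features_in_ = X.shape[1]  # what the checks assert on
            return self

        def predict(self, X):
            return np.full(np.asarray(X).shape[0], self.mean_)

    est = MeanRegressor().fit(np.random.rand(10, 3), np.random.rand(10))
    assert est.n_features_in_ == 3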
@@ -491,14 +492,19 @@ def fit(self, X, y):
             raise ValueError(f"penalty should be either 'l1' or 'none', got {self.penalty}")

+        self.sensitive_col_idx_ = self.sensitive_cols
+
         if isinstance(X, pd.DataFrame):
             self.sensitive_col_idx_ = [i for i, name in enumerate(X.columns) if name in self.sensitive_cols]
         X, y = check_X_y(X, y, accept_large_sparse=False)

         sensitive = X[:, self.sensitive_col_idx_]
         if not self.train_sensitive_cols:
             X = np.delete(X, self.sensitive_col_idx_, axis=1)

         X = self._add_intercept(X)
+        self.n_features_in_ = (
+            X.shape[1] - self.fit_intercept
+        )  # + (1 - int(self.train_sensitive_cols)) * len(self.sensitive_col_idx_)

         column_or_1d(y)
         label_encoder = LabelEncoder().fit(y)

Review comment (on the n_features_in_ computation): This is not necessarily true.
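My reading of the arithmetic the review comment disputes — an illustration with made-up numbers, assuming _add_intercept appends a column only when fit_intercept is true:

    # toy trace of the shape bookkeeping in the hunk above
    n_original = 5            # columns in the raw X
    sensitive_col_idx_ = [0]  # one sensitive column
    train_sensitive_cols = False
    fit_intercept = True

    n_after_drop = n_original - (0 if train_sensitive_cols else len(sensitive_col_idx_))  # 4
    n_after_intercept = n_after_drop + int(fit_intercept)                                 # 5
    n_features_in_ = n_after_intercept - int(fit_intercept)                               # 4

    # Dropped sensitive columns are not counted; the commented-out term would add
    # them back so that n_features_in_ matched the raw input width instead.
    print(n_features_in_)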
@@ -1017,17 +1023,16 @@ def __init__(

     def _get_objective(self, X, y, sample_weight):
         def imbalanced_loss(params):
-            return 0.5 * np.mean(
-                sample_weight
-                * np.where(X @ params > y, self.overestimation_punishment_factor, 1)
-                * np.square(y - X @ params)
+            return 0.5 * np.average(
+                np.where(X @ params > y, self.overestimation_punishment_factor, 1) * np.square(y - X @ params),
+                weights=sample_weight,
             ) + self._regularized_loss(params)

         def grad_imbalanced_loss(params):
             return (
                 -(sample_weight * np.where(X @ params > y, self.overestimation_punishment_factor, 1) * (y - X @ params))
                 @ X
-                / X.shape[0]
+                / sample_weight.sum()
             ) + self._regularized_grad_loss(params)

         return imbalanced_loss, grad_imbalanced_loss
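The substance of this change, spelled out (my illustration, not from the PR): np.mean(w * v) divides by the number of samples, while np.average(v, weights=w) divides by w.sum(), which is the normalization the sample-weight checks expect; the gradient denominator is changed to match.

    import numpy as np

    w = np.array([1.0, 2.0, 1.0])
    v = np.array([3.0, 4.0, 5.0])

    print(np.mean(w * v))            # sum(w * v) / len(v)   -> 16/3 ≈ 5.33
    print(np.average(v, weights=w))  # sum(w * v) / w.sum()  -> 16/4 = 4.0
    # The two agree only when w.sum() == len(v), e.g. with unit weights.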
@@ -1128,15 +1133,16 @@ def __init__(

     def _get_objective(self, X, y, sample_weight):
         def quantile_loss(params):
-            return np.mean(
-                sample_weight * np.where(X @ params < y, self.quantile, 1 - self.quantile) * np.abs(y - X @ params)
+            return np.average(
+                np.where(X @ params < y, self.quantile, 1 - self.quantile) * np.abs(y - X @ params),
+                weights=sample_weight,
             ) + self._regularized_loss(params)

         def grad_quantile_loss(params):
             return (
                 -(sample_weight * np.where(X @ params < y, self.quantile, 1 - self.quantile) * np.sign(y - X @ params))
                 @ X
-                / X.shape[0]
+                / sample_weight.sum()
             ) + self._regularized_grad_loss(params)

         return quantile_loss, grad_quantile_loss

Review thread on lines +1148 to +1157:

Comment: Hey @Garve, sorry to drag you into this mess. No need for you to read all the changes. TL;DR: in this PR I added tests using scikit-learn's parametrize_with_checks to check for (better) compatibility. As some of the tests were failing, I took a closer look and adjusted the formulas to take sample weights into account.

Reply: If it's really just one solver, I would just not sweat it. We should document it for sure, but we can always patch that later.

Reply: Apparently, for ubuntu and windows 'TNC' is failing as well, but only for […]

Reply: That's the good news! They are both working with numpy 2.0rc.
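A quick numerical sanity check for the reweighted quantile loss and its gradient, with the regularization terms dropped — everything below is my sketch, not PR code:

    import numpy as np
    from scipy.optimize import check_grad

    rng = np.random.default_rng(0)
    X = rng.normal(size=(50, 3))
    y = rng.normal(size=50)
    w = rng.uniform(0.5, 2.0, size=50)
    q = 0.8  # quantile

    def loss(params):
        # weighted pinball loss, mirroring the diff minus the regularization term
        return np.average(np.where(X @ params < y, q, 1 - q) * np.abs(y - X @ params), weights=w)

    def grad(params):
        return -(w * np.where(X @ params < y, q, 1 - q) * np.sign(y - X @ params)) @ X / w.sum()

    # should print a small number (~1e-6), confirming the analytic gradient
    print(check_grad(loss, grad, rng.normal(size=3)))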