
Commit 34cf7b5

[ENH] Adds class_weight to those classifiers that support it. Required for imbalanced datasets. (#1776)

- This PR adds class_weight to those classifiers that support it by sklearn design. class_weight is intended for training on imbalanced datasets.
- Fixes a bug in QUANTClassifier, where the random_state parameter was not passed to the internal estimator.
1 parent ab2ef20 commit 34cf7b5
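
For context, a minimal usage sketch of the new parameter (the toy data and the 90/10 split below are illustrative, not part of the PR):

import numpy as np
from aeon.classification.convolution_based import RocketClassifier

# Imbalanced toy dataset: 90 cases of class 0, 10 cases of class 1.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 1, 50))  # (n_cases, n_channels, n_timepoints)
y = np.array([0] * 90 + [1] * 10)

# "balanced" reweights classes inversely to their frequency, so the
# minority class is not drowned out during training.
clf = RocketClassifier(class_weight="balanced", random_state=0)
clf.fit(X, y)
print(clf.predict(X[:5]))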

File tree

9 files changed: +161 −15 lines changed

aeon/classification/convolution_based/_arsenal.py

Lines changed: 19 additions & 2 deletions
@@ -76,6 +76,17 @@ class Arsenal(BaseClassifier):
         The collections of estimators trained in fit.
     weights_ : list of shape (n_estimators) of float
         Weight of each estimator in the ensemble.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     n_estimators_ : int
         The number of estimators in the ensemble.

@@ -125,6 +136,7 @@ def __init__(
         n_features_per_kernel=4,
         time_limit_in_minutes=0.0,
         contract_max_n_estimators=100,
+        class_weight=None,
         n_jobs=1,
         random_state=None,
     ):

@@ -135,6 +147,7 @@ def __init__(
         self.n_features_per_kernel = n_features_per_kernel
         self.time_limit_in_minutes = time_limit_in_minutes
         self.contract_max_n_estimators = contract_max_n_estimators
+        self.class_weight = class_weight

         self.random_state = random_state
         self.n_jobs = n_jobs

@@ -355,7 +368,9 @@ def _fit_ensemble_estimator(self, rocket, X, y, keep_transformed_data):
         transformed_x = rocket.fit_transform(X)
         scaler = StandardScaler(with_mean=False)
         scaler.fit(transformed_x, y)
-        ridge = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
+        ridge = RidgeClassifierCV(
+            alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+        )
         ridge.fit(scaler.transform(transformed_x), y)
         return [
             make_pipeline(rocket, scaler, ridge),

@@ -380,7 +395,9 @@ def _train_probas_for_estimator(self, Xt, y, idx, rng):

         clf = make_pipeline(
             StandardScaler(with_mean=False),
-            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)),
+            RidgeClassifierCV(
+                alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+            ),
         )
         clf.fit(Xt[idx][subsample], y[subsample])
         preds = clf.predict(Xt[idx][oob])
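
To make the n_samples / (n_classes * np.bincount(y)) formula in the docstring above concrete, a small worked example (illustrative; it uses sklearn's helper, which applies the same rule internally for class_weight="balanced"):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0] * 90 + [1] * 10)  # 90/10 imbalance
classes = np.unique(y)

# n_samples / (n_classes * np.bincount(y)) = 100 / (2 * [90, 10])
manual = len(y) / (len(classes) * np.bincount(y))
print(manual)  # [0.5555... 5.0] -- the rare class is weighted ~9x higher

# sklearn computes the same weights when class_weight="balanced"
print(compute_class_weight("balanced", classes=classes, y=y))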

aeon/classification/convolution_based/_hydra.py

Lines changed: 18 additions & 2 deletions
@@ -25,6 +25,17 @@ class HydraClassifier(BaseClassifier):
         Number of kernels per group.
     n_groups : int, default=64
         Number of groups per dilation.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     n_jobs : int, default=1
         The number of jobs to run in parallel for both `fit` and `predict`.
         ``-1`` means using all processors.

@@ -76,9 +87,12 @@ class HydraClassifier(BaseClassifier):
         "python_dependencies": "torch",
     }

-    def __init__(self, n_kernels=8, n_groups=64, n_jobs=1, random_state=None):
+    def __init__(
+        self, n_kernels=8, n_groups=64, n_jobs=1, class_weight=None, random_state=None
+    ):
         self.n_kernels = n_kernels
         self.n_groups = n_groups
+        self.class_weight = class_weight
         self.n_jobs = n_jobs
         self.random_state = random_state

@@ -95,7 +109,9 @@ def _fit(self, X, y):
         self._clf = make_pipeline(
             transform,
             _SparseScaler(),
-            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)),
+            RidgeClassifierCV(
+                alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+            ),
         )
         self._clf.fit(X, y)
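
The same forwarding pattern recurs across these files: the user-supplied class_weight is handed to the internal RidgeClassifierCV. A standalone sketch of that pattern (toy tabular features, not code from this commit):

import numpy as np
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Note: "balanced_subsample" is specific to the forest-based estimators
# (e.g. the ExtraTreesClassifier in QUANT); RidgeClassifierCV accepts
# None, "balanced", or an explicit dict.
class_weight = "balanced"  # or None, or a dict such as {0: 1.0, 1: 9.0}
clf = make_pipeline(
    StandardScaler(with_mean=False),
    RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), class_weight=class_weight),
)

X = np.random.default_rng(0).normal(size=(100, 20))  # stand-in for transform output
y = np.array([0] * 90 + [1] * 10)
clf.fit(X, y)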

aeon/classification/convolution_based/_mr_hydra.py

Lines changed: 18 additions & 2 deletions
@@ -23,6 +23,17 @@ class MultiRocketHydraClassifier(BaseClassifier):
         Number of kernels per group for the Hydra transform.
     n_groups : int, default=64
         Number of groups per dilation for the Hydra transform.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     n_jobs : int, default=1
         The number of jobs to run in parallel for both `fit` and `predict`.
         ``-1`` means using all processors.

@@ -70,9 +81,12 @@ class MultiRocketHydraClassifier(BaseClassifier):
         "python_dependencies": "torch",
     }

-    def __init__(self, n_kernels=8, n_groups=64, n_jobs=1, random_state=None):
+    def __init__(
+        self, n_kernels=8, n_groups=64, n_jobs=1, class_weight=None, random_state=None
+    ):
         self.n_kernels = n_kernels
         self.n_groups = n_groups
+        self.class_weight = class_weight
         self.n_jobs = n_jobs
         self.random_state = random_state

@@ -101,7 +115,9 @@ def _fit(self, X, y):

         Xt = np.concatenate((Xt_hydra, Xt_multirocket), axis=1)

-        self.classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
+        self.classifier = RidgeClassifierCV(
+            alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+        )
         self.classifier.fit(Xt, y)

         return self

aeon/classification/convolution_based/_rocket_classifier.py

Lines changed: 17 additions & 1 deletion
@@ -45,6 +45,18 @@ class RocketClassifier(BaseClassifier):
     estimator : sklearn compatible classifier or None, default=None
         The estimator used. If None, a RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
         is used.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        Only applies if estimator is None and the default is used.
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     random_state : int, RandomState instance or None, default=None
         If `int`, random_state is the seed used by the random number generator;
         If `RandomState` instance, random_state is the random number generator;

@@ -104,6 +116,7 @@ def __init__(
         rocket_transform="rocket",
         max_dilations_per_kernel=32,
         n_features_per_kernel=4,
+        class_weight=None,
         estimator=None,
         random_state=None,
         n_jobs=1,

@@ -113,6 +126,7 @@ def __init__(
         self.max_dilations_per_kernel = max_dilations_per_kernel
         self.n_features_per_kernel = n_features_per_kernel
         self.random_state = random_state
+        self.class_weight = class_weight
         self.estimator = estimator
         self.n_jobs = n_jobs

@@ -168,7 +182,9 @@ def _fit(self, X, y):
         self._scaler = StandardScaler(with_mean=False)
         self._estimator = _clone_estimator(
             (
-                RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
+                RidgeClassifierCV(
+                    alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+                )
                 if self.estimator is None
                 else self.estimator
             ),
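
Since class_weight here only applies to the default RidgeClassifierCV, a custom estimator needs its own class_weight. A short illustrative sketch:

from sklearn.linear_model import LogisticRegression
from aeon.classification.convolution_based import RocketClassifier

# class_weight on the wrapper configures the default RidgeClassifierCV only.
clf_default = RocketClassifier(class_weight="balanced")

# With a custom estimator, set class_weight on the estimator itself.
clf_custom = RocketClassifier(
    estimator=LogisticRegression(class_weight="balanced", max_iter=1000)
)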

aeon/classification/dictionary_based/_muse.py

Lines changed: 17 additions & 2 deletions
@@ -73,6 +73,17 @@ class MUSE(BaseClassifier):
         If set to True, a LogisticRegression will be trained, which does support
         predict_proba(), yet is slower and typically less accurate. predict_proba() is
         needed for example in Early-Classification like TEASER.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     n_jobs : int, default=1
         The number of jobs to run in parallel for both `fit` and `predict`.
         ``-1`` means using all processors.

@@ -136,6 +147,7 @@ def __init__(
         feature_selection="chi2",
         p_threshold=0.05,
         support_probabilities=False,
+        class_weight=None,
         n_jobs=1,
         random_state=None,
     ):

@@ -160,6 +172,7 @@ def __init__(
         self.n_jobs = n_jobs
         self.support_probabilities = support_probabilities
         self.total_features_count = 0
+        self.class_weight = class_weight
         self.feature_selection = feature_selection

         super().__init__()

@@ -242,13 +255,15 @@ def _fit(self, X, y):

         # Ridge Classifier does not give probabilities
         if not self.support_probabilities:
-            self.clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
+            self.clf = RidgeClassifierCV(
+                alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+            )
         else:
             self.clf = LogisticRegression(
                 max_iter=5000,
                 solver="liblinear",
                 dual=True,
-                # class_weight="balanced",
+                class_weight=self.class_weight,
                 penalty="l2",
                 random_state=self.random_state,
                 n_jobs=self.n_jobs,
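
The change above also replaces the previously commented-out, hard-coded class_weight="balanced" in the LogisticRegression branch with the user-controlled parameter, so class_weight now takes effect in both branches. A short illustrative sketch:

from aeon.classification.dictionary_based import MUSE  # WEASEL behaves the same way

# Default branch: RidgeClassifierCV, which has no predict_proba().
clf = MUSE(class_weight="balanced")

# Probability branch: LogisticRegression, needed e.g. for early classification
# with TEASER; class_weight is now forwarded here as well.
clf_proba = MUSE(support_probabilities=True, class_weight="balanced")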

aeon/classification/dictionary_based/_weasel.py

Lines changed: 17 additions & 2 deletions
@@ -84,6 +84,17 @@ class WEASEL(BaseClassifier):
         If set to True, a LogisticRegression will be trained, which does support
         predict_proba(), yet is slower and typically less accurate. predict_proba() is
         needed for example in Early-Classification like TEASER.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     random_state : int, RandomState instance or None, default=None
         If `int`, random_state is the seed used by the random number generator;
         If `RandomState` instance, random_state is the random number generator;

@@ -136,6 +147,7 @@ def __init__(
         n_jobs=1,
         feature_selection="chi2",
         support_probabilities=False,
+        class_weight=None,
         random_state=None,
     ):
         self.alphabet_size = alphabet_size

@@ -159,6 +171,7 @@ def __init__(
         self.clf = None
         self.n_jobs = n_jobs
         self.support_probabilities = support_probabilities
+        self.class_weight = class_weight
         set_num_threads(n_jobs)
         super().__init__()

@@ -223,13 +236,15 @@ def _fit(self, X, y):

         # Ridge Classifier does not give probabilities
         if not self.support_probabilities:
-            self.clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
+            self.clf = RidgeClassifierCV(
+                alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+            )
         else:
             self.clf = LogisticRegression(
                 max_iter=5000,
                 solver="liblinear",
                 dual=True,
-                # class_weight="balanced",
+                class_weight=self.class_weight,
                 penalty="l2",
                 random_state=self.random_state,
                 n_jobs=self.n_jobs,

aeon/classification/dictionary_based/_weasel_v2.py

Lines changed: 16 additions & 1 deletion
@@ -80,6 +80,17 @@ class WEASEL_V2(BaseClassifier):
     max_feature_count : int, default=30_000
         size of the dictionary - number of words to use - if feature_selection set to
         "chi2" or "random". Else ignored.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     random_state : int or None, default=None
         If `int`, random_state is the seed used by the random number generator;
         If `None`, the random number generator is the `RandomState` instance used

@@ -128,6 +139,7 @@ def __init__(
         feature_selection="chi2_top_k",
         max_feature_count=30_000,
         random_state=None,
+        class_weight=None,
         n_jobs=4,
     ):
         self.norm_options = norm_options

@@ -140,6 +152,7 @@ def __init__(
         self.max_feature_count = max_feature_count
         self.use_first_differences = use_first_differences
         self.feature_selection = feature_selection
+        self.class_weight = class_weight

         self.clf = None
         self.n_jobs = n_jobs

@@ -178,7 +191,9 @@ def _fit(self, X, y):
         words = self.transform.fit_transform(X, y)

         # use RidgeClassifierCV for classification
-        self.clf = RidgeClassifierCV(alphas=np.logspace(-1, 5, 10))
+        self.clf = RidgeClassifierCV(
+            alphas=np.logspace(-1, 5, 10), class_weight=self.class_weight
+        )
         self.clf.fit(words, y)

         if hasattr(self.clf, "best_score_"):
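
Besides the string modes, class_weight also accepts an explicit mapping from class label to weight, e.g. (illustrative):

from aeon.classification.dictionary_based import WEASEL_V2

# Upweight the rare class by hand instead of using "balanced".
clf = WEASEL_V2(class_weight={0: 1.0, 1: 9.0})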

aeon/classification/interval_based/_quant.py

Lines changed: 24 additions & 3 deletions
@@ -36,6 +36,18 @@ class QUANTClassifier(BaseClassifier):
     estimator : sklearn estimator, default=None
         The estimator to use for classification. If None, an ExtraTreesClassifier
         with 200 estimators is used.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        Only applies if estimator is None, and the default ExtraTreesClassifier is used.
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     random_state : int, RandomState instance or None, default=None
         If `int`, random_state is the seed used by the random number generator;
         If `RandomState` instance, random_state is the random number generator;

@@ -75,13 +87,18 @@ class QUANTClassifier(BaseClassifier):
     }

     def __init__(
-        self, interval_depth=6, quantile_divisor=4, estimator=None, random_state=None
+        self,
+        interval_depth=6,
+        quantile_divisor=4,
+        estimator=None,
+        random_state=None,
+        class_weight=None,
     ):
         self.interval_depth = interval_depth
         self.quantile_divisor = quantile_divisor
         self.estimator = estimator
         self.random_state = random_state
-
+        self.class_weight = class_weight
         super().__init__()

     def _fit(self, X, y):

@@ -107,7 +124,11 @@ def _fit(self, X, y):
         self._estimator = _clone_estimator(
             (
                 ExtraTreesClassifier(
-                    n_estimators=200, max_features=0.1, criterion="entropy"
+                    n_estimators=200,
+                    max_features=0.1,
+                    criterion="entropy",
+                    class_weight=self.class_weight,
+                    random_state=self.random_state,
                 )
                 if self.estimator is None
                 else self.estimator
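
The random_state forwarding above is the bug fix mentioned in the description: before it, the seed never reached the internal ExtraTreesClassifier, so two identically-seeded runs could build different forests. A sketch of the now-expected behaviour (toy data is illustrative; QUANT requires torch to be installed):

import numpy as np
from aeon.classification.interval_based import QUANTClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 1, 40))
y = np.array([0] * 50 + [1] * 10)

# With the fix, the seed reaches the internal ExtraTreesClassifier,
# so identically-seeded runs agree.
preds_a = QUANTClassifier(random_state=0, class_weight="balanced").fit(X, y).predict(X)
preds_b = QUANTClassifier(random_state=0, class_weight="balanced").fit(X, y).predict(X)
assert np.array_equal(preds_a, preds_b)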
