
Commit 34cf7b5

[ENH] Adds class_weight to those classifiers that support it. Required for imbalanced datasets. (#1776)

- This PR adds class_weight to those classifiers that support it by sklearn design. class_weight is intended for training on imbalanced datasets.
- Fixes a bug in QUANTClassifier, where the random_state parameter was not passed to the internal estimator.
1 parent ab2ef20 commit 34cf7b5
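
For context, a minimal usage sketch of the new parameter (the toy data and the 90/10 split below are illustrative, not part of the PR):

import numpy as np
from aeon.classification.convolution_based import RocketClassifier

# Imbalanced toy dataset: 90 cases of class 0, 10 cases of class 1.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 1, 50))  # (n_cases, n_channels, n_timepoints)
y = np.array([0] * 90 + [1] * 10)

# "balanced" reweights classes inversely to their frequency, so the
# minority class is not drowned out during training.
clf = RocketClassifier(class_weight="balanced", random_state=0)
clf.fit(X, y)
print(clf.predict(X[:5]))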

File tree

9 files changed: +161 −15 lines changed

aeon/classification/convolution_based/_arsenal.py

Lines changed: 19 additions & 2 deletions
@@ -76,6 +76,17 @@ class Arsenal(BaseClassifier):
         The collections of estimators trained in fit.
     weights_ : list of shape (n_estimators) of float
         Weight of each estimator in the ensemble.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     n_estimators_ : int
         The number of estimators in the ensemble.

@@ -125,6 +136,7 @@ def __init__(
         n_features_per_kernel=4,
         time_limit_in_minutes=0.0,
         contract_max_n_estimators=100,
+        class_weight=None,
         n_jobs=1,
         random_state=None,
     ):

@@ -135,6 +147,7 @@ def __init__(
         self.n_features_per_kernel = n_features_per_kernel
         self.time_limit_in_minutes = time_limit_in_minutes
         self.contract_max_n_estimators = contract_max_n_estimators
+        self.class_weight = class_weight

         self.random_state = random_state
         self.n_jobs = n_jobs

@@ -355,7 +368,9 @@ def _fit_ensemble_estimator(self, rocket, X, y, keep_transformed_data):
         transformed_x = rocket.fit_transform(X)
         scaler = StandardScaler(with_mean=False)
         scaler.fit(transformed_x, y)
-        ridge = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
+        ridge = RidgeClassifierCV(
+            alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+        )
         ridge.fit(scaler.transform(transformed_x), y)
         return [
             make_pipeline(rocket, scaler, ridge),

@@ -380,7 +395,9 @@ def _train_probas_for_estimator(self, Xt, y, idx, rng):

         clf = make_pipeline(
             StandardScaler(with_mean=False),
-            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)),
+            RidgeClassifierCV(
+                alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+            ),
         )
         clf.fit(Xt[idx][subsample], y[subsample])
         preds = clf.predict(Xt[idx][oob])
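
To make the n_samples / (n_classes * np.bincount(y)) formula in the docstring above concrete, a small worked example (illustrative; it uses sklearn's helper, which applies the same rule internally for class_weight="balanced"):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0] * 90 + [1] * 10)  # 90/10 imbalance
classes = np.unique(y)

# n_samples / (n_classes * np.bincount(y)) = 100 / (2 * [90, 10])
manual = len(y) / (len(classes) * np.bincount(y))
print(manual)  # [0.5555... 5.0] -- the rare class is weighted ~9x higher

# sklearn computes the same weights when class_weight="balanced"
print(compute_class_weight("balanced", classes=classes, y=y))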

aeon/classification/convolution_based/_hydra.py

Lines changed: 18 additions & 2 deletions
@@ -25,6 +25,17 @@ class HydraClassifier(BaseClassifier):
         Number of kernels per group.
     n_groups : int, default=64
         Number of groups per dilation.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     n_jobs : int, default=1
         The number of jobs to run in parallel for both `fit` and `predict`.
         ``-1`` means using all processors.

@@ -76,9 +87,12 @@ class HydraClassifier(BaseClassifier):
         "python_dependencies": "torch",
     }

-    def __init__(self, n_kernels=8, n_groups=64, n_jobs=1, random_state=None):
+    def __init__(
+        self, n_kernels=8, n_groups=64, n_jobs=1, class_weight=None, random_state=None
+    ):
         self.n_kernels = n_kernels
         self.n_groups = n_groups
+        self.class_weight = class_weight
         self.n_jobs = n_jobs
         self.random_state = random_state

@@ -95,7 +109,9 @@ def _fit(self, X, y):
         self._clf = make_pipeline(
             transform,
             _SparseScaler(),
-            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)),
+            RidgeClassifierCV(
+                alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+            ),
         )
         self._clf.fit(X, y)
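
The same forwarding pattern recurs across these files: the user-supplied class_weight is handed to the internal RidgeClassifierCV. A standalone sketch of that pattern (toy tabular features, not code from this commit):

import numpy as np
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Note: "balanced_subsample" is specific to the forest-based estimators
# (e.g. the ExtraTreesClassifier in QUANT); RidgeClassifierCV accepts
# None, "balanced", or an explicit dict.
class_weight = "balanced"  # or None, or a dict such as {0: 1.0, 1: 9.0}
clf = make_pipeline(
    StandardScaler(with_mean=False),
    RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), class_weight=class_weight),
)

X = np.random.default_rng(0).normal(size=(100, 20))  # stand-in for transform output
y = np.array([0] * 90 + [1] * 10)
clf.fit(X, y)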

aeon/classification/convolution_based/_mr_hydra.py

Lines changed: 18 additions & 2 deletions
@@ -23,6 +23,17 @@ class MultiRocketHydraClassifier(BaseClassifier):
         Number of kernels per group for the Hydra transform.
     n_groups : int, default=64
         Number of groups per dilation for the Hydra transform.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     n_jobs : int, default=1
         The number of jobs to run in parallel for both `fit` and `predict`.
         ``-1`` means using all processors.

@@ -70,9 +81,12 @@ class MultiRocketHydraClassifier(BaseClassifier):
         "python_dependencies": "torch",
     }

-    def __init__(self, n_kernels=8, n_groups=64, n_jobs=1, random_state=None):
+    def __init__(
+        self, n_kernels=8, n_groups=64, n_jobs=1, class_weight=None, random_state=None
+    ):
         self.n_kernels = n_kernels
         self.n_groups = n_groups
+        self.class_weight = class_weight
         self.n_jobs = n_jobs
         self.random_state = random_state

@@ -101,7 +115,9 @@ def _fit(self, X, y):

         Xt = np.concatenate((Xt_hydra, Xt_multirocket), axis=1)

-        self.classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
+        self.classifier = RidgeClassifierCV(
+            alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+        )
         self.classifier.fit(Xt, y)

         return self

aeon/classification/convolution_based/_rocket_classifier.py

Lines changed: 17 additions & 1 deletion
@@ -45,6 +45,18 @@ class RocketClassifier(BaseClassifier):
     estimator : sklearn compatible classifier or None, default=None
         The estimator used. If None, a RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
         is used.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        Only applies if estimator is None and the default is used.
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     random_state : int, RandomState instance or None, default=None
         If `int`, random_state is the seed used by the random number generator;
         If `RandomState` instance, random_state is the random number generator;

@@ -104,6 +116,7 @@ def __init__(
         rocket_transform="rocket",
         max_dilations_per_kernel=32,
         n_features_per_kernel=4,
+        class_weight=None,
         estimator=None,
         random_state=None,
         n_jobs=1,

@@ -113,6 +126,7 @@ def __init__(
         self.max_dilations_per_kernel = max_dilations_per_kernel
         self.n_features_per_kernel = n_features_per_kernel
         self.random_state = random_state
+        self.class_weight = class_weight
         self.estimator = estimator
         self.n_jobs = n_jobs

@@ -168,7 +182,9 @@ def _fit(self, X, y):
         self._scaler = StandardScaler(with_mean=False)
         self._estimator = _clone_estimator(
             (
-                RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
+                RidgeClassifierCV(
+                    alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+                )
                 if self.estimator is None
                 else self.estimator
             ),
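
Since class_weight here only applies to the default RidgeClassifierCV, a custom estimator needs its own class_weight. A short illustrative sketch:

from sklearn.linear_model import LogisticRegression
from aeon.classification.convolution_based import RocketClassifier

# class_weight on the wrapper configures the default RidgeClassifierCV only.
clf_default = RocketClassifier(class_weight="balanced")

# With a custom estimator, set class_weight on the estimator itself.
clf_custom = RocketClassifier(
    estimator=LogisticRegression(class_weight="balanced", max_iter=1000)
)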

aeon/classification/dictionary_based/_muse.py

Lines changed: 17 additions & 2 deletions
@@ -73,6 +73,17 @@ class MUSE(BaseClassifier):
         If set to True, a LogisticRegression will be trained, which does support
         predict_proba(), yet is slower and typically less accurate. predict_proba() is
         needed for example in Early-Classification like TEASER.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     n_jobs : int, default=1
         The number of jobs to run in parallel for both `fit` and `predict`.
         ``-1`` means using all processors.

@@ -136,6 +147,7 @@ def __init__(
         feature_selection="chi2",
         p_threshold=0.05,
         support_probabilities=False,
+        class_weight=None,
         n_jobs=1,
         random_state=None,
     ):

@@ -160,6 +172,7 @@ def __init__(
         self.n_jobs = n_jobs
         self.support_probabilities = support_probabilities
         self.total_features_count = 0
+        self.class_weight = class_weight
         self.feature_selection = feature_selection

         super().__init__()

@@ -242,13 +255,15 @@ def _fit(self, X, y):

         # Ridge Classifier does not give probabilities
         if not self.support_probabilities:
-            self.clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
+            self.clf = RidgeClassifierCV(
+                alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+            )
         else:
             self.clf = LogisticRegression(
                 max_iter=5000,
                 solver="liblinear",
                 dual=True,
-                # class_weight="balanced",
+                class_weight=self.class_weight,
                 penalty="l2",
                 random_state=self.random_state,
                 n_jobs=self.n_jobs,
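
The change above also replaces the previously commented-out, hard-coded class_weight="balanced" in the LogisticRegression branch with the user-controlled parameter, so class_weight now takes effect in both branches. A short illustrative sketch:

from aeon.classification.dictionary_based import MUSE  # WEASEL behaves the same way

# Default branch: RidgeClassifierCV, which has no predict_proba().
clf = MUSE(class_weight="balanced")

# Probability branch: LogisticRegression, needed e.g. for early classification
# with TEASER; class_weight is now forwarded here as well.
clf_proba = MUSE(support_probabilities=True, class_weight="balanced")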

aeon/classification/dictionary_based/_weasel.py

Lines changed: 17 additions & 2 deletions
@@ -84,6 +84,17 @@ class WEASEL(BaseClassifier):
         If set to True, a LogisticRegression will be trained, which does support
         predict_proba(), yet is slower and typically less accurate. predict_proba() is
         needed for example in Early-Classification like TEASER.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     random_state : int, RandomState instance or None, default=None
         If `int`, random_state is the seed used by the random number generator;
         If `RandomState` instance, random_state is the random number generator;

@@ -136,6 +147,7 @@ def __init__(
         n_jobs=1,
         feature_selection="chi2",
         support_probabilities=False,
+        class_weight=None,
         random_state=None,
     ):
         self.alphabet_size = alphabet_size

@@ -159,6 +171,7 @@ def __init__(
         self.clf = None
         self.n_jobs = n_jobs
         self.support_probabilities = support_probabilities
+        self.class_weight = class_weight
         set_num_threads(n_jobs)
         super().__init__()

@@ -223,13 +236,15 @@ def _fit(self, X, y):

         # Ridge Classifier does not give probabilities
         if not self.support_probabilities:
-            self.clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
+            self.clf = RidgeClassifierCV(
+                alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
+            )
         else:
             self.clf = LogisticRegression(
                 max_iter=5000,
                 solver="liblinear",
                 dual=True,
-                # class_weight="balanced",
+                class_weight=self.class_weight,
                 penalty="l2",
                 random_state=self.random_state,
                 n_jobs=self.n_jobs,

aeon/classification/dictionary_based/_weasel_v2.py

Lines changed: 16 additions & 1 deletion
@@ -80,6 +80,17 @@ class WEASEL_V2(BaseClassifier):
     max_feature_count : int, default=30_000
         size of the dictionary - number of words to use - if feature_selection set to
         "chi2" or "random". Else ignored.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     random_state : int or None, default=None
         If `int`, random_state is the seed used by the random number generator;
         If `None`, the random number generator is the `RandomState` instance used

@@ -128,6 +139,7 @@ def __init__(
         feature_selection="chi2_top_k",
         max_feature_count=30_000,
         random_state=None,
+        class_weight=None,
         n_jobs=4,
     ):
         self.norm_options = norm_options

@@ -140,6 +152,7 @@ def __init__(
         self.max_feature_count = max_feature_count
         self.use_first_differences = use_first_differences
         self.feature_selection = feature_selection
+        self.class_weight = class_weight

         self.clf = None
         self.n_jobs = n_jobs

@@ -178,7 +191,9 @@ def _fit(self, X, y):
         words = self.transform.fit_transform(X, y)

         # use RidgeClassifierCV for classification
-        self.clf = RidgeClassifierCV(alphas=np.logspace(-1, 5, 10))
+        self.clf = RidgeClassifierCV(
+            alphas=np.logspace(-1, 5, 10), class_weight=self.class_weight
+        )
         self.clf.fit(words, y)

         if hasattr(self.clf, "best_score_"):
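
Besides the string modes, class_weight also accepts an explicit mapping from class label to weight, e.g. (illustrative):

from aeon.classification.dictionary_based import WEASEL_V2

# Upweight the rare class by hand instead of using "balanced".
clf = WEASEL_V2(class_weight={0: 1.0, 1: 9.0})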

aeon/classification/interval_based/_quant.py

Lines changed: 24 additions & 3 deletions
@@ -36,6 +36,18 @@ class QUANTClassifier(BaseClassifier):
     estimator : sklearn estimator, default=None
         The estimator to use for classification. If None, an ExtraTreesClassifier
         with 200 estimators is used.
+    class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, default=None
+        Only applies if estimator is None, and the default ExtraTreesClassifier is used.
+        From sklearn documentation:
+        If not given, all classes are supposed to have weight one.
+        The "balanced" mode uses the values of y to automatically adjust weights
+        inversely proportional to class frequencies in the input data as
+        n_samples / (n_classes * np.bincount(y))
+        The "balanced_subsample" mode is the same as "balanced" except that weights
+        are computed based on the bootstrap sample for every tree grown.
+        For multi-output, the weights of each column of y will be multiplied.
+        Note that these weights will be multiplied with sample_weight (passed through
+        the fit method) if sample_weight is specified.
     random_state : int, RandomState instance or None, default=None
         If `int`, random_state is the seed used by the random number generator;
         If `RandomState` instance, random_state is the random number generator;

@@ -75,13 +87,18 @@ class QUANTClassifier(BaseClassifier):
     }

     def __init__(
-        self, interval_depth=6, quantile_divisor=4, estimator=None, random_state=None
+        self,
+        interval_depth=6,
+        quantile_divisor=4,
+        estimator=None,
+        random_state=None,
+        class_weight=None,
     ):
         self.interval_depth = interval_depth
         self.quantile_divisor = quantile_divisor
         self.estimator = estimator
         self.random_state = random_state
-
+        self.class_weight = class_weight
         super().__init__()

     def _fit(self, X, y):

@@ -107,7 +124,11 @@ def _fit(self, X, y):
         self._estimator = _clone_estimator(
             (
                 ExtraTreesClassifier(
-                    n_estimators=200, max_features=0.1, criterion="entropy"
+                    n_estimators=200,
+                    max_features=0.1,
+                    criterion="entropy",
+                    class_weight=self.class_weight,
+                    random_state=self.random_state,
                 )
                 if self.estimator is None
                 else self.estimator
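
The random_state forwarding above is the bug fix mentioned in the description: before it, the seed never reached the internal ExtraTreesClassifier, so two identically-seeded runs could build different forests. A sketch of the now-expected behaviour (toy data is illustrative; QUANT requires torch to be installed):

import numpy as np
from aeon.classification.interval_based import QUANTClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 1, 40))
y = np.array([0] * 50 + [1] * 10)

# With the fix, the seed reaches the internal ExtraTreesClassifier,
# so identically-seeded runs agree.
preds_a = QUANTClassifier(random_state=0, class_weight="balanced").fit(X, y).predict(X)
preds_b = QUANTClassifier(random_state=0, class_weight="balanced").fit(X, y).predict(X)
assert np.array_equal(preds_a, preds_b)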
