
Commit bf264d6

[feat] Add coalescer (#376)
* [fix] Add a dataset check in transform as well, since the test dataset does not require fit
* [test] Migrate the tests from Francisco's PR without modifications
* [fix] Adjust the code so that the tests pass
* [test] Increase the test coverage
1 parent ba9c86a commit bf264d6


13 files changed: +730 additions, -1 deletion


autoPyTorch/configs/greedy_portfolio.json

Lines changed: 16 additions & 0 deletions
@@ -1,5 +1,6 @@
 [{"data_loader:batch_size": 60,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -30,6 +31,7 @@
 "network_backbone:ShapedMLPBackbone:max_dropout": 0.023271935735825866},
 {"data_loader:batch_size": 255,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -63,6 +65,7 @@
 "network_backbone:ShapedResNetBackbone:max_dropout": 0.7662454727603789},
 {"data_loader:batch_size": 165,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -93,6 +96,7 @@
 "network_head:fully_connected:units_layer_1": 128},
 {"data_loader:batch_size": 299,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -124,6 +128,7 @@
 "network_head:fully_connected:units_layer_1": 128},
 {"data_loader:batch_size": 183,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -157,6 +162,7 @@
 "network_backbone:ShapedResNetBackbone:max_dropout": 0.27204101593048097},
 {"data_loader:batch_size": 21,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -185,6 +191,7 @@
 "network_head:fully_connected:units_layer_1": 128},
 {"data_loader:batch_size": 159,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "TruncatedSVD",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -214,6 +221,7 @@
 "network_head:fully_connected:units_layer_1": 128},
 {"data_loader:batch_size": 442,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "TruncatedSVD",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -246,6 +254,7 @@
 "network_head:fully_connected:units_layer_1": 128},
 {"data_loader:batch_size": 140,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "TruncatedSVD",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -278,6 +287,7 @@
 "network_head:fully_connected:units_layer_1": 128},
 {"data_loader:batch_size": 48,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -305,6 +315,7 @@
 "network_head:fully_connected:units_layer_1": 128},
 {"data_loader:batch_size": 168,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -337,6 +348,7 @@
 "network_backbone:ShapedResNetBackbone:max_dropout": 0.8992826006547855},
 {"data_loader:batch_size": 21,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -365,6 +377,7 @@
 "network_head:fully_connected:units_layer_1": 128},
 {"data_loader:batch_size": 163,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -397,6 +410,7 @@
 "network_backbone:ShapedResNetBackbone:max_dropout": 0.6341848343636569},
 {"data_loader:batch_size": 150,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -430,6 +444,7 @@
 "network_backbone:ShapedResNetBackbone:max_dropout": 0.7133813761319248},
 {"data_loader:batch_size": 151,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "TruncatedSVD",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -459,6 +474,7 @@
 "network_head:fully_connected:units_layer_1": 128},
 {"data_loader:batch_size": 42,
 "encoder:__choice__": "OneHotEncoder",
+"coalescer:__choice__": "NoCoalescer",
 "feature_preprocessor:__choice__": "TruncatedSVD",
 "imputer:numerical_strategy": "mean",
 "lr_scheduler:__choice__": "CosineAnnealingLR",
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+from typing import Any, Dict, Optional, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import UniformFloatHyperparameter
+
+import numpy as np
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
+from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
+from autoPyTorch.utils.implementations import MinorityCoalesceTransformer
+
+
+class MinorityCoalescer(BaseCoalescer):
+    """Group together categories whose occurrence is less than a specified min_frac."""
+    def __init__(self, min_frac: float, random_state: np.random.RandomState):
+        super().__init__()
+        self.min_frac = min_frac
+        self.random_state = random_state
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
+        self.check_requirements(X, y)
+        self.preprocessor['categorical'] = MinorityCoalesceTransformer(min_frac=self.min_frac)
+        return self
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, Any]] = None,
+        min_frac: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_frac',
+                                                                        value_range=(1e-4, 0.5),
+                                                                        default_value=1e-2,
+                                                                        ),
+    ) -> ConfigurationSpace:
+
+        cs = ConfigurationSpace()
+        add_hyperparameter(cs, min_frac, UniformFloatHyperparameter)
+        return cs
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'MinorityCoalescer',
+            'name': 'MinorityCoalescer',
+            'handles_sparse': False
+        }
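
The component above only wires a MinorityCoalesceTransformer (imported from autoPyTorch.utils.implementations, which is not shown in this diff) into self.preprocessor['categorical']; the grouping logic itself lives in that transformer. As a rough, self-contained sketch of the idea, not the actual transformer, categories whose relative frequency falls below min_frac are merged into a single bucket:

from collections import Counter
from typing import Hashable, List

# Hedged sketch of minority coalescing; the real MinorityCoalesceTransformer in
# autoPyTorch.utils.implementations may differ in API and edge-case handling.
def coalesce_minority(column: List[Hashable], min_frac: float,
                      other: Hashable = "_rare_") -> List[Hashable]:
    """Replace categories whose relative frequency is below min_frac with one bucket."""
    counts = Counter(column)
    n = len(column)
    rare = {cat for cat, cnt in counts.items() if cnt / n < min_frac}
    return [other if value in rare else value for value in column]

# With min_frac=0.2, the category seen only once in six rows is coalesced:
print(coalesce_minority(["a", "a", "a", "b", "b", "c"], min_frac=0.2))
# -> ['a', 'a', 'a', 'b', 'b', '_rare_']
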
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
+
+
+class NoCoalescer(BaseCoalescer):
+    def __init__(self, random_state: np.random.RandomState):
+        super().__init__()
+        self.random_state = random_state
+        self._processing = False
+
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer:
+        """
+        As no coalescing happens, only check the requirements.
+
+        Args:
+            X (Dict[str, Any]):
+                fit dictionary
+            y (Optional[Any]):
+                Parameter to comply with scikit-learn API. Not used.
+
+        Returns:
+            instance of self
+        """
+        self.check_requirements(X, y)
+
+        return self
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'NoCoalescer',
+            'name': 'NoCoalescer',
+            'handles_sparse': True
+        }
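
NoCoalescer exposes no hyperparameters, so the only new dimension in the configuration space is min_frac from MinorityCoalescer, a uniform float in [1e-4, 0.5] with default 1e-2. A small sketch of inspecting and sampling that space is shown below; the import path is an assumption mirroring the base_coalescer import above and may differ in the actual package layout:

# Sketch only: the module path is assumed, not confirmed by this diff.
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.MinorityCoalescer import (
    MinorityCoalescer,
)

cs = MinorityCoalescer.get_hyperparameter_search_space()
print(cs)                           # single hyperparameter: min_frac, uniform float in [0.0001, 0.5], default 0.01
config = cs.sample_configuration()  # draw one value for min_frac
print(config["min_frac"])
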
