Skip to content

Commit ba9c86a

Browse files
authored
[FIX] Remove redundant categorical imputation (#375)
* remove categorical strategy from simple imputer * fix tests * address comments from eddie * fix flake and mypy error * fix test cases for imputation
1 parent 2601421 commit ba9c86a

File tree

6 files changed

+98
-151
lines changed

6 files changed

+98
-151
lines changed

autoPyTorch/configs/greedy_portfolio.json

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
[{"data_loader:batch_size": 60,
22
"encoder:__choice__": "OneHotEncoder",
33
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
4-
"imputer:categorical_strategy": "most_frequent",
54
"imputer:numerical_strategy": "mean",
65
"lr_scheduler:__choice__": "CosineAnnealingLR",
76
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -32,7 +31,6 @@
3231
{"data_loader:batch_size": 255,
3332
"encoder:__choice__": "OneHotEncoder",
3433
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
35-
"imputer:categorical_strategy": "most_frequent",
3634
"imputer:numerical_strategy": "mean",
3735
"lr_scheduler:__choice__": "CosineAnnealingLR",
3836
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -66,7 +64,6 @@
6664
{"data_loader:batch_size": 165,
6765
"encoder:__choice__": "OneHotEncoder",
6866
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
69-
"imputer:categorical_strategy": "most_frequent",
7067
"imputer:numerical_strategy": "mean",
7168
"lr_scheduler:__choice__": "CosineAnnealingLR",
7269
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -97,7 +94,6 @@
9794
{"data_loader:batch_size": 299,
9895
"encoder:__choice__": "OneHotEncoder",
9996
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
100-
"imputer:categorical_strategy": "most_frequent",
10197
"imputer:numerical_strategy": "mean",
10298
"lr_scheduler:__choice__": "CosineAnnealingLR",
10399
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -129,7 +125,6 @@
129125
{"data_loader:batch_size": 183,
130126
"encoder:__choice__": "OneHotEncoder",
131127
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
132-
"imputer:categorical_strategy": "most_frequent",
133128
"imputer:numerical_strategy": "mean",
134129
"lr_scheduler:__choice__": "CosineAnnealingLR",
135130
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -163,7 +158,6 @@
163158
{"data_loader:batch_size": 21,
164159
"encoder:__choice__": "OneHotEncoder",
165160
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
166-
"imputer:categorical_strategy": "most_frequent",
167161
"imputer:numerical_strategy": "mean",
168162
"lr_scheduler:__choice__": "CosineAnnealingLR",
169163
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -192,7 +186,6 @@
192186
{"data_loader:batch_size": 159,
193187
"encoder:__choice__": "OneHotEncoder",
194188
"feature_preprocessor:__choice__": "TruncatedSVD",
195-
"imputer:categorical_strategy": "most_frequent",
196189
"imputer:numerical_strategy": "mean",
197190
"lr_scheduler:__choice__": "CosineAnnealingLR",
198191
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -222,7 +215,6 @@
222215
{"data_loader:batch_size": 442,
223216
"encoder:__choice__": "OneHotEncoder",
224217
"feature_preprocessor:__choice__": "TruncatedSVD",
225-
"imputer:categorical_strategy": "most_frequent",
226218
"imputer:numerical_strategy": "mean",
227219
"lr_scheduler:__choice__": "CosineAnnealingLR",
228220
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -255,7 +247,6 @@
255247
{"data_loader:batch_size": 140,
256248
"encoder:__choice__": "OneHotEncoder",
257249
"feature_preprocessor:__choice__": "TruncatedSVD",
258-
"imputer:categorical_strategy": "most_frequent",
259250
"imputer:numerical_strategy": "mean",
260251
"lr_scheduler:__choice__": "CosineAnnealingLR",
261252
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -288,7 +279,6 @@
288279
{"data_loader:batch_size": 48,
289280
"encoder:__choice__": "OneHotEncoder",
290281
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
291-
"imputer:categorical_strategy": "most_frequent",
292282
"imputer:numerical_strategy": "mean",
293283
"lr_scheduler:__choice__": "CosineAnnealingLR",
294284
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -316,7 +306,6 @@
316306
{"data_loader:batch_size": 168,
317307
"encoder:__choice__": "OneHotEncoder",
318308
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
319-
"imputer:categorical_strategy": "most_frequent",
320309
"imputer:numerical_strategy": "mean",
321310
"lr_scheduler:__choice__": "CosineAnnealingLR",
322311
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -349,7 +338,6 @@
349338
{"data_loader:batch_size": 21,
350339
"encoder:__choice__": "OneHotEncoder",
351340
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
352-
"imputer:categorical_strategy": "most_frequent",
353341
"imputer:numerical_strategy": "mean",
354342
"lr_scheduler:__choice__": "CosineAnnealingLR",
355343
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -378,7 +366,6 @@
378366
{"data_loader:batch_size": 163,
379367
"encoder:__choice__": "OneHotEncoder",
380368
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
381-
"imputer:categorical_strategy": "most_frequent",
382369
"imputer:numerical_strategy": "mean",
383370
"lr_scheduler:__choice__": "CosineAnnealingLR",
384371
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -411,7 +398,6 @@
411398
{"data_loader:batch_size": 150,
412399
"encoder:__choice__": "OneHotEncoder",
413400
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
414-
"imputer:categorical_strategy": "most_frequent",
415401
"imputer:numerical_strategy": "mean",
416402
"lr_scheduler:__choice__": "CosineAnnealingLR",
417403
"network_backbone:__choice__": "ShapedResNetBackbone",
@@ -445,7 +431,6 @@
445431
{"data_loader:batch_size": 151,
446432
"encoder:__choice__": "OneHotEncoder",
447433
"feature_preprocessor:__choice__": "TruncatedSVD",
448-
"imputer:categorical_strategy": "most_frequent",
449434
"imputer:numerical_strategy": "mean",
450435
"lr_scheduler:__choice__": "CosineAnnealingLR",
451436
"network_backbone:__choice__": "ShapedMLPBackbone",
@@ -475,7 +460,6 @@
475460
{"data_loader:batch_size": 42,
476461
"encoder:__choice__": "OneHotEncoder",
477462
"feature_preprocessor:__choice__": "TruncatedSVD",
478-
"imputer:categorical_strategy": "most_frequent",
479463
"imputer:numerical_strategy": "mean",
480464
"lr_scheduler:__choice__": "CosineAnnealingLR",
481465
"network_backbone:__choice__": "ShapedResNetBackbone",

autoPyTorch/optimizer/smbo.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,8 +246,11 @@ def __init__(self,
246246

247247
self.initial_configurations: Optional[List[Configuration]] = None
248248
if portfolio_selection is not None:
249-
self.initial_configurations = read_return_initial_configurations(config_space=config_space,
250-
portfolio_selection=portfolio_selection)
249+
initial_configurations = read_return_initial_configurations(config_space=config_space,
250+
portfolio_selection=portfolio_selection)
251+
# in case we don't have any valid configuration from the portfolio
252+
self.initial_configurations = initial_configurations \
253+
if len(initial_configurations) > 0 else None
251254

252255
def reset_data_manager(self) -> None:
253256
if self.datamanager is not None:

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
from typing import Any, Dict, List, Optional, Union
1+
from typing import Any, Dict, List, Optional, Tuple, Union
22

33
import numpy as np
44

5+
from sklearn.base import BaseEstimator
56
from sklearn.compose import ColumnTransformer
67
from sklearn.pipeline import make_pipeline
78

@@ -48,18 +49,25 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
4849
"TabularColumnTransformer": an instance of self
4950
"""
5051
self.check_requirements(X, y)
51-
numerical_pipeline = 'drop'
52-
categorical_pipeline = 'drop'
5352

5453
preprocessors = get_tabular_preprocessers(X)
55-
if len(X['dataset_properties']['numerical_columns']):
54+
column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
55+
if len(preprocessors['numerical']) > 0:
5656
numerical_pipeline = make_pipeline(*preprocessors['numerical'])
57-
if len(X['dataset_properties']['categorical_columns']):
57+
column_transformers.append(
58+
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
59+
)
60+
if len(preprocessors['categorical']) > 0:
5861
categorical_pipeline = make_pipeline(*preprocessors['categorical'])
59-
60-
self.preprocessor = ColumnTransformer([
61-
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),
62-
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])],
62+
column_transformers.append(
63+
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
64+
)
65+
66+
# in case the preprocessing steps are disabled
67+
# i.e., NoEncoder for categorical, we want to
68+
# let the data in categorical columns pass through
69+
self.preprocessor = ColumnTransformer(
70+
column_transformers,
6371
remainder='passthrough'
6472
)
6573

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py

Lines changed: 10 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -13,70 +13,42 @@
1313

1414

1515
class SimpleImputer(BaseImputer):
16-
"""An imputer for categorical and numerical columns
17-
18-
Impute missing values for categorical columns with 'constant_!missing!'
19-
20-
Note:
21-
In case of numpy data, the constant value is set to -1, under the assumption
22-
that categorical data is fit with an Ordinal Scaler.
16+
"""
17+
An imputer for numerical columns
2318
2419
Attributes:
2520
random_state (Optional[np.random.RandomState]):
2621
The random state to use for the imputer.
2722
numerical_strategy (str: default='mean'):
2823
The strategy to use for imputing numerical columns.
2924
Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
30-
categorical_strategy (str: default='most_frequent')
31-
The strategy to use for imputing categorical columns.
32-
Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
3325
"""
3426

3527
def __init__(
3628
self,
3729
random_state: Optional[np.random.RandomState] = None,
3830
numerical_strategy: str = 'mean',
39-
categorical_strategy: str = 'most_frequent'
4031
):
41-
"""
42-
Note:
43-
'constant' as numerical_strategy uses 0 as the default fill_value while
44-
'constant_!missing!' uses a fill_value of -1.
45-
This behaviour should probably be fixed.
46-
"""
4732
super().__init__()
4833
self.random_state = random_state
4934
self.numerical_strategy = numerical_strategy
50-
self.categorical_strategy = categorical_strategy
5135

5236
def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
53-
""" Fits the underlying model and returns the transformed array.
37+
"""
38+
Builds the preprocessor based on the given fit dictionary 'X'.
5439
5540
Args:
56-
X (np.ndarray):
57-
The input features to fit on
58-
y (Optional[np.ndarray]):
59-
The labels for the input features `X`
41+
X (Dict[str, Any]):
42+
The fit dictionary
43+
y (Optional[Any]):
44+
Not Used -- to comply with API
6045
6146
Returns:
62-
SimpleImputer:
63-
returns self
47+
self:
48+
returns an instance of self.
6449
"""
6550
self.check_requirements(X, y)
6651

67-
# Choose an imputer for any categorical columns
68-
categorical_columns = X['dataset_properties']['categorical_columns']
69-
70-
if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
71-
if self.categorical_strategy == 'constant_!missing!':
72-
# Train data is numpy as of this point, where an Ordinal Encoding is used
73-
# for categoricals. Only Numbers are allowed for `fill_value`
74-
imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
75-
self.preprocessor['categorical'] = imputer
76-
else:
77-
imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
78-
self.preprocessor['categorical'] = imputer
79-
8052
# Choose an imputer for any numerical columns
8153
numerical_columns = X['dataset_properties']['numerical_columns']
8254

@@ -98,11 +70,6 @@ def get_hyperparameter_search_space(
9870
value_range=("mean", "median", "most_frequent", "constant_zero"),
9971
default_value="mean",
10072
),
101-
categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
102-
hyperparameter='categorical_strategy',
103-
value_range=("most_frequent", "constant_!missing!"),
104-
default_value="most_frequent"
105-
)
10673
) -> ConfigurationSpace:
10774
"""Get the hyperparameter search space for the SimpleImputer
10875
@@ -112,8 +79,6 @@ def get_hyperparameter_search_space(
11279
Note: Not actually Optional, just adhering to its supertype
11380
numerical_strategy (HyperparameterSearchSpace: default = ...)
11481
The strategy to use for numerical imputation
115-
caterogical_strategy (HyperparameterSearchSpace: default = ...)
116-
The strategy to use for categorical imputation
11782
11883
Returns:
11984
ConfigurationSpace
@@ -132,12 +97,6 @@ def get_hyperparameter_search_space(
13297
):
13398
add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)
13499

135-
if (
136-
isinstance(dataset_properties['categorical_columns'], List)
137-
and len(dataset_properties['categorical_columns'])
138-
):
139-
add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)
140-
141100
return cs
142101

143102
@staticmethod

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent):
1414
def __init__(self) -> None:
1515
super().__init__()
1616
self.add_fit_requirements([
17-
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
18-
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)])
17+
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])
1918

2019
def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
2120
"""
@@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
2625
Returns:
2726
(Dict[str, Any]): the updated 'X' dictionary
2827
"""
29-
if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
28+
if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0:
3029
raise ValueError("cant call transform on {} without fitting first."
3130
.format(self.__class__.__name__))
3231
X.update({'imputer': self.preprocessor})

0 commit comments

Comments
 (0)