
Commit 80cda62

add remaining preprocessors

1 parent a5449db

18 files changed, +638 -87 lines

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py

Lines changed: 7 additions & 1 deletion
@@ -70,7 +70,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
         else:
             X_train = X['backend'].load_datamanager().train_tensors[0]
 
-        self.preprocessor.fit(X_train)
+        if 'y_train' in X:
+            y_train = subsampler(X['y_train'], X['train_indices'])
+        else:
+            y_train = X['backend'].load_datamanager().train_tensors[1]
+
+        self.preprocessor.fit(X_train, y=y_train)
+
         return self
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
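
Note on the change above: the preprocessor can now contain supervised components, so the column transformer forwards targets at fit time; the new y_train branch mirrors the existing X_train one (subsampled by train_indices when a split is present, loaded from the datamanager otherwise). A minimal standalone sketch (plain scikit-learn, not autoPyTorch code) of why y is needed:

    # Supervised preprocessors such as SelectFromModel fit an inner estimator
    # on (X, y); fitting without targets would fail once such a component
    # sits inside the column transformer.
    import numpy as np
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel

    rng = np.random.RandomState(0)
    X_train = rng.rand(100, 8)
    y_train = (X_train[:, 0] + X_train[:, 3] > 1.0).astype(int)

    selector = SelectFromModel(ExtraTreesClassifier(n_estimators=10, random_state=0),
                               threshold='mean', prefit=False)
    selector.fit(X_train, y=y_train)  # y is required by the inner estimator
    print(selector.get_support())     # boolean mask of the kept columns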
New file

Lines changed: 151 additions & 0 deletions

@@ -0,0 +1,151 @@
+from functools import partial
+from math import ceil, floor
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+    UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
+)
+
+import numpy as np
+
+from sklearn.ensemble import ExtraTreesClassifier
+from sklearn.feature_selection import SelectFromModel
+from sklearn.base import BaseEstimator
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, check_none, subsampler
+
+
+class ExtraTreesPreprocessorClassification(autoPyTorchFeaturePreprocessingComponent):
+    def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
+                 criterion: str = "gini", max_features: float = 0.5,
+                 max_depth: Optional[Union[str, int]] = 5, min_samples_split: int = 2,
+                 min_samples_leaf: int = 1, min_weight_fraction_leaf: float = 0,
+                 max_leaf_nodes: Optional[Union[str, int]] = "none",
+                 min_impurity_decrease: float = 0, oob_score=False,
+                 verbose=0,
+                 random_state: Optional[np.random.RandomState] = None):
+        self.bootstrap = bootstrap
+        self.n_estimators = n_estimators
+        if criterion not in ("gini", "entropy"):
+            raise ValueError("'criterion' is not in ('gini', 'entropy'): "
+                             "%s" % criterion)
+        self.criterion = criterion
+        self.max_features = max_features
+        self.min_impurity_decrease = min_impurity_decrease
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_leaf_nodes = max_leaf_nodes
+        self.oob_score = oob_score
+        self.verbose = verbose
+
+        super().__init__(random_state=random_state)
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+
+        if check_none(self.max_leaf_nodes):
+            self.max_leaf_nodes = None
+        else:
+            self.max_leaf_nodes = int(self.max_leaf_nodes)
+
+        if check_none(self.max_depth):
+            self.max_depth = None
+        else:
+            self.max_depth = int(self.max_depth)
+
+        # TODO: add class_weights
+        estimator = ExtraTreesClassifier(
+            n_estimators=self.n_estimators,
+            criterion=self.criterion,
+            max_depth=self.max_depth,
+            min_samples_split=self.min_samples_split,
+            min_samples_leaf=self.min_samples_leaf,
+            bootstrap=self.bootstrap,
+            max_features=self.max_features,
+            max_leaf_nodes=self.max_leaf_nodes,
+            min_impurity_decrease=self.min_impurity_decrease,
+            oob_score=self.oob_score,
+            verbose=self.verbose,
+            random_state=self.random_state,
+        )
+
+        self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
+                                                         threshold='mean',
+                                                         prefit=False)
+        return self
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap',
+                                                                         value_range=(True, False),
+                                                                         default_value=True,
+                                                                         ),
+        n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators',
+                                                                            value_range=(10, 100),
+                                                                            default_value=10,
+                                                                            ),
+        max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth',
+                                                                         value_range=("none",),
+                                                                         default_value="none",
+                                                                         ),
+        max_features: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_features',
+                                                                            value_range=(0, 1),
+                                                                            default_value=0.5,
+                                                                            ),
+        min_impurity_decrease: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_impurity_decrease',
+                                                                                     value_range=(0,),
+                                                                                     default_value=0,
+                                                                                     ),
+        criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion',
+                                                                         value_range=("gini", "entropy"),
+                                                                         default_value="gini",
+                                                                         ),
+        min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split',
+                                                                                 value_range=(2, 20),
+                                                                                 default_value=2,
+                                                                                 ),
+        min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf',
+                                                                                value_range=(1, 20),
+                                                                                default_value=1,
+                                                                                ),
+        min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter='min_weight_fraction_leaf',
+            value_range=(0,),
+            default_value=0),
+        max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes',
+                                                                              value_range=("none",),
+                                                                              default_value="none",
+                                                                              ),
+    ) -> ConfigurationSpace:
+
+        cs = ConfigurationSpace()
+        add_hyperparameter(cs, bootstrap, CategoricalHyperparameter)
+        add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, max_features, UniformFloatHyperparameter)
+        add_hyperparameter(cs, min_impurity_decrease, UniformFloatHyperparameter)
+        add_hyperparameter(cs, criterion, CategoricalHyperparameter)
+        add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter)
+        add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter)
+
+        return cs
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]:
+        return {'shortname': 'ETC',
+                'name': 'Extra Trees Classifier Preprocessing',
+                'handles_sparse': True,
+                'handles_regression': False,
+                'handles_classification': True
+                }
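
How the component above selects features, as a hedged standalone sketch (plain scikit-learn; nothing here is autoPyTorch code): with threshold='mean', SelectFromModel keeps the columns whose importances from the fitted ExtraTreesClassifier are at least the mean importance.

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel

    # Synthetic data where only a few columns are informative.
    X, y = make_classification(n_samples=200, n_features=10, n_informative=3,
                               random_state=0)
    sfm = SelectFromModel(ExtraTreesClassifier(n_estimators=10, random_state=0),
                          threshold='mean', prefit=False)
    X_reduced = sfm.fit_transform(X, y)

    importances = sfm.estimator_.feature_importances_
    print(sfm.get_support())  # equivalent to importances >= importances.mean()
    print(X_reduced.shape)    # (200, k) with k < 10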
New file

Lines changed: 155 additions & 0 deletions

@@ -0,0 +1,155 @@
+from functools import partial
+from math import ceil, floor
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+    UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
+)
+
+import numpy as np
+
+from sklearn.ensemble import ExtraTreesRegressor
+from sklearn.feature_selection import SelectFromModel
+from sklearn.base import BaseEstimator
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, check_none
+
+
+class ExtraTreesPreprocessorRegression(autoPyTorchFeaturePreprocessingComponent):
+    def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
+                 criterion: str = "mse", max_features: float = 1,
+                 max_depth: Optional[Union[str, int]] = 5, min_samples_split: int = 2,
+                 min_samples_leaf: int = 1, min_weight_fraction_leaf: float = 0,
+                 max_leaf_nodes: Optional[Union[str, int]] = "none",
+                 oob_score=False, verbose=0,
+                 random_state: Optional[np.random.RandomState] = None):
+        self.bootstrap = bootstrap
+        self.n_estimators = n_estimators
+        if criterion not in ('mse', 'friedman_mse', 'mae'):
+            raise ValueError("'criterion' is not in ('mse', 'friedman_mse', 'mae'): "
+                             "%s" % criterion)
+        self.criterion = criterion
+        self.max_features = max_features
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_leaf_nodes = max_leaf_nodes
+        self.oob_score = oob_score
+        self.verbose = verbose
+
+        super().__init__(random_state=random_state)
+
+        self.add_fit_requirements([
+            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+
+        self.check_requirements(X, y)
+
+        if check_none(self.max_leaf_nodes):
+            self.max_leaf_nodes = None
+        else:
+            self.max_leaf_nodes = int(self.max_leaf_nodes)
+
+        if check_none(self.max_depth):
+            self.max_depth = None
+        else:
+            self.max_depth = int(self.max_depth)
+
+        num_features = len(X['dataset_properties']['numerical_columns'])
+        max_features = int(
+            float(self.max_features) * (np.log(num_features) + 1))
+        # Use at most half of the features
+        max_features = max(1, min(int(num_features / 2), max_features))
+
+        # TODO: add class_weights
+        estimator = ExtraTreesRegressor(
+            n_estimators=self.n_estimators,
+            criterion=self.criterion,
+            max_depth=self.max_depth,
+            min_samples_split=self.min_samples_split,
+            min_samples_leaf=self.min_samples_leaf,
+            bootstrap=self.bootstrap,
+            max_features=self.max_features,
+            max_leaf_nodes=self.max_leaf_nodes,
+            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
+            oob_score=self.oob_score,
+            verbose=self.verbose,
+            random_state=self.random_state,
+        )
+
+        self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
+                                                         threshold='mean',
+                                                         prefit=False)
+        return self
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
+        bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap',
+                                                                         value_range=(True, False),
+                                                                         default_value=True,
+                                                                         ),
+        n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators',
+                                                                            value_range=(100,),
+                                                                            default_value=100,
+                                                                            ),
+        max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth',
+                                                                         value_range=("none",),
+                                                                         default_value="none",
+                                                                         ),
+        max_features: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_features',
+                                                                            value_range=(0.1, 1),
+                                                                            default_value=1,
+                                                                            ),
+        criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion',
+                                                                         value_range=('mse', 'friedman_mse', 'mae'),
+                                                                         default_value="mse",
+                                                                         ),
+        min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split',
+                                                                                 value_range=(2, 20),
+                                                                                 default_value=2,
+                                                                                 ),
+        min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf',
+                                                                                value_range=(1, 20),
+                                                                                default_value=1,
+                                                                                ),
+        min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(
+            hyperparameter='min_weight_fraction_leaf',
+            value_range=(0,),
+            default_value=0),
+        max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes',
+                                                                              value_range=("none",),
+                                                                              default_value="none",
+                                                                              ),
+    ) -> ConfigurationSpace:
+
+        cs = ConfigurationSpace()
+        add_hyperparameter(cs, bootstrap, CategoricalHyperparameter)
+        add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, max_features, UniformFloatHyperparameter)
+        add_hyperparameter(cs, criterion, CategoricalHyperparameter)
+        add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter)
+        add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter)
+        add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter)
+
+        return cs
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]:
+        return {'shortname': 'ETR',
+                'name': 'Extra Trees Regressor Preprocessing',
+                'handles_sparse': True,
+                'handles_regression': True,
+                'handles_classification': False
+                }
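
The fit method above derives an absolute feature count from the max_features fraction, clamping it to at most half of the numerical columns. (As written, the clamped value lands in a local variable while the estimator is constructed with self.max_features.) A standalone sketch of the mapping, assuming only NumPy:

    import numpy as np

    def scaled_max_features(fraction: float, num_features: int) -> int:
        # int(fraction * (log(n) + 1)), clamped to [1, n // 2], as in the diff
        raw = int(float(fraction) * (np.log(num_features) + 1))
        return max(1, min(int(num_features / 2), raw))

    for n in (4, 20, 100):
        print(n, scaled_max_features(1.0, n))  # 4 -> 2, 20 -> 3, 100 -> 5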

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FastICA.py

Lines changed: 4 additions & 5 deletions
@@ -1,5 +1,5 @@
 from math import ceil, floor
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional
 
 from ConfigSpace.conditions import EqualsCondition
 from ConfigSpace.configuration_space import ConfigurationSpace
@@ -10,8 +10,8 @@
 
 import numpy as np
 
-from sklearn.decomposition import FastICA
 from sklearn.base import BaseEstimator
+from sklearn.decomposition import FastICA as SklearnFastICA
 
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
@@ -34,7 +34,7 @@ def __init__(self, n_components: Optional[int] = None,
 
     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
 
-        self.preprocessor['numerical'] = FastICA(
+        self.preprocessor['numerical'] = SklearnFastICA(
             n_components=self.n_components, algorithm=self.algorithm,
             fun=self.fun, whiten=self.whiten, random_state=self.random_state)
 
@@ -57,7 +57,7 @@ def get_hyperparameter_search_space(
                                                                    ),
         fun: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='fun',
                                                                    value_range=('logcosh', 'exp', 'cube'),
-                                                                   default_value='logcash',
+                                                                   default_value='logcosh',
                                                                    ),
     ) -> ConfigurationSpace:
         if dataset_properties is not None:
@@ -92,7 +92,6 @@ def get_hyperparameter_search_space(
 
         return cs
 
-
     @staticmethod
     def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]:
         return {'shortname': 'FastICA',
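
The import alias introduced above avoids a name clash: this module's own component class is also called FastICA, so sklearn's transformer must be imported under a different name to stay reachable inside fit. A minimal sketch of the collision (the class body is illustrative, not the real component):

    from sklearn.decomposition import FastICA as SklearnFastICA

    class FastICA:
        """Stand-in for the component of the same name."""

        def fit(self, X, y=None):
            # Without the alias, the bare name FastICA would resolve to
            # this class rather than to sklearn.decomposition.FastICA.
            self.preprocessor = SklearnFastICA(n_components=2)
            return self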
