Skip to content

Commit ce78f89

Browse files
authored
[ADD] Robustly refit models in final ensemble in parallel (#471)
* add parallel model runner and update running traditional classifiers
* update pipeline config to pipeline options
* working refit function
* fix mypy and flake
* suggestions from review
* fix mypy and flake
* suggestions from review
* finish documentation
* fix tests
* add test for parallel model runner
* fix flake
* fix tests
* fix traditional prediction for refit
* suggestions from review
* add warning for failed processing of results
* remove unnecessary change
* update autopytorch version number
* update autopytorch version number and the example file
1 parent d160903 commit ce78f89

23 files changed

+909
-276
lines changed

autoPyTorch/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"""Version information."""
22

33
# The following line *must* be the last in the module, exactly as formatted:
4-
__version__ = "0.2"
4+
__version__ = "0.2.1"

autoPyTorch/api/base_task.py

Lines changed: 262 additions & 157 deletions
Large diffs are not rendered by default.

autoPyTorch/ensemble/abstract_ensemble.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
class AbstractEnsemble(object):
1010
__metaclass__ = ABCMeta
1111

12+
def __init__(self):
13+
self.identifiers_: List[Tuple[int, int, float]] = []
14+
1215
@abstractmethod
1316
def fit(
1417
self,
@@ -76,3 +79,12 @@ def get_validation_performance(self) -> float:
7679
Returns:
7780
Score
7881
"""
82+
83+
def update_identifiers(
84+
self,
85+
replace_identifiers_mapping: Dict[Tuple[int, int, float], Tuple[int, int, float]]
86+
) -> None:
87+
identifiers = self.identifiers_.copy()
88+
for i, identifier in enumerate(self.identifiers_):
89+
identifiers[i] = replace_identifiers_mapping.get(identifier, identifier)
90+
self.identifiers_ = identifiers

autoPyTorch/evaluation/abstract_evaluator.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,8 @@ def get_additional_run_info(self) -> Dict[str, Any]:
195195
Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/estimator_configs
196196
"""
197197
return {'pipeline_configuration': self.configuration,
198-
'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config()}
198+
'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config(),
199+
'configuration_origin': 'traditional'}
199200

200201
def get_pipeline_representation(self) -> Dict[str, str]:
201202
return self.pipeline.get_pipeline_representation()
@@ -347,7 +348,7 @@ class AbstractEvaluator(object):
347348
348349
An evaluator is an object that:
349350
+ constructs a pipeline (i.e. a classification or regression estimator) for a given
350-
pipeline_config and run settings (budget, seed)
351+
pipeline_options and run settings (budget, seed)
351352
+ Fits and trains this pipeline (TrainEvaluator) or tests a given
352353
configuration (TestEvaluator)
353354
@@ -369,7 +370,7 @@ class AbstractEvaluator(object):
369370
The amount of epochs/time a configuration is allowed to run.
370371
budget_type (str):
371372
The budget type. Currently, only epoch and time are allowed.
372-
pipeline_config (Optional[Dict[str, Any]]):
373+
pipeline_options (Optional[Dict[str, Any]]):
373374
Defines the content of the pipeline being evaluated. For example, it
374375
contains pipeline specific settings like logging name, or whether or not
375376
to use tensorboard.
@@ -430,7 +431,7 @@ def __init__(self, backend: Backend,
430431
budget: float,
431432
configuration: Union[int, str, Configuration],
432433
budget_type: str = None,
433-
pipeline_config: Optional[Dict[str, Any]] = None,
434+
pipeline_options: Optional[Dict[str, Any]] = None,
434435
seed: int = 1,
435436
output_y_hat_optimization: bool = True,
436437
num_run: Optional[int] = None,
@@ -523,10 +524,10 @@ def __init__(self, backend: Backend,
523524
self._init_params = init_params
524525

525526
assert self.pipeline_class is not None, "Could not infer pipeline class"
526-
pipeline_config = pipeline_config if pipeline_config is not None \
527+
pipeline_options = pipeline_options if pipeline_options is not None \
527528
else self.pipeline_class.get_default_pipeline_options()
528-
self.budget_type = pipeline_config['budget_type'] if budget_type is None else budget_type
529-
self.budget = pipeline_config[self.budget_type] if budget == 0 else budget
529+
self.budget_type = pipeline_options['budget_type'] if budget_type is None else budget_type
530+
self.budget = pipeline_options[self.budget_type] if budget == 0 else budget
530531

531532
self.num_run = 0 if num_run is None else num_run
532533

@@ -539,7 +540,7 @@ def __init__(self, backend: Backend,
539540
port=logger_port,
540541
)
541542

542-
self._init_fit_dictionary(logger_port=logger_port, pipeline_config=pipeline_config, metrics_dict=metrics_dict)
543+
self._init_fit_dictionary(logger_port=logger_port, pipeline_options=pipeline_options, metrics_dict=metrics_dict)
543544
self.Y_optimization: Optional[np.ndarray] = None
544545
self.Y_actual_train: Optional[np.ndarray] = None
545546
self.pipelines: Optional[List[BaseEstimator]] = None
@@ -597,7 +598,7 @@ def _init_datamanager_info(
597598
def _init_fit_dictionary(
598599
self,
599600
logger_port: int,
600-
pipeline_config: Dict[str, Any],
601+
pipeline_options: Dict[str, Any],
601602
metrics_dict: Optional[Dict[str, List[str]]] = None,
602603
) -> None:
603604
"""
@@ -608,7 +609,7 @@ def _init_fit_dictionary(
608609
Logging is performed using a socket-server scheme to be robust against many
609610
parallel entities that want to write to the same file. This integer states the
610611
socket port for the communication channel.
611-
pipeline_config (Dict[str, Any]):
612+
pipeline_options (Dict[str, Any]):
612613
Defines the content of the pipeline being evaluated. For example, it
613614
contains pipeline specific settings like logging name, or whether or not
614615
to use tensorboard.
@@ -634,7 +635,7 @@ def _init_fit_dictionary(
634635
'optimize_metric': self.metric.name
635636
})
636637

637-
self.fit_dictionary.update(pipeline_config)
638+
self.fit_dictionary.update(pipeline_options)
638639
# If the budget is epochs, we want to limit that in the fit dictionary
639640
if self.budget_type == 'epochs':
640641
self.fit_dictionary['epochs'] = self.budget
@@ -805,6 +806,11 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float],
805806
if test_loss is not None:
806807
additional_run_info['test_loss'] = test_loss
807808

809+
# Add information to additional info that can be useful for other functionalities
810+
additional_run_info['configuration'] = self.configuration \
811+
if not isinstance(self.configuration, Configuration) else self.configuration.get_dictionary()
812+
additional_run_info['budget'] = self.budget
813+
808814
rval_dict = {'loss': cost,
809815
'additional_run_info': additional_run_info,
810816
'status': status}

autoPyTorch/evaluation/tae.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def __init__(
123123
abort_on_first_run_crash: bool,
124124
pynisher_context: str,
125125
multi_objectives: List[str],
126-
pipeline_config: Optional[Dict[str, Any]] = None,
126+
pipeline_options: Optional[Dict[str, Any]] = None,
127127
initial_num_run: int = 1,
128128
stats: Optional[Stats] = None,
129129
run_obj: str = 'quality',
@@ -198,13 +198,13 @@ def __init__(
198198
self.disable_file_output = disable_file_output
199199
self.init_params = init_params
200200

201-
self.budget_type = pipeline_config['budget_type'] if pipeline_config is not None else budget_type
201+
self.budget_type = pipeline_options['budget_type'] if pipeline_options is not None else budget_type
202202

203-
self.pipeline_config: Dict[str, Union[int, str, float]] = dict()
204-
if pipeline_config is None:
205-
pipeline_config = replace_string_bool_to_bool(json.load(open(
203+
self.pipeline_options: Dict[str, Union[int, str, float]] = dict()
204+
if pipeline_options is None:
205+
pipeline_options = replace_string_bool_to_bool(json.load(open(
206206
os.path.join(os.path.dirname(__file__), '../configs/default_pipeline_options.json'))))
207-
self.pipeline_config.update(pipeline_config)
207+
self.pipeline_options.update(pipeline_options)
208208

209209
self.logger_port = logger_port
210210
if self.logger_port is None:
@@ -225,7 +225,7 @@ def __init__(
225225
def _check_and_get_default_budget(self) -> float:
226226
budget_type_choices_tabular = ('epochs', 'runtime')
227227
budget_choices = {
228-
budget_type: float(self.pipeline_config.get(budget_type, np.inf))
228+
budget_type: float(self.pipeline_options.get(budget_type, np.inf))
229229
for budget_type in budget_type_choices_tabular
230230
}
231231

@@ -234,7 +234,7 @@ def _check_and_get_default_budget(self) -> float:
234234
budget_type_choices = budget_type_choices_tabular + FORECASTING_BUDGET_TYPE
235235

236236
# budget is defined by epochs by default
237-
budget_type = str(self.pipeline_config.get('budget_type', 'epochs'))
237+
budget_type = str(self.pipeline_options.get('budget_type', 'epochs'))
238238
if self.budget_type is not None:
239239
budget_type = self.budget_type
240240

@@ -361,7 +361,7 @@ def run(
361361
init_params=init_params,
362362
budget=budget,
363363
budget_type=self.budget_type,
364-
pipeline_config=self.pipeline_config,
364+
pipeline_options=self.pipeline_options,
365365
logger_port=self.logger_port,
366366
all_supported_metrics=self.all_supported_metrics,
367367
search_space_updates=self.search_space_updates

autoPyTorch/evaluation/test_evaluator.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ class TestEvaluator(AbstractEvaluator):
5151
The amount of epochs/time a configuration is allowed to run.
5252
budget_type (str):
5353
The budget type, which can be epochs or time
54-
pipeline_config (Optional[Dict[str, Any]]):
54+
pipeline_options (Optional[Dict[str, Any]]):
5555
Defines the content of the pipeline being evaluated. For example, it
5656
contains pipeline specific settings like logging name, or whether or not
5757
to use tensorboard.
@@ -113,7 +113,7 @@ def __init__(
113113
budget: float,
114114
configuration: Union[int, str, Configuration],
115115
budget_type: str = None,
116-
pipeline_config: Optional[Dict[str, Any]] = None,
116+
pipeline_options: Optional[Dict[str, Any]] = None,
117117
seed: int = 1,
118118
output_y_hat_optimization: bool = False,
119119
num_run: Optional[int] = None,
@@ -141,7 +141,7 @@ def __init__(
141141
budget_type=budget_type,
142142
logger_port=logger_port,
143143
all_supported_metrics=all_supported_metrics,
144-
pipeline_config=pipeline_config,
144+
pipeline_options=pipeline_options,
145145
search_space_updates=search_space_updates
146146
)
147147

@@ -206,7 +206,7 @@ def eval_test_function(
206206
include: Optional[Dict[str, Any]],
207207
exclude: Optional[Dict[str, Any]],
208208
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
209-
pipeline_config: Optional[Dict[str, Any]] = None,
209+
pipeline_options: Optional[Dict[str, Any]] = None,
210210
budget_type: str = None,
211211
init_params: Optional[Dict[str, Any]] = None,
212212
logger_port: Optional[int] = None,
@@ -230,7 +230,7 @@ def eval_test_function(
230230
budget_type=budget_type,
231231
logger_port=logger_port,
232232
all_supported_metrics=all_supported_metrics,
233-
pipeline_config=pipeline_config,
233+
pipeline_options=pipeline_options,
234234
search_space_updates=search_space_updates)
235235

236236
evaluator.fit_predict_and_loss()

autoPyTorch/evaluation/time_series_forecasting_train_evaluator.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class TimeSeriesForecastingTrainEvaluator(TrainEvaluator):
4040
The amount of epochs/time a configuration is allowed to run.
4141
budget_type (str):
4242
The budget type, which can be epochs or time
43-
pipeline_config (Optional[Dict[str, Any]]):
43+
pipeline_options (Optional[Dict[str, Any]]):
4444
Defines the content of the pipeline being evaluated. For example, it
4545
contains pipeline specific settings like logging name, or whether or not
4646
to use tensorboard.
@@ -106,7 +106,7 @@ def __init__(self, backend: Backend, queue: Queue,
106106
metric: autoPyTorchMetric,
107107
budget: float,
108108
budget_type: str = None,
109-
pipeline_config: Optional[Dict[str, Any]] = None,
109+
pipeline_options: Optional[Dict[str, Any]] = None,
110110
configuration: Optional[Configuration] = None,
111111
seed: int = 1,
112112
output_y_hat_optimization: bool = True,
@@ -138,7 +138,7 @@ def __init__(self, backend: Backend, queue: Queue,
138138
logger_port=logger_port,
139139
keep_models=keep_models,
140140
all_supported_metrics=all_supported_metrics,
141-
pipeline_config=pipeline_config,
141+
pipeline_options=pipeline_options,
142142
search_space_updates=search_space_updates
143143
)
144144
self.datamanager = backend.load_datamanager()
@@ -456,7 +456,7 @@ def forecasting_eval_train_function(
456456
include: Optional[Dict[str, Any]],
457457
exclude: Optional[Dict[str, Any]],
458458
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
459-
pipeline_config: Optional[Dict[str, Any]] = None,
459+
pipeline_options: Optional[Dict[str, Any]] = None,
460460
budget_type: str = None,
461461
init_params: Optional[Dict[str, Any]] = None,
462462
logger_port: Optional[int] = None,
@@ -490,7 +490,7 @@ def forecasting_eval_train_function(
490490
The amount of epochs/time a configuration is allowed to run.
491491
budget_type (str):
492492
The budget type, which can be epochs or time
493-
pipeline_config (Optional[Dict[str, Any]]):
493+
pipeline_options (Optional[Dict[str, Any]]):
494494
Defines the content of the pipeline being evaluated. For example, it
495495
contains pipeline specific settings like logging name, or whether or not
496496
to use tensorboard.
@@ -550,7 +550,7 @@ def forecasting_eval_train_function(
550550
budget_type=budget_type,
551551
logger_port=logger_port,
552552
all_supported_metrics=all_supported_metrics,
553-
pipeline_config=pipeline_config,
553+
pipeline_options=pipeline_options,
554554
search_space_updates=search_space_updates,
555555
max_budget=max_budget,
556556
min_num_test_instances=min_num_test_instances,

autoPyTorch/evaluation/train_evaluator.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ class TrainEvaluator(AbstractEvaluator):
6060
The amount of epochs/time a configuration is allowed to run.
6161
budget_type (str):
6262
The budget type, which can be epochs or time
63-
pipeline_config (Optional[Dict[str, Any]]):
63+
pipeline_options (Optional[Dict[str, Any]]):
6464
Defines the content of the pipeline being evaluated. For example, it
6565
contains pipeline specific settings like logging name, or whether or not
6666
to use tensorboard.
@@ -121,7 +121,7 @@ def __init__(self, backend: Backend, queue: Queue,
121121
budget: float,
122122
configuration: Union[int, str, Configuration],
123123
budget_type: str = None,
124-
pipeline_config: Optional[Dict[str, Any]] = None,
124+
pipeline_options: Optional[Dict[str, Any]] = None,
125125
seed: int = 1,
126126
output_y_hat_optimization: bool = True,
127127
num_run: Optional[int] = None,
@@ -149,7 +149,7 @@ def __init__(self, backend: Backend, queue: Queue,
149149
budget_type=budget_type,
150150
logger_port=logger_port,
151151
all_supported_metrics=all_supported_metrics,
152-
pipeline_config=pipeline_config,
152+
pipeline_options=pipeline_options,
153153
search_space_updates=search_space_updates
154154
)
155155

@@ -420,7 +420,7 @@ def eval_train_function(
420420
include: Optional[Dict[str, Any]],
421421
exclude: Optional[Dict[str, Any]],
422422
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
423-
pipeline_config: Optional[Dict[str, Any]] = None,
423+
pipeline_options: Optional[Dict[str, Any]] = None,
424424
budget_type: str = None,
425425
init_params: Optional[Dict[str, Any]] = None,
426426
logger_port: Optional[int] = None,
@@ -452,7 +452,7 @@ def eval_train_function(
452452
The amount of epochs/time a configuration is allowed to run.
453453
budget_type (str):
454454
The budget type, which can be epochs or time
455-
pipeline_config (Optional[Dict[str, Any]]):
455+
pipeline_options (Optional[Dict[str, Any]]):
456456
Defines the content of the pipeline being evaluated. For example, it
457457
contains pipeline specific settings like logging name, or whether or not
458458
to use tensorboard.
@@ -506,7 +506,7 @@ def eval_train_function(
506506
budget_type=budget_type,
507507
logger_port=logger_port,
508508
all_supported_metrics=all_supported_metrics,
509-
pipeline_config=pipeline_config,
509+
pipeline_options=pipeline_options,
510510
search_space_updates=search_space_updates,
511511
)
512512
evaluator.fit_predict_and_loss()

autoPyTorch/optimizer/smbo.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def __init__(self,
111111
watcher: StopWatch,
112112
n_jobs: int,
113113
dask_client: Optional[dask.distributed.Client],
114-
pipeline_config: Dict[str, Any],
114+
pipeline_options: Dict[str, Any],
115115
start_num_run: int = 1,
116116
seed: int = 1,
117117
resampling_strategy: Union[HoldoutValTypes,
@@ -227,7 +227,7 @@ def __init__(self,
227227
self.backend = backend
228228
self.all_supported_metrics = all_supported_metrics
229229

230-
self.pipeline_config = pipeline_config
230+
self.pipeline_options = pipeline_options
231231
# the configuration space
232232
self.config_space = config_space
233233

@@ -326,7 +326,7 @@ def run_smbo(self, func: Optional[Callable] = None
326326
ta=func,
327327
logger_port=self.logger_port,
328328
all_supported_metrics=self.all_supported_metrics,
329-
pipeline_config=self.pipeline_config,
329+
pipeline_options=self.pipeline_options,
330330
search_space_updates=self.search_space_updates,
331331
pynisher_context=self.pynisher_context,
332332
)
@@ -376,7 +376,7 @@ def run_smbo(self, func: Optional[Callable] = None
376376
)
377377
scenario_dict.update(self.smac_scenario_args)
378378

379-
budget_type = self.pipeline_config['budget_type']
379+
budget_type = self.pipeline_options['budget_type']
380380
if budget_type in FORECASTING_BUDGET_TYPE:
381381
if STRING_TO_TASK_TYPES.get(self.task_type, -1) != TIMESERIES_FORECASTING:
382382
raise ValueError('Forecasting Budget type is only available for forecasting task!')

autoPyTorch/pipeline/components/setup/traditional_ml/base_model.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,7 @@ def __init__(
5252
self.add_fit_requirements([
5353
FitRequirement('X_train', (np.ndarray, list, pd.DataFrame), user_defined=False, dataset_property=False),
5454
FitRequirement('y_train', (np.ndarray, list, pd.Series,), user_defined=False, dataset_property=False),
55-
FitRequirement('train_indices', (np.ndarray, list), user_defined=False, dataset_property=False),
56-
FitRequirement('val_indices', (np.ndarray, list), user_defined=False, dataset_property=False)])
55+
FitRequirement('train_indices', (np.ndarray, list), user_defined=False, dataset_property=False)])
5756

5857
def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent:
5958
"""
@@ -90,8 +89,14 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchSetupComponent:
9089

9190
# train model
9291
blockPrint()
92+
val_indices = X.get('val_indices', None)
93+
X_val = None
94+
y_val = None
95+
if val_indices is not None:
96+
X_val = X['X_train'][val_indices]
97+
y_val = X['y_train'][val_indices]
9398
self.fit_output = self.model.fit(X['X_train'][X['train_indices']], X['y_train'][X['train_indices']],
94-
X['X_train'][X['val_indices']], X['y_train'][X['val_indices']])
99+
X_val, y_val)
95100
enablePrint()
96101

97102
# infer

0 commit comments

Comments
 (0)