1
1
# -*- encoding: utf-8 -*-
2
2
3
- from typing import Optional , Dict , List
3
+ from typing import Optional , Dict , List , Tuple , Union
4
4
5
+ from ConfigSpace .configuration_space import Configuration
5
6
import dask .distributed
6
7
import joblib
7
8
import numpy as np
8
9
from sklearn .base import BaseEstimator , ClassifierMixin , RegressorMixin
9
10
from sklearn .utils .multiclass import type_of_target
11
+ from smac .runhistory .runhistory import RunInfo , RunValue
10
12
13
+ from autosklearn .data .validation import (
14
+ SUPPORTED_FEAT_TYPES ,
15
+ SUPPORTED_TARGET_TYPES ,
16
+ )
17
+ from autosklearn .pipeline .base import BasePipeline
11
18
from autosklearn .automl import AutoMLClassifier , AutoMLRegressor , AutoML
12
19
from autosklearn .metrics import Scorer
13
20
from autosklearn .util .backend import create
@@ -271,8 +278,15 @@ def __init__(
271
278
self .load_models = load_models
272
279
273
280
self .automl_ = None # type: Optional[AutoML]
274
- # n_jobs after conversion to a number (b/c default is None)
281
+
282
+ # Handle the number of jobs and the time for them
275
283
self ._n_jobs = None
284
+ if self .n_jobs is None or self .n_jobs == 1 :
285
+ self ._n_jobs = 1
286
+ elif self .n_jobs == - 1 :
287
+ self ._n_jobs = joblib .cpu_count ()
288
+ else :
289
+ self ._n_jobs = self .n_jobs
276
290
277
291
super ().__init__ ()
278
292
@@ -281,35 +295,24 @@ def __getstate__(self):
281
295
self .dask_client = None
282
296
return self .__dict__
283
297
284
- def build_automl (
285
- self ,
286
- seed : int ,
287
- ensemble_size : int ,
288
- initial_configurations_via_metalearning : int ,
289
- tmp_folder : str ,
290
- output_folder : str ,
291
- smac_scenario_args : Optional [Dict ] = None ,
292
- ):
298
+ def build_automl (self ):
293
299
294
300
backend = create (
295
- temporary_directory = tmp_folder ,
296
- output_directory = output_folder ,
301
+ temporary_directory = self . tmp_folder ,
302
+ output_directory = self . output_folder ,
297
303
delete_tmp_folder_after_terminate = self .delete_tmp_folder_after_terminate ,
298
304
delete_output_folder_after_terminate = self .delete_output_folder_after_terminate ,
299
305
)
300
306
301
- if smac_scenario_args is None :
302
- smac_scenario_args = self .smac_scenario_args
303
-
304
307
automl = self ._get_automl_class ()(
305
308
backend = backend ,
306
309
time_left_for_this_task = self .time_left_for_this_task ,
307
310
per_run_time_limit = self .per_run_time_limit ,
308
- initial_configurations_via_metalearning = initial_configurations_via_metalearning ,
309
- ensemble_size = ensemble_size ,
311
+ initial_configurations_via_metalearning = self . initial_configurations_via_metalearning ,
312
+ ensemble_size = self . ensemble_size ,
310
313
ensemble_nbest = self .ensemble_nbest ,
311
314
max_models_on_disc = self .max_models_on_disc ,
312
- seed = seed ,
315
+ seed = self . seed ,
313
316
memory_limit = self .memory_limit ,
314
317
include_estimators = self .include_estimators ,
315
318
exclude_estimators = self .exclude_estimators ,
@@ -321,7 +324,7 @@ def build_automl(
321
324
dask_client = self .dask_client ,
322
325
get_smac_object_callback = self .get_smac_object_callback ,
323
326
disable_evaluator_output = self .disable_evaluator_output ,
324
- smac_scenario_args = smac_scenario_args ,
327
+ smac_scenario_args = self . smac_scenario_args ,
325
328
logging_config = self .logging_config ,
326
329
metadata_directory = self .metadata_directory ,
327
330
metric = self .metric ,
@@ -332,32 +335,82 @@ def build_automl(
332
335
333
336
def fit(self, **kwargs):
    """Fit the underlying AutoML instance and return this estimator.

    The AutoML backend is created lazily on first use; a default
    per-run time limit is derived when none was provided.  All extra
    keyword arguments are forwarded to ``AutoML.fit``.
    """
    # No explicit cutoff per task: default to a tenth of the total
    # budget, scaled by the number of parallel jobs.
    if self.per_run_time_limit is None:
        self.per_run_time_limit = self._n_jobs * self.time_left_for_this_task // 10

    # Reuse an existing AutoML object when one was already built
    # (e.g. by fit_pipeline/fit_ensemble); otherwise construct it now.
    if self.automl_ is None:
        self.automl_ = self.build_automl()

    self.automl_.fit(load_models=self.load_models, **kwargs)
    return self
360
347
348
def fit_pipeline(
    self,
    X: SUPPORTED_FEAT_TYPES,
    y: SUPPORTED_TARGET_TYPES,
    config: Union[Configuration, Dict[str, Union[str, float, int]]],
    dataset_name: Optional[str] = None,
    X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
    y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
    feat_type: Optional[List[str]] = None,
    *args,
    **kwargs: Dict,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:
    """Fit an individual pipeline configuration and return the result
    to the user.

    The estimator constraints are honored, for example the resampling
    strategy, or memory constraints, unless directly provided to the method.
    By default, this method supports the same signature as fit(), and any extra
    arguments are redirected to the TAE evaluation function, which allows for
    further customization while building a pipeline.

    Any additional argument provided is directly passed to the worker
    exercising the run.

    Parameters
    ----------
    X: array-like, shape = (n_samples, n_features)
        The features used for training
    y: array-like
        The labels used for training
    config: Union[Configuration, Dict[str, Union[str, float, int]]]
        A configuration object used to define the pipeline steps.
        If a dictionary is passed, a configuration is created based on this dictionary.
    dataset_name: Optional[str]
        Name that will be used to tag the Auto-Sklearn run and identify the
        Auto-Sklearn run
    X_test: Optional array-like, shape = (n_samples, n_features)
        If provided, the testing performance will be tracked on these features.
    y_test: Optional array-like
        If provided, the testing performance will be tracked on these labels
    feat_type : list, optional (default=None)
        List of str of `len(X.shape[1])` describing the attribute type.
        Possible types are `Categorical` and `Numerical`. `Categorical`
        attributes will be automatically One-Hot encoded. The values
        used for a categorical attribute must be integers, obtained for
        example by `sklearn.preprocessing.LabelEncoder
        <http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html>`_.

    Returns
    -------
    pipeline: Optional[BasePipeline]
        The fitted pipeline. In case of failure while fitting the pipeline,
        a None is returned.
    run_info: RunInfo
        A named tuple that contains the configuration launched
    run_value: RunValue
        A named tuple that contains the result of the run
    """
    # Build the AutoML backend lazily so this method can be used before
    # a call to fit().
    if self.automl_ is None:
        self.automl_ = self.build_automl()
    return self.automl_.fit_pipeline(X=X, y=y,
                                     dataset_name=dataset_name,
                                     config=config,
                                     feat_type=feat_type,
                                     X_test=X_test, y_test=y_test,
                                     *args, **kwargs)
361
414
def fit_ensemble (self , y , task = None , precision = 32 ,
362
415
dataset_name = None , ensemble_nbest = None ,
363
416
ensemble_size = None ):
@@ -401,17 +454,9 @@ def fit_ensemble(self, y, task=None, precision=32,
401
454
"""
402
455
if self .automl_ is None :
403
456
# Build a dummy automl object to call fit_ensemble
404
- self .automl_ = self .build_automl (
405
- seed = self .seed ,
406
- ensemble_size = (
407
- ensemble_size
408
- if ensemble_size is not None else
409
- self .ensemble_size
410
- ),
411
- initial_configurations_via_metalearning = 0 ,
412
- tmp_folder = self .tmp_folder ,
413
- output_folder = self .output_folder ,
414
- )
457
+ # The ensemble size is honored in the .automl_.fit_ensemble
458
+ # call
459
+ self .automl_ = self .build_automl ()
415
460
self .automl_ .fit_ensemble (
416
461
y = y ,
417
462
task = task ,
@@ -513,8 +558,40 @@ def sprint_statistics(self):
513
558
def _get_automl_class (self ):
514
559
raise NotImplementedError ()
515
560
516
def get_configuration_space(
    self,
    X: SUPPORTED_FEAT_TYPES,
    y: SUPPORTED_TARGET_TYPES,
    X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
    y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
    dataset_name: Optional[str] = None,
):
    """
    Returns the Configuration Space object, from which Auto-Sklearn
    will sample configurations and build pipelines.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        Array with the training features, used to get characteristics like
        data sparsity
    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        Array with the problem labels
    X_test : array-like or sparse matrix of shape = [n_samples, n_features]
        Array with features used for performance estimation
    y_test : array-like, shape = [n_samples] or [n_samples, n_outputs]
        Array with the problem labels for the testing split
    dataset_name: Optional[str]
        A string to tag the Auto-Sklearn run
    """
    # Construct the AutoML backend on demand.
    if self.automl_ is None:
        self.automl_ = self.build_automl()

    # The configuration space only exists after the data has been seen;
    # if it is missing, run a fit restricted to building the space.
    if self.automl_.configuration_space is None:
        return self.automl_.fit(
            X, y,
            X_test=X_test, y_test=y_test,
            dataset_name=dataset_name,
            only_return_configuration_space=True,
        )
    return self.automl_.configuration_space
518
595
519
596
520
597
class AutoSklearnClassifier (AutoSklearnEstimator , ClassifierMixin ):
0 commit comments