@@ -262,13 +262,81 @@ def send_warnings_to_log(message, category, filename, lineno,
262
262
263
263
264
264
class AbstractEvaluator (object ):
265
+ """
266
+ This class defines the interface that pipeline evaluators should follow when
267
+ interacting with SMAC through ExecuteTaFuncWithQueue.
268
+
269
+ An evaluator is an object that:
270
+ + constructs a pipeline (i.e. a classification or regression estimator) for a given
271
+ pipeline_config and run settings (budget, seed)
272
+ + Fits and trains this pipeline (TrainEvaluator) or tests a given
273
+ configuration (TestEvaluator)
274
+
275
+ The provided configuration determines the type of pipeline created. For more
276
+ details, please read the get_pipeline() method.
277
+
278
+ Attributes:
279
+ backend (Backend):
280
+ An object that allows interaction with the disk storage. In particular, allows to
281
+ access the train and test datasets
282
+ queue (Queue):
283
+ Each worker available will instantiate an evaluator, and after completion,
284
+ it will append the result to a multiprocessing queue
285
+ metric (autoPyTorchMetric):
286
+ A scorer object that is able to evaluate how good a pipeline was fit. It
287
+ is a wrapper on top of the actual score method (a wrapper on top of
288
+ scikit-learn accuracy for example) that formats the predictions accordingly.
289
+ budget: (float):
290
+ The amount of epochs/time a configuration is allowed to run.
291
+ budget_type (str):
292
+ The budget type. Currently, only epoch and time are allowed.
293
+ pipeline_config (Optional[Dict[str, Any]]):
294
+ Defines the content of the pipeline being evaluated. For example, it
295
+ contains pipeline specific settings like logging name, or whether or not
296
+ to use tensorboard.
297
+ configuration (Union[int, str, Configuration]):
298
+ Determines the pipeline to be constructed. A dummy estimator is created for
299
+ integer configurations, a traditional machine learning pipeline is created
300
+ for string based configuration, and NAS is performed when a configuration
301
+ object is passed.
302
+ seed (int):
303
+ An integer that allows for reproducibility of results
304
+ output_y_hat_optimization (bool):
305
+ Whether this worker should output the target predictions, so that they are
306
+ stored on disk. Fundamentally, the resampling strategy might shuffle the
307
+ Y_train targets, so we store the split in order to re-use them for ensemble
308
+ selection.
309
+ num_run (Optional[int]):
310
+ An identifier of the current configuration being fit. This number is unique per
311
+ configuration.
312
+ include (Optional[Dict[str, Any]]):
313
+ An optional dictionary to include components of the pipeline steps.
314
+ exclude (Optional[Dict[str, Any]]):
315
+ An optional dictionary to exclude components of the pipeline steps.
316
+ disable_file_output (Union[bool, List[str]]):
317
+ By default, the model, its predictions and other metadata are stored on disk
318
+ for each finished configuration. This argument allows the user to skip
319
+ saving certain file types, for example the model, from being written to disk.
320
+ init_params (Optional[Dict[str, Any]]):
321
+ Optional argument that is passed to each pipeline step. It is the equivalent of
322
+ kwargs for the pipeline steps.
323
+ logger_port (Optional[int]):
324
+ Logging is performed using a socket-server scheme to be robust against many
325
+ parallel entities that want to write to the same file. This integer states the
326
+ socket port for the communication channel.
327
+ If None is provided, the logging.handlers.DEFAULT_TCP_LOGGING_PORT is used.
328
+ all_supported_metrics (bool):
329
+ Whether all supported metrics should be calculated for every configuration.
330
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]):
331
+ An object used to fine tune the hyperparameter search space of the pipeline
332
+ """
265
333
def __init__ (self , backend : Backend ,
266
334
queue : Queue ,
267
335
metric : autoPyTorchMetric ,
268
336
budget : float ,
337
+ configuration : Union [int , str , Configuration ],
269
338
budget_type : str = None ,
270
339
pipeline_config : Optional [Dict [str , Any ]] = None ,
271
- configuration : Optional [Configuration ] = None ,
272
340
seed : int = 1 ,
273
341
output_y_hat_optimization : bool = True ,
274
342
num_run : Optional [int ] = None ,
@@ -408,6 +476,23 @@ def __init__(self, backend: Backend,
408
476
self .logger .debug ("Search space updates :{}" .format (self .search_space_updates ))
409
477
410
478
def _get_pipeline (self ) -> BaseEstimator :
479
+ """
480
+ Implements a pipeline object based on the self.configuration attribute.
481
+ int: A dummy classifier/dummy regressor is created. This estimator serves
482
+ as a baseline model to ignore all models that perform worse than this
483
+ fixed estimator. Also, in the worst case scenario, this is the final
484
+ estimator created (for instance, in case not enough memory was allocated).
485
+ str: A pipeline with traditional classifiers like random forest, SVM, etc is created,
486
+ as the configuration will contain an estimator name defining the configuration
487
+ to use, for example 'RandomForest'
488
+ Configuration: A pipeline object matching this configuration is created. This
489
+ is the case of neural architecture search, where different backbones
490
+ and head can be passed in the form of a configuration object.
491
+
492
+ Returns:
493
+ pipeline (BaseEstimator):
494
+ A scikit-learn compliant pipeline which is not yet fit to the data.
495
+ """
411
496
assert self .pipeline_class is not None , "Can't return pipeline, pipeline_class not initialised"
412
497
if isinstance (self .configuration , int ):
413
498
pipeline = self .pipeline_class (config = self .configuration ,
@@ -436,6 +521,15 @@ def _loss(self, y_true: np.ndarray, y_hat: np.ndarray) -> Dict[str, float]:
436
521
The calculate_loss internally translate a score function to
437
522
a minimization problem
438
523
524
+ Args:
525
+ y_true (np.ndarray):
526
+ The expected labels given by the original dataset
527
+ y_hat (np.ndarray):
528
+ The prediction of the current pipeline being fit
529
+ Returns:
530
+ (Dict[str, float]):
531
+ A dictionary with metric_name -> metric_loss, for every
532
+ supported metric
439
533
"""
440
534
441
535
if isinstance (self .configuration , int ):
@@ -461,7 +555,39 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float],
461
555
* saving the files for the ensembles_statistics
462
556
* generate output for SMAC
463
557
We use it as the signal handler so we can recycle the code for the
464
- normal usecase and when the runsolver kills us here :)"""
558
+ normal usecase and when the runsolver kills us here :)
559
+
560
+ Args:
561
+ loss (Dict[str, float]):
562
+ The optimization loss, calculated on the validation set. This will
563
+ be the cost used in SMAC
564
+ train_loss (Dict[str, float]):
565
+ The train loss, calculated on the train set
566
+ opt_pred (np.ndarray):
567
+ The predictions on the validation set. This validation set is created
568
+ from the resampling strategy
569
+ valid_pred (Optional[np.ndarray]):
570
+ Predictions on a user provided validation set
571
+ test_pred (Optional[np.ndarray]):
572
+ Predictions on a user provided test set
573
+ additional_run_info (Optional[Dict]):
574
+ A dictionary with additional run information, like duration or
575
+ the crash error msg, if any.
576
+ file_output (bool):
577
+ Whether or not this pipeline should output information to disk
578
+ status (StatusType):
579
+ The status of the run, following SMAC StatusType syntax.
580
+
581
+ Returns:
582
+ duration (float):
583
+ The elapsed time of the training of this evaluator
584
+ loss (float):
585
+ The optimization loss of this run
586
+ seed (int):
587
+ The seed used while fitting the pipeline
588
+ additional_info (Dict):
589
+ Additional run information, like train/test loss
590
+ """
465
591
466
592
self .duration = time .time () - self .starttime
467
593
@@ -508,6 +634,25 @@ def calculate_auxiliary_losses(
508
634
Y_valid_pred : np .ndarray ,
509
635
Y_test_pred : np .ndarray ,
510
636
) -> Tuple [Optional [float ], Optional [float ]]:
637
+ """
638
+ A helper function to calculate the performance estimate of the
639
+ current pipeline in the user provided validation/test set.
640
+
641
+ Args:
642
+ Y_valid_pred (np.ndarray):
643
+ predictions on a validation set provided by the user,
644
+ matching self.y_valid
645
+ Y_test_pred (np.ndarray):
646
+ predictions on a test set provided by the user,
647
+ matching self.y_test
648
+ Returns:
649
+ validation_loss (Optional[float]):
650
+ The validation loss under the optimization metric
651
+ stored in self.metric
652
+ test_loss (Optional[float]):
653
+ The test loss under the optimization metric
654
+ stored in self.metric
655
+ """
511
656
512
657
validation_loss : Optional [float ] = None
513
658
@@ -530,6 +675,31 @@ def file_output(
530
675
Y_valid_pred : np .ndarray ,
531
676
Y_test_pred : np .ndarray
532
677
) -> Tuple [Optional [float ], Dict ]:
678
+ """
679
+ This method decides what file outputs are written to disk.
680
+
681
+ It is also the interface to the backend save_numrun_to_dir
682
+ which stores all the pipeline related information to a single
683
+ directory for easy identification of the current run.
684
+
685
+ Args:
686
+ Y_optimization_pred (np.ndarray):
687
+ The pipeline predictions on the validation set internally created
688
+ from self.y_train
689
+ Y_valid_pred (np.ndarray):
690
+ The pipeline predictions on the user provided validation set,
691
+ which should match self.y_valid
692
+ Y_test_pred (np.ndarray):
693
+ The pipeline predictions on the user provided test set,
694
+ which should match self.y_test
695
+ Returns:
696
+ loss (Optional[float]):
697
+ A loss in case the run failed to store files to
698
+ disk
699
+ error_dict (Dict):
700
+ A dictionary with an error that explains why a run
701
+ was not successfully stored to disk.
702
+ """
533
703
# Abort if self.Y_optimization is None
534
704
# self.Y_optimization can be None if we use partial-cv, then,
535
705
# obviously no output should be saved.
@@ -624,6 +794,23 @@ def file_output(
624
794
625
795
def _predict_proba (self , X : np .ndarray , pipeline : BaseEstimator ,
626
796
Y_train : Optional [np .ndarray ] = None ) -> np .ndarray :
797
+ """
798
+ A wrapper function to handle the prediction of classification tasks.
799
+ It also makes sure that the predictions have the same dimensionality
800
+ as the expected labels
801
+
802
+ Args:
803
+ X (np.ndarray):
804
+ A set of features to feed to the pipeline
805
+ pipeline (BaseEstimator):
806
+ A model that will take the features X return a prediction y
807
+ This pipeline must be a classification estimator that supports
808
+ the predict_proba method.
809
+ Y_train (Optional[np.ndarray]):
810
+ Returns:
811
+ (np.ndarray):
812
+ The predictions of pipeline for the given features X
813
+ """
627
814
@no_type_check
628
815
def send_warnings_to_log (message , category , filename , lineno ,
629
816
file = None , line = None ):
@@ -640,6 +827,24 @@ def send_warnings_to_log(message, category, filename, lineno,
640
827
641
828
def _predict_regression (self , X : np .ndarray , pipeline : BaseEstimator ,
642
829
Y_train : Optional [np .ndarray ] = None ) -> np .ndarray :
830
+ """
831
+ A wrapper function to handle the prediction of regression tasks.
832
+ It is a wrapper to provide the same interface to _predict_proba
833
+
834
+ Regression predictions expect an unraveled dimensionality.
835
+ To comply with scikit-learn VotingRegressor requirement, if the estimator
836
+ predicts a (N,) shaped array, it is converted to (N, 1)
837
+
838
+ Args:
839
+ X (np.ndarray):
840
+ A set of features to feed to the pipeline
841
+ pipeline (BaseEstimator):
842
+ A model that will take the features X return a prediction y
843
+ Y_train (Optional[np.ndarray]):
844
+ Returns:
845
+ (np.ndarray):
846
+ The predictions of pipeline for the given features X
847
+ """
643
848
@no_type_check
644
849
def send_warnings_to_log (message , category , filename , lineno ,
645
850
file = None , line = None ):
@@ -658,6 +863,20 @@ def send_warnings_to_log(message, category, filename, lineno,
658
863
659
864
def _ensure_prediction_array_sizes (self , prediction : np .ndarray ,
660
865
Y_train : np .ndarray ) -> np .ndarray :
866
+ """
867
+ This method formats a prediction to match the dimensionality of the provided
868
+ labels (Y_train). This should be used exclusively for classification tasks
869
+
870
+ Args:
871
+ prediction (np.ndarray):
872
+ The un-formatted predictions of a pipeline
873
+ Y_train (np.ndarray):
874
+ The labels from the dataset to give an intuition of the expected
875
+ predictions dimensionality
876
+ Returns:
877
+ (np.ndarray):
878
+ The formatted prediction
879
+ """
661
880
assert self .datamanager .num_classes is not None , "Called function on wrong task"
662
881
num_classes : int = self .datamanager .num_classes
663
882
0 commit comments