Skip to content

Commit 8787186

Browse files
committed
rebase and fix flake
1 parent 907b537 commit 8787186

File tree

5 files changed

+93
-33
lines changed

5 files changed

+93
-33
lines changed

autoPyTorch/api/base_task.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,14 @@ def _get_dataset_input_validator(
299299
y_train: Union[List, pd.DataFrame, np.ndarray],
300300
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
301301
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
302303
resampling_strategy: Optional[ResamplingStrategies] = None,
303310
resampling_strategy_args: Optional[Dict[str, Any]] = None,
304311
dataset_name: Optional[str] = None,
305312
) -> Tuple[BaseDataset, BaseInputValidator]:
@@ -341,7 +348,14 @@ def get_dataset(
341348
y_train: Union[List, pd.DataFrame, np.ndarray],
342349
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
343350
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
344352
resampling_strategy: Optional[ResamplingStrategies] = None,
345359
resampling_strategy_args: Optional[Dict[str, Any]] = None,
346360
dataset_name: Optional[str] = None,
347361
) -> BaseDataset:
@@ -1389,7 +1403,14 @@ def fit_pipeline(
13891403
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
13901404
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
13911405
dataset_name: Optional[str] = None,
13921407
resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None,
13931414
resampling_strategy_args: Optional[Dict[str, Any]] = None,
13941415
run_time_limit_secs: int = 60,
13951416
memory_limit: Optional[int] = None,
@@ -1513,7 +1534,6 @@ def fit_pipeline(
15131534
(BaseDataset):
15141535
Dataset created from the given tensors
15151536
"""
1516-
self.dataset_name = dataset.dataset_name
15171537

15181538
if dataset is None:
15191539
if (

autoPyTorch/api/tabular_classification.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -374,19 +374,6 @@ def search(
374374
self
375375
376376
"""
377-
if dataset_name is None:
378-
dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
379-
380-
# we have to create a logger for at this point for the validator
381-
self._logger = self._get_logger(dataset_name)
382-
383-
# Create a validator object to make sure that the data provided by
384-
# the user matches the autopytorch requirements
385-
self.InputValidator = TabularInputValidator(
386-
is_classification=True,
387-
logger_port=self._logger_port,
388-
)
389-
390377
self.dataset, self.InputValidator = self._get_dataset_input_validator(
391378
X_train=X_train,
392379
y_train=y_train,
@@ -404,9 +391,6 @@ def search(
404391
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
405392
)
406393

407-
if self.dataset is None:
408-
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
409-
410394
return self._search(
411395
dataset=self.dataset,
412396
optimize_metric=optimize_metric,

autoPyTorch/api/tabular_regression.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -391,9 +391,6 @@ def search(
391391
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
392392
)
393393

394-
if self.dataset is None:
395-
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
396-
397394
return self._search(
398395
dataset=self.dataset,
399396
optimize_metric=optimize_metric,

autoPyTorch/evaluation/fit_evaluator.py

Lines changed: 71 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
AbstractEvaluator,
1717
fit_and_suppress_warnings
1818
)
19+
from autoPyTorch.evaluation.utils import DisableFileOutputParameters
1920
from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
2021
from autoPyTorch.utils.common import subsampler
2122
from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
@@ -33,7 +34,7 @@ def __init__(self, backend: Backend, queue: Queue,
3334
num_run: Optional[int] = None,
3435
include: Optional[Dict[str, Any]] = None,
3536
exclude: Optional[Dict[str, Any]] = None,
36-
disable_file_output: Union[bool, List] = False,
37+
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
3738
init_params: Optional[Dict[str, Any]] = None,
3839
logger_port: Optional[int] = None,
3940
keep_models: Optional[bool] = None,
@@ -241,14 +242,11 @@ def file_output(
241242
)
242243

243244
# Abort if we don't want to output anything.
244-
if hasattr(self, 'disable_file_output'):
245-
if self.disable_file_output:
246-
return None, {}
247-
else:
248-
self.disabled_file_outputs = []
245+
if 'all' in self.disable_file_output:
246+
return None, {}
249247

250-
if hasattr(self, 'pipeline') and self.pipeline is not None:
251-
if 'pipeline' not in self.disabled_file_outputs:
248+
if getattr(self, 'pipeline', None) is not None:
249+
if 'pipeline' not in self.disable_file_output:
252250
pipeline = self.pipeline
253251
else:
254252
pipeline = None
@@ -265,11 +263,11 @@ def file_output(
265263
ensemble_predictions=None,
266264
valid_predictions=(
267265
Y_valid_pred if 'y_valid' not in
268-
self.disabled_file_outputs else None
266+
self.disable_file_output else None
269267
),
270268
test_predictions=(
271269
Y_test_pred if 'y_test' not in
272-
self.disabled_file_outputs else None
270+
self.disable_file_output else None
273271
),
274272
)
275273

@@ -287,8 +285,8 @@ def eval_function(
287285
num_run: int,
288286
include: Optional[Dict[str, Any]],
289287
exclude: Optional[Dict[str, Any]],
290-
disable_file_output: Union[bool, List],
291288
output_y_hat_optimization: bool = False,
289+
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
292290
pipeline_config: Optional[Dict[str, Any]] = None,
293291
budget_type: str = None,
294292
init_params: Optional[Dict[str, Any]] = None,
@@ -297,14 +295,75 @@ def eval_function(
297295
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
298296
instance: str = None,
299297
) -> None:
298+
"""
299+
This closure allows the communication between the ExecuteTaFuncWithQueue and the
300+
pipeline trainer (TrainEvaluator).
301+
302+
Fundamentally, smac calls the ExecuteTaFuncWithQueue.run() method, which internally
303+
builds a TrainEvaluator. The TrainEvaluator builds a pipeline, stores the output files
304+
to disc via the backend, and puts the performance result of the run in the queue.
305+
306+
307+
Attributes:
308+
backend (Backend):
309+
An object to interface with the disk storage. In particular, allows to
310+
access the train and test datasets
311+
queue (Queue):
312+
Each worker available will instantiate an evaluator, and after completion,
313+
it will return the evaluation result via a multiprocessing queue
314+
metric (autoPyTorchMetric):
315+
A scorer object that is able to evaluate how good a pipeline was fit. It
316+
is a wrapper on top of the actual score method (a wrapper on top of scikit
317+
learn accuracy for example) that formats the predictions accordingly.
318+
budget (float):
319+
The amount of epochs/time a configuration is allowed to run.
320+
budget_type (str):
321+
The budget type, which can be epochs or time
322+
pipeline_config (Optional[Dict[str, Any]]):
323+
Defines the content of the pipeline being evaluated. For example, it
324+
contains pipeline specific settings like logging name, or whether or not
325+
to use tensorboard.
326+
config (Union[int, str, Configuration]):
327+
Determines the pipeline to be constructed.
328+
seed (int):
329+
An integer that allows for reproducibility of results
330+
output_y_hat_optimization (bool):
331+
Whether this worker should output the target predictions, so that they are
332+
stored on disk. Fundamentally, the resampling strategy might shuffle the
333+
Y_train targets, so we store the split in order to re-use them for ensemble
334+
selection.
335+
num_run (Optional[int]):
336+
An identifier of the current configuration being fit. This number is unique per
337+
configuration.
338+
include (Optional[Dict[str, Any]]):
339+
An optional dictionary to include components of the pipeline steps.
340+
exclude (Optional[Dict[str, Any]]):
341+
An optional dictionary to exclude components of the pipeline steps.
342+
disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]):
343+
By default, the model, its predictions, and other metadata are stored on disk
344+
for each finished configuration. This argument allows the user to skip
345+
saving certain file types, for example the model, from being written to disk.
346+
init_params (Optional[Dict[str, Any]]):
347+
Optional argument that is passed to each pipeline step. It is the equivalent of
348+
kwargs for the pipeline steps.
349+
logger_port (Optional[int]):
350+
Logging is performed using a socket-server scheme to be robust against many
351+
parallel entities that want to write to the same file. This integer states the
352+
socket port for the communication channel. If None is provided, a traditional
353+
logger is used.
354+
instance (str):
355+
An instance on which to evaluate the current pipeline. By default we work
356+
with a single instance, being the provided X_train, y_train of a single dataset.
357+
This instance is a compatibility argument for SMAC, that is capable of working
358+
with multiple datasets at the same time.
359+
"""
300360
evaluator = FitEvaluator(
301361
backend=backend,
302362
queue=queue,
303363
metric=metric,
304364
configuration=config,
305365
seed=seed,
306366
num_run=num_run,
307-
output_y_hat_optimization=output_y_hat_optimization,
308367
include=include,
309368
exclude=exclude,
310369
disable_file_output=disable_file_output,

autoPyTorch/evaluation/train_evaluator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,10 +428,10 @@ def eval_train_function(
428428
budget: float,
429429
config: Optional[Configuration],
430430
seed: int,
431-
output_y_hat_optimization: bool,
432431
num_run: int,
433432
include: Optional[Dict[str, Any]],
434433
exclude: Optional[Dict[str, Any]],
434+
output_y_hat_optimization: bool,
435435
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
436436
pipeline_config: Optional[Dict[str, Any]] = None,
437437
budget_type: str = None,

0 commit comments

Comments
 (0)