Skip to content

Commit 17d18d8

Browse files
committed
Fix bugs in cutout training (#233)
* Fix bugs in cutout training * Address comments from Arlind
1 parent b487831 commit 17d18d8

File tree

4 files changed

+29
-25
lines changed

4 files changed

+29
-25
lines changed

autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,9 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
3535
if beta <= 0 or r > self.alpha:
3636
return X, {'y_a': y, 'y_b': y[index], 'lam': 1}
3737

38-
# The mixup component mixes up also on the batch dimension
39-
# It is unlikely that the batch size is lower than the number of features, but
40-
# be safe
41-
size = min(X.shape[0], X.shape[1])
42-
indices = torch.tensor(self.random_state.choice(range(1, size), max(1, np.int(size * lam))))
38+
size = X.shape[1]
39+
indices = torch.tensor(self.random_state.choice(range(1, size), max(1, np.int32(size * lam)),
40+
replace=False))
4341

4442
X[:, indices] = X[index, :][:, indices]
4543

autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010

1111
class RowCutOutTrainer(CutOut, BaseTrainerComponent):
12+
NUMERICAL_VALUE = 0
13+
CATEGORICAL_VALUE = -1
1214

1315
def data_preparation(self, X: np.ndarray, y: np.ndarray,
1416
) -> typing.Tuple[np.ndarray, typing.Dict[str, np.ndarray]]:
@@ -34,17 +36,26 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
3436
lam = 1
3537
return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam}
3638

37-
# The mixup component mixes up also on the batch dimension
38-
# It is unlikely that the batch size is lower than the number of features, but
39-
# be safe
40-
size = min(X.shape[0], X.shape[1])
41-
indices = torch.tensor(self.random_state.choice(range(1, size), max(1, np.int(size * self.patch_ratio))))
39+
size = X.shape[1]
40+
indices = self.random_state.choice(range(1, size), max(1, np.int32(size * self.patch_ratio)),
41+
replace=False)
4242

43-
# We use an ordinal encoder on the tabular data
43+
if not isinstance(self.numerical_columns, typing.Iterable):
44+
raise ValueError("{} requires numerical columns information of {}"
45+
"to prepare data got {}.".format(self.__class__.__name__,
46+
typing.Iterable,
47+
self.numerical_columns))
48+
numerical_indices = torch.tensor(self.numerical_columns)
49+
categorical_indices = torch.tensor([index for index in indices if index not in self.numerical_columns])
50+
51+
# We use an ordinal encoder on the categorical columns of tabular data
4452
# -1 is the conceptual equivalent to 0 in a image, that does not
4553
# have color as a feature and hence the network has to learn to deal
46-
# without this data
47-
X[:, indices.long()] = -1
54+
# without this data. For numerical columns we use 0 to cutout the features
55+
# similar to the effect that setting 0 as a pixel value in an image.
56+
X[:, categorical_indices.long()] = self.CATEGORICAL_VALUE
57+
X[:, numerical_indices.long()] = self.NUMERICAL_VALUE
58+
4859
lam = 1
4960
y_a = y
5061
y_b = y

autoPyTorch/pipeline/components/training/trainer/__init__.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,8 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
347347
task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']],
348348
labels=labels,
349349
step_interval=X['step_interval']
350+
numerical_columns=X['dataset_properties']['numerical_columns'] if 'numerical_columns' in X[
351+
'dataset_properties'] else None
350352
)
351353
total_parameter_count, trainable_parameter_count = self.count_parameters(X['network'])
352354
self.run_summary = RunSummary(
@@ -385,11 +387,7 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
385387

386388
val_loss, val_metrics, test_loss, test_metrics = None, {}, None, {}
387389
if self.eval_valid_each_epoch(X):
388-
<<<<<<< HEAD
389-
if X['val_data_loader']:
390-
=======
391390
if 'val_data_loader' in X and X['val_data_loader']:
392-
>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
393391
val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
394392
if 'test_data_loader' in X and X['test_data_loader']:
395393
test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer)
@@ -443,17 +441,10 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
443441

444442
# wrap up -- add score if not evaluating every epoch
445443
if not self.eval_valid_each_epoch(X):
446-
<<<<<<< HEAD
447-
if X['val_data_loader']:
448-
val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
449-
if 'test_data_loader' in X and X['val_data_loader']:
450-
test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer)
451-
=======
452444
if 'val_data_loader' in X and X['val_data_loader']:
453445
val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
454446
if 'test_data_loader' in X and X['test_data_loader']:
455447
test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'])
456-
>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
457448
self.run_summary.add_performance(
458449
epoch=epoch,
459450
start_time=start_time,

autoPyTorch/pipeline/components/training/trainer/base_trainer.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,8 @@ def prepare(
265265
scheduler: _LRScheduler,
266266
task_type: int,
267267
labels: Union[np.ndarray, torch.Tensor, pd.DataFrame],
268-
step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch
268+
step_interval: Union[str, StepIntervalUnit] = StepIntervalUnit.batch,
269+
numerical_columns: Optional[List[int]] = None
269270
) -> None:
270271

271272
# Save the device to be used
@@ -322,6 +323,9 @@ def prepare(
322323
# task type (used for calculating metrics)
323324
self.task_type = task_type
324325

326+
# for cutout trainer, we need the list of numerical columns
327+
self.numerical_columns = numerical_columns
328+
325329
def on_epoch_start(self, X: Dict[str, Any], epoch: int) -> None:
326330
"""
327331
Optional place holder for AutoPytorch Extensions.

0 commit comments

Comments (0)