Commit c926036
[fix] [test] Add a small number to label for regression and add tests
Regression target labels must be floats, and sklearn only infers a continuous target when the values have digits after the decimal point. I therefore added a workaround that adds a fraction close to the smallest possible one to the array, so that sklearn does not mis-infer the task type. I also added tests to check that we get the expected results for extreme cases.
1 parent 295a307 commit c926036
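
To illustrate the inference problem the message describes, here is a small sketch (not part of the commit) that calls sklearn's type_of_target directly; the target values mirror the ones used in the new test below:

import numpy as np
from sklearn.utils.multiclass import type_of_target

y = np.array([0, 1, 2, 3, 4]) + 10 ** 15             # integer-valued regression targets
print(type_of_target(y))                              # 'multiclass' -> task type is mis-inferred
print(type_of_target(y.astype(np.float64)))           # still 'multiclass': no digits after the decimal point
offset = np.abs(y).min() * 1e-16                      # the small fraction this commit adds
print(type_of_target(y.astype(np.float64) + offset))  # 'continuous'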

File tree

3 files changed: +70 −12 lines changed

autoPyTorch/data/tabular_target_validator.py
autoPyTorch/datasets/base_dataset.py
test/test_api/test_api.py


autoPyTorch/data/tabular_target_validator.py

Lines changed: 11 additions & 3 deletions
@@ -145,10 +145,18 @@ def transform(
         if y.ndim == 2 and y.shape[1] == 1:
             y = np.ravel(y)

-        if not self.is_classification:
-            # Regression targets must be cast to float
+        if not self.is_classification and "continuous" not in type_of_target(y):
+            # Regression targets must have numbers after a decimal point.
             # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952
-            y = y.astype(dtype=np.float64)
+            y_min = np.abs(y).min()
+            offset = y_min * 1e-16  # Sufficiently small number
+            if y_min > 1e15:
+                raise ValueError(
+                    "The minimum value for the target labels of regression tasks must be smaller than "
+                    f"1e15 to avoid errors caused by an overflow, but got {y_min}"
+                )
+
+            y = y.astype(dtype=np.float64) + offset  # Since it is all integer, we can just add a random small number

         return y
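
The 1e15 guard in the hunk above lines up with float64 precision: once the smallest target magnitude approaches 2**53, the spacing between representable doubles exceeds the offset, so the added fraction would be rounded away (or the integer targets themselves distorted). A rough illustration of that limit, not taken from the commit:

import numpy as np

print(np.spacing(1e15))                 # 0.125 -> an offset of ~0.1 still produces a fractional part
print(np.spacing(1e16))                 # 2.0   -> no sub-integer offset can survive at this magnitude
print(np.float64(1e16) + 0.5 == 1e16)   # True: the offset is silently rounded away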

autoPyTorch/datasets/base_dataset.py

Lines changed: 31 additions & 9 deletions
@@ -49,6 +49,36 @@ def type_check(train_tensors: BaseDatasetInputType,
             check_valid_data(val_tensors[i])


+def _get_output_properties(train_tensors: BaseDatasetInputType) -> Tuple[int, str]:
+    """
+    Return the output dimension and output type given the target labels.
+
+    Args:
+        train_tensors (BaseDatasetInputType):
+            Training data.
+
+    Returns:
+        output_dim (int):
+            The dimension of the outputs.
+        output_type (str):
+            The output type according to the sklearn specification.
+    """
+    if isinstance(train_tensors, Dataset):
+        target_labels = np.array([sample[-1] for sample in train_tensors])
+    else:
+        target_labels = np.array(train_tensors[1])
+
+    output_type: str = type_of_target(target_labels)
+    if STRING_TO_OUTPUT_TYPES.get(output_type, None) in CLASSIFICATION_OUTPUTS:
+        output_dim = len(np.unique(target_labels))
+    elif target_labels.ndim > 1:
+        output_dim = target_labels.shape[-1]
+    else:
+        output_dim = 1
+
+    return output_dim, output_type
+
+
 class TransformSubset(Subset):
     """Wrapper of BaseDataset for splitted datasets

@@ -132,15 +162,7 @@ def __init__(
         self.issparse: bool = issparse(self.train_tensors[0])
         self.input_shape: Tuple[int] = self.train_tensors[0].shape[1:]
         if len(self.train_tensors) == 2 and self.train_tensors[1] is not None:
-            self.output_type: str = type_of_target(self.train_tensors[1])
-
-            if (
-                self.output_type in STRING_TO_OUTPUT_TYPES
-                and STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS
-            ):
-                self.output_shape = len(np.unique(self.train_tensors[1]))
-            else:
-                self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1
+            self.output_shape, self.output_type = _get_output_properties(self.train_tensors)

         # TODO: Look for a criteria to define small enough to preprocess
         self.is_small_preprocess = True
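
For reference, a standalone sketch of what the new helper computes; the tuple of classification type strings here is my stand-in for autoPyTorch's STRING_TO_OUTPUT_TYPES / CLASSIFICATION_OUTPUTS constants, which are not shown in this diff:

import numpy as np
from sklearn.utils.multiclass import type_of_target

def output_properties_sketch(target_labels: np.ndarray):
    # Assumed classification type names; the real code maps them via constants.
    output_type = type_of_target(target_labels)
    if output_type in ("binary", "multiclass"):
        output_dim = len(np.unique(target_labels))    # number of classes
    elif target_labels.ndim > 1:
        output_dim = target_labels.shape[-1]          # multi-output regression
    else:
        output_dim = 1                                # single regression target
    return output_dim, output_type

print(output_properties_sketch(np.array([0, 1, 2, 0, 1])))   # (3, 'multiclass')
print(output_properties_sketch(np.array([0.5, 1.2, 3.3])))   # (1, 'continuous')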

test/test_api/test_api.py

Lines changed: 28 additions & 0 deletions
@@ -904,3 +904,31 @@ def test_tabular_classification_test_evaluator(openml_id, backend, n_samples):
     assert 'opt_loss' in incumbent_results, "run history: {}, successful_num_run: {}".format(estimator.run_history.data,
                                                                                              successful_num_run)
     assert 'train_loss' in incumbent_results
+
+
+@pytest.mark.parametrize("ans,task_class", (
+    ("continuous", TabularRegressionTask),
+    ("multiclass", TabularClassificationTask))
+)
+def test_task_inference(ans, task_class, backend):
+    # Get the data and check that contents of data-manager make sense
+    X = np.random.random((5, 1))
+    y = np.array([0, 1, 2, 3, 4]) + 10 ** 15
+
+    X_train, _, y_train, _ = sklearn.model_selection.train_test_split(X, y, random_state=42)
+
+    estimator = task_class(
+        backend=backend,
+        resampling_strategy=HoldoutValTypes.holdout_validation,
+        resampling_strategy_args=None,
+        seed=42,
+    )
+    dataset = estimator.get_dataset(X_train, y_train)
+    assert dataset.output_type == ans
+
+    y_train += 1
+    if ans == 'continuous':
+        with pytest.raises(ValueError):  # ValueError due to `Too large value`
+            estimator.get_dataset(X_train, y_train)
+    else:
+        estimator.get_dataset(X_train, y_train)
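
Assuming the usual pytest invocation from the repository root, the new parametrized test can be run on its own with something like:

python -m pytest test/test_api/test_api.py::test_task_inference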
