Commit c926036
[fix] [test] Add a small number to label for regression and add tests
Regression target labels must be floats, and sklearn only infers a continuous target when the values have digits after the decimal point. I therefore added a workaround that adds a fraction close to the smallest possible one to the array, so that sklearn does not mis-infer the task type. I also added tests to check that we get the expected results for extreme cases.
1 parent 295a307 commit c926036
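
To illustrate the inference problem the message describes, here is a small sketch (not part of the commit) that calls sklearn's type_of_target directly; the target values mirror the ones used in the new test below:

import numpy as np
from sklearn.utils.multiclass import type_of_target

y = np.array([0, 1, 2, 3, 4]) + 10 ** 15             # integer-valued regression targets
print(type_of_target(y))                              # 'multiclass' -> task type is mis-inferred
print(type_of_target(y.astype(np.float64)))           # still 'multiclass': no digits after the decimal point
offset = np.abs(y).min() * 1e-16                      # the small fraction this commit adds
print(type_of_target(y.astype(np.float64) + offset))  # 'continuous'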

File tree

3 files changed: +70 −12 lines changed

autoPyTorch/data/tabular_target_validator.py
autoPyTorch/datasets/base_dataset.py
test/test_api/test_api.py


autoPyTorch/data/tabular_target_validator.py

Lines changed: 11 additions & 3 deletions
@@ -145,10 +145,18 @@ def transform(
         if y.ndim == 2 and y.shape[1] == 1:
             y = np.ravel(y)

-        if not self.is_classification:
-            # Regression targets must be cast to float
+        if not self.is_classification and "continuous" not in type_of_target(y):
+            # Regression targets must have numbers after a decimal point.
             # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952
-            y = y.astype(dtype=np.float64)
+            y_min = np.abs(y).min()
+            offset = y_min * 1e-16  # Sufficiently small number
+            if y_min > 1e15:
+                raise ValueError(
+                    "The minimum value for the target labels of regression tasks must be smaller than "
+                    f"1e15 to avoid errors caused by an overflow, but got {y_min}"
+                )
+
+            y = y.astype(dtype=np.float64) + offset  # Since it is all integer, we can just add a random small number

         return y
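
The 1e15 guard in the hunk above lines up with float64 precision: once the smallest target magnitude approaches 2**53, the spacing between representable doubles exceeds the offset, so the added fraction would be rounded away (or the integer targets themselves distorted). A rough illustration of that limit, not taken from the commit:

import numpy as np

print(np.spacing(1e15))                 # 0.125 -> an offset of ~0.1 still produces a fractional part
print(np.spacing(1e16))                 # 2.0   -> no sub-integer offset can survive at this magnitude
print(np.float64(1e16) + 0.5 == 1e16)   # True: the offset is silently rounded away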

autoPyTorch/datasets/base_dataset.py

Lines changed: 31 additions & 9 deletions
@@ -49,6 +49,36 @@ def type_check(train_tensors: BaseDatasetInputType,
             check_valid_data(val_tensors[i])


+def _get_output_properties(train_tensors: BaseDatasetInputType) -> Tuple[int, str]:
+    """
+    Return the output dimension and output type given the target labels.
+
+    Args:
+        train_tensors (BaseDatasetInputType):
+            Training data.
+
+    Returns:
+        output_dim (int):
+            The dimension of the outputs.
+        output_type (str):
+            The output type according to the sklearn specification.
+    """
+    if isinstance(train_tensors, Dataset):
+        target_labels = np.array([sample[-1] for sample in train_tensors])
+    else:
+        target_labels = np.array(train_tensors[1])
+
+    output_type: str = type_of_target(target_labels)
+    if STRING_TO_OUTPUT_TYPES.get(output_type, None) in CLASSIFICATION_OUTPUTS:
+        output_dim = len(np.unique(target_labels))
+    elif target_labels.ndim > 1:
+        output_dim = target_labels.shape[-1]
+    else:
+        output_dim = 1
+
+    return output_dim, output_type
+
+
 class TransformSubset(Subset):
     """Wrapper of BaseDataset for splitted datasets

@@ -132,15 +162,7 @@ def __init__(
         self.issparse: bool = issparse(self.train_tensors[0])
         self.input_shape: Tuple[int] = self.train_tensors[0].shape[1:]
         if len(self.train_tensors) == 2 and self.train_tensors[1] is not None:
-            self.output_type: str = type_of_target(self.train_tensors[1])
-
-            if (
-                self.output_type in STRING_TO_OUTPUT_TYPES
-                and STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS
-            ):
-                self.output_shape = len(np.unique(self.train_tensors[1]))
-            else:
-                self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1
+            self.output_shape, self.output_type = _get_output_properties(self.train_tensors)

         # TODO: Look for a criteria to define small enough to preprocess
         self.is_small_preprocess = True
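
For reference, a standalone sketch of what the new helper computes; the tuple of classification type strings here is my stand-in for autoPyTorch's STRING_TO_OUTPUT_TYPES / CLASSIFICATION_OUTPUTS constants, which are not shown in this diff:

import numpy as np
from sklearn.utils.multiclass import type_of_target

def output_properties_sketch(target_labels: np.ndarray):
    # Assumed classification type names; the real code maps them via constants.
    output_type = type_of_target(target_labels)
    if output_type in ("binary", "multiclass"):
        output_dim = len(np.unique(target_labels))    # number of classes
    elif target_labels.ndim > 1:
        output_dim = target_labels.shape[-1]          # multi-output regression
    else:
        output_dim = 1                                # single regression target
    return output_dim, output_type

print(output_properties_sketch(np.array([0, 1, 2, 0, 1])))   # (3, 'multiclass')
print(output_properties_sketch(np.array([0.5, 1.2, 3.3])))   # (1, 'continuous')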

test/test_api/test_api.py

Lines changed: 28 additions & 0 deletions
@@ -904,3 +904,31 @@ def test_tabular_classification_test_evaluator(openml_id, backend, n_samples):
     assert 'opt_loss' in incumbent_results, "run history: {}, successful_num_run: {}".format(estimator.run_history.data,
                                                                                              successful_num_run)
     assert 'train_loss' in incumbent_results
+
+
+@pytest.mark.parametrize("ans,task_class", (
+    ("continuous", TabularRegressionTask),
+    ("multiclass", TabularClassificationTask))
+)
+def test_task_inference(ans, task_class, backend):
+    # Get the data and check that contents of data-manager make sense
+    X = np.random.random((5, 1))
+    y = np.array([0, 1, 2, 3, 4]) + 10 ** 15
+
+    X_train, _, y_train, _ = sklearn.model_selection.train_test_split(X, y, random_state=42)
+
+    estimator = task_class(
+        backend=backend,
+        resampling_strategy=HoldoutValTypes.holdout_validation,
+        resampling_strategy_args=None,
+        seed=42,
+    )
+    dataset = estimator.get_dataset(X_train, y_train)
+    assert dataset.output_type == ans
+
+    y_train += 1
+    if ans == 'continuous':
+        with pytest.raises(ValueError):  # ValueError due to `Too large value`
+            estimator.get_dataset(X_train, y_train)
+    else:
+        estimator.get_dataset(X_train, y_train)
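
Assuming the usual pytest invocation from the repository root, the new parametrized test can be run on its own with something like:

python -m pytest test/test_api/test_api.py::test_task_inference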
