Skip to content

Commit 5168ba5

Browse files
ArlindKadra authored and ravinkohli committed
Fixing issues with imbalanced datasets (#197)
* adding missing method from base_feature_validator * First try at a fix, removing redundant code * Fix bug * Updating unit test typo, fixing bug where the data type was not checked because X was a numpy array at the time of checking * Fixing flake 8 failing * Bug fix, implementation update for imbalanced datasets and unit tests to check the implementation * flake8 fix * Bug fix * Making the conversion to dataframe in the unit tests consistent with what happens at the validator, so the types do not change * flake8 fix * Addressing Ravin's comments
1 parent 6d4790f commit 5168ba5

File tree

4 files changed

+120
-35
lines changed

4 files changed

+120
-35
lines changed

autoPyTorch/data/base_feature_validator.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,20 @@ def _fit(
111111
"""
112112
raise NotImplementedError()
113113

114+
def _check_data(
115+
self,
116+
X: SUPPORTED_FEAT_TYPES,
117+
) -> None:
118+
"""
119+
Feature dimensionality and data type checks
120+
121+
Arguments:
122+
X (SUPPORTED_FEAT_TYPES):
123+
A set of features that are going to be validated (type and dimensionality
124+
checks) and a encoder fitted in the case the data needs encoding
125+
"""
126+
raise NotImplementedError()
127+
114128
def transform(
115129
self,
116130
X: SupportedFeatTypes,

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -162,9 +162,13 @@ def _fit(
162162
# with nan values.
163163
# Columns that are completely made of NaN values are provided to the pipeline
164164
# so that later stages decide how to handle them
165+
166+
# Clear whatever null column markers we had previously
167+
self.null_columns.clear()
165168
if np.any(pd.isnull(X)):
166169
for column in X.columns:
167170
if X[column].isna().all():
171+
self.null_columns.add(column)
168172
X[column] = pd.to_numeric(X[column])
169173
# Also note this change in self.dtypes
170174
if len(self.dtypes) != 0:
@@ -244,30 +248,38 @@ def transform(
244248
if isinstance(X, np.ndarray):
245249
X = self.numpy_array_to_pandas(X)
246250

247-
if ispandas(X) and not issparse(X):
248-
if np.any(pd.isnull(X)):
249-
for column in X.columns:
250-
if X[column].isna().all():
251-
X[column] = pd.to_numeric(X[column])
251+
if hasattr(X, "iloc") and not issparse(X):
252+
X = cast(pd.DataFrame, X)
253+
# If we had null columns in our fit call and we made them numeric, then:
254+
# - If the columns are null even in transform, apply the same procedure.
255+
# - Otherwise, substitute the values with np.NaN and then make the columns numeric.
256+
# If the column is null here, but it was not in fit, it does not matter.
257+
for column in self.null_columns:
258+
# The column is not null, make it null since it was null in fit.
259+
if not X[column].isna().all():
260+
X[column] = np.NaN
261+
X[column] = pd.to_numeric(X[column])
262+
263+
# for the test set, if we have columns with only null values
264+
# they will probably have a numeric type. If these columns were not
265+
# with only null values in the train set, they should be converted
266+
# to the type that they had during fitting.
267+
for column in X.columns:
268+
if X[column].isna().all():
269+
X[column] = X[column].astype(self.dtypes[list(X.columns).index(column)])
252270

253271
# Also remove the object dtype for new data
254272
if not X.select_dtypes(include='object').empty:
255273
X = self.infer_objects(X)
256274

257275
# Check the data here so we catch problems on new test data
258276
self._check_data(X)
277+
# We also need to fillna on the transformation
278+
# in case test data is provided
279+
X = self.impute_nan_in_categories(X)
259280

260-
# Pandas related transformations
261-
if ispandas(X) and self.column_transformer is not None:
262-
if np.any(pd.isnull(X)):
263-
# After above check it means that if there is a NaN
264-
# the whole column must be NaN
265-
# Make sure it is numerical and let the pipeline handle it
266-
for column in X.columns:
267-
if X[column].isna().all():
268-
X[column] = pd.to_numeric(X[column])
269-
270-
X = self.column_transformer.transform(X)
281+
if self.encoder is not None:
282+
X = self.encoder.transform(X)
271283

272284
# Sparse related transformations
273285
# Not all sparse format support index sorting
@@ -557,7 +569,7 @@ def numpy_array_to_pandas(
557569
Returns:
558570
pd.DataFrame
559571
"""
560-
return pd.DataFrame(X).infer_objects().convert_dtypes()
572+
return pd.DataFrame(X).convert_dtypes()
561573

562574
def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
563575
"""
@@ -575,18 +587,13 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
575587
if hasattr(self, 'object_dtype_mapping'):
576588
# Mypy does not process the has attr. This dict is defined below
577589
for key, dtype in self.object_dtype_mapping.items(): # type: ignore[has-type]
578-
if 'int' in dtype.name:
579-
# In the case train data was interpreted as int
580-
# and test data was interpreted as float, because of 0.0
581-
# for example, honor training data
582-
X[key] = X[key].applymap(np.int64)
583-
else:
584-
try:
585-
X[key] = X[key].astype(dtype.name)
586-
except Exception as e:
587-
# Try inference if possible
588-
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
589-
pass
590+
# honor the training data types
591+
try:
592+
X[key] = X[key].astype(dtype.name)
593+
except Exception as e:
594+
# Try inference if possible
595+
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
596+
pass
590597
else:
591598
X = X.infer_objects()
592599
for column in X.columns:

test/test_data/test_feature_validator.py

Lines changed: 70 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import copy
1+
import copy
22
import functools
33

44
import numpy as np
@@ -139,9 +139,9 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
139139
if isinstance(input_data_featuretest, pd.DataFrame):
140140
pytest.skip("Column order change in pandas is not supported")
141141
elif isinstance(input_data_featuretest, np.ndarray):
142-
complementary_type = pd.DataFrame(input_data_featuretest)
142+
complementary_type = validator.numpy_array_to_pandas(input_data_featuretest)
143143
elif isinstance(input_data_featuretest, list):
144-
complementary_type = pd.DataFrame(input_data_featuretest)
144+
complementary_type, _ = validator.list_to_dataframe(input_data_featuretest)
145145
elif sparse.issparse(input_data_featuretest):
146146
complementary_type = sparse.csr_matrix(input_data_featuretest.todense())
147147
else:
@@ -331,8 +331,11 @@ def test_unknown_encode_value():
331331
)
332332
@pytest.mark.parametrize('train_data_type', ('numpy', 'pandas', 'list'))
333333
@pytest.mark.parametrize('test_data_type', ('numpy', 'pandas', 'list'))
334-
def test_featurevalidator_new_data_after_fit(openml_id,
335-
train_data_type, test_data_type):
334+
def test_feature_validator_new_data_after_fit(
335+
openml_id,
336+
train_data_type,
337+
test_data_type,
338+
):
336339

337340
# List is currently not supported as infer_objects
338341
# cast list objects to type objects
@@ -526,3 +529,65 @@ def test_feature_validator_get_columns_to_encode_error_feat_type(input_data_feat
526529
validator = TabularFeatureValidator(feat_types=feat_types)
527530
with pytest.raises(ValueError, match=r"Expected type of features to be in .*"):
528531
validator._validate_feat_types(X)
532+
533+
def test_feature_validator_imbalanced_data():
534+
535+
# Null columns in the train split but not necessarily in the test split
536+
train_features = {
537+
'A': [np.NaN, np.NaN, np.NaN],
538+
'B': [1, 2, 3],
539+
'C': [np.NaN, np.NaN, np.NaN],
540+
'D': [np.NaN, np.NaN, np.NaN],
541+
}
542+
test_features = {
543+
'A': [3, 4, 5],
544+
'B': [6, 5, 7],
545+
'C': [np.NaN, np.NaN, np.NaN],
546+
'D': ['Blue', np.NaN, np.NaN],
547+
}
548+
549+
X_train = pd.DataFrame.from_dict(train_features)
550+
X_test = pd.DataFrame.from_dict(test_features)
551+
validator = TabularFeatureValidator()
552+
validator.fit(X_train)
553+
554+
train_feature_types = copy.deepcopy(validator.feat_type)
555+
assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical']
556+
# validator will throw an error if the column types are not the same
557+
transformed_X_test = validator.transform(X_test)
558+
transformed_X_test = pd.DataFrame(transformed_X_test)
559+
null_columns = []
560+
for column in transformed_X_test.columns:
561+
if transformed_X_test[column].isna().all():
562+
null_columns.append(column)
563+
assert null_columns == [0, 2, 3]
564+
565+
# Columns with not all null values in the train split and
566+
# completely null on the test split.
567+
train_features = {
568+
'A': [np.NaN, np.NaN, 4],
569+
'B': [1, 2, 3],
570+
'C': ['Blue', np.NaN, np.NaN],
571+
}
572+
test_features = {
573+
'A': [np.NaN, np.NaN, np.NaN],
574+
'B': [6, 5, 7],
575+
'C': [np.NaN, np.NaN, np.NaN],
576+
}
577+
578+
X_train = pd.DataFrame.from_dict(train_features)
579+
X_test = pd.DataFrame.from_dict(test_features)
580+
validator = TabularFeatureValidator()
581+
validator.fit(X_train)
582+
train_feature_types = copy.deepcopy(validator.feat_type)
583+
assert train_feature_types == ['categorical', 'numerical', 'numerical']
584+
585+
transformed_X_test = validator.transform(X_test)
586+
transformed_X_test = pd.DataFrame(transformed_X_test)
587+
null_columns = []
588+
for column in transformed_X_test.columns:
589+
if transformed_X_test[column].isna().all():
590+
null_columns.append(column)
591+
592+
assert null_columns == [1]
593+
>>>>>>> Fixing issues with imbalanced datasets (#197)

test/test_data/test_validation.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ def test_data_validation_for_classification(openmlid, as_frame):
3232
x, y, test_size=0.33, random_state=0)
3333

3434
validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
35-
3635
X_train_t, y_train_t = validator.transform(X_train, y_train)
3736
assert np.shape(X_train) == np.shape(X_train_t)
3837

0 commit comments

Comments
 (0)