|
1 |
| -import copy |
| 1 | + import copy |
2 | 2 | import functools
|
3 | 3 |
|
4 | 4 | import numpy as np
|
@@ -139,9 +139,9 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
|
139 | 139 | if isinstance(input_data_featuretest, pd.DataFrame):
|
140 | 140 | pytest.skip("Column order change in pandas is not supported")
|
141 | 141 | elif isinstance(input_data_featuretest, np.ndarray):
|
142 |
| - complementary_type = pd.DataFrame(input_data_featuretest) |
| 142 | + complementary_type = validator.numpy_array_to_pandas(input_data_featuretest) |
143 | 143 | elif isinstance(input_data_featuretest, list):
|
144 |
| - complementary_type = pd.DataFrame(input_data_featuretest) |
| 144 | + complementary_type, _ = validator.list_to_dataframe(input_data_featuretest) |
145 | 145 | elif sparse.issparse(input_data_featuretest):
|
146 | 146 | complementary_type = sparse.csr_matrix(input_data_featuretest.todense())
|
147 | 147 | else:
|
@@ -331,8 +331,11 @@ def test_unknown_encode_value():
|
331 | 331 | )
|
332 | 332 | @pytest.mark.parametrize('train_data_type', ('numpy', 'pandas', 'list'))
|
333 | 333 | @pytest.mark.parametrize('test_data_type', ('numpy', 'pandas', 'list'))
|
334 |
| -def test_featurevalidator_new_data_after_fit(openml_id, |
335 |
| - train_data_type, test_data_type): |
| 334 | +def test_feature_validator_new_data_after_fit( |
| 335 | + openml_id, |
| 336 | + train_data_type, |
| 337 | + test_data_type, |
| 338 | +): |
336 | 339 |
|
337 | 340 | # List is currently not supported as infer_objects
|
338 | 341 | # cast list objects to type objects
|
@@ -406,3 +409,109 @@ def test_comparator():
|
406 | 409 | key=functools.cmp_to_key(validator._comparator)
|
407 | 410 | )
|
408 | 411 | assert ans == feat_type
|
# Actual checks for the features
@pytest.mark.parametrize(
    'input_data_featuretest',
    (
        'numpy_numericalonly_nonan',
        'numpy_numericalonly_nan',
        'numpy_mixed_nan',
        'pandas_numericalonly_nan',
        'sparse_bsr_nonan',
        'sparse_bsr_nan',
        'sparse_coo_nonan',
        'sparse_coo_nan',
        'sparse_csc_nonan',
        'sparse_csc_nan',
        'sparse_csr_nonan',
        'sparse_csr_nan',
        'sparse_dia_nonan',
        'sparse_dia_nan',
        'sparse_dok_nonan',
        'sparse_dok_nan',
        'openml_40981',  # Australian
    ),
    indirect=True
)
def test_featurevalidator_reduce_precision(input_data_featuretest):
    """Precision reduction must shrink the memory footprint of both splits
    and leave train/test transforms with matching (reduced) dtypes."""
    # Hold out 10% so transform() is also exercised on data not seen at fit time.
    train_split, test_split = sklearn.model_selection.train_test_split(
        input_data_featuretest, test_size=0.1, random_state=1)
    compression_args = {'memory_allocation': 0, 'methods': ['precision']}
    validator = TabularFeatureValidator(dataset_compression=compression_args)
    validator.fit(X_train=train_split)

    reduced_train = validator.transform(train_split.copy())
    # memory_allocation=0 forces the validator to pick a reduced dtype.
    assert validator._reduced_dtype is not None
    assert megabytes(reduced_train) < megabytes(train_split)

    reduced_test = validator.transform(test_split.copy())
    assert megabytes(reduced_test) < megabytes(test_split)

    if hasattr(reduced_train, 'iloc'):
        # DataFrame output: all columns across both splits share one dtype.
        # NOTE(review): this branch compares against `_precision` while the
        # array branch uses `_reduced_dtype` — presumably intentional, but
        # worth confirming against TabularFeatureValidator.
        assert all(reduced_train.dtypes == reduced_test.dtypes)
        assert all(reduced_train.dtypes == validator._precision)
    else:
        # ndarray / sparse output: a single dtype attribute to compare.
        assert reduced_train.dtype == reduced_test.dtype
        assert reduced_test.dtype == validator._reduced_dtype
def test_feature_validator_imbalanced_data():
    """Columns that are entirely NaN at fit time are typed as numerical, and
    transform() keeps fully-null columns null (raising if dtypes diverge)."""

    # Null columns in the train split but not necessarily in the test split
    X_train = pd.DataFrame.from_dict({
        'A': [np.NaN, np.NaN, np.NaN],
        'B': [1, 2, 3],
        'C': [np.NaN, np.NaN, np.NaN],
        'D': [np.NaN, np.NaN, np.NaN],
    })
    X_test = pd.DataFrame.from_dict({
        'A': [3, 4, 5],
        'B': [6, 5, 7],
        'C': [np.NaN, np.NaN, np.NaN],
        'D': ['Blue', np.NaN, np.NaN],
    })
    validator = TabularFeatureValidator()
    validator.fit(X_train)

    train_feature_types = copy.deepcopy(validator.feat_type)
    assert train_feature_types == ['numerical'] * 4
    # validator will throw an error if the column types are not the same
    transformed = pd.DataFrame(validator.transform(X_test))
    null_columns = [
        column for column in transformed.columns
        if transformed[column].isna().all()
    ]
    assert null_columns == [0, 2, 3]

    # Columns with not all null values in the train split and
    # completely null on the test split.
    X_train = pd.DataFrame.from_dict({
        'A': [np.NaN, np.NaN, 4],
        'B': [1, 2, 3],
        'C': ['Blue', np.NaN, np.NaN],
    })
    X_test = pd.DataFrame.from_dict({
        'A': [np.NaN, np.NaN, np.NaN],
        'B': [6, 5, 7],
        'C': [np.NaN, np.NaN, np.NaN],
    })
    validator = TabularFeatureValidator()
    validator.fit(X_train)

    train_feature_types = copy.deepcopy(validator.feat_type)
    assert train_feature_types == ['categorical', 'numerical', 'numerical']

    transformed = pd.DataFrame(validator.transform(X_test))
    null_columns = [
        column for column in transformed.columns
        if transformed[column].isna().all()
    ]
    assert null_columns == [1]
0 commit comments