@@ -204,7 +204,6 @@ def test_featurevalidator_supported_types(input_data_featuretest):
204
204
assert sparse .issparse (transformed_X )
205
205
else :
206
206
assert isinstance (transformed_X , np .ndarray )
207
- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
208
207
assert np .issubdtype (transformed_X .dtype , np .number )
209
208
assert validator ._is_fitted
210
209
@@ -237,9 +236,10 @@ def test_featurevalidator_categorical_nan(input_data_featuretest):
237
236
validator .fit (input_data_featuretest )
238
237
transformed_X = validator .transform (input_data_featuretest )
239
238
assert any (pd .isna (input_data_featuretest ))
240
- assert any ((- 1 in categories ) or ('-1' in categories ) or ('Missing!' in categories ) for categories in
241
- validator .encoder .named_transformers_ ['encoder' ].categories_ )
242
- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
239
+ categories_ = validator .column_transformer .\
240
+ named_transformers_ ['categorical_pipeline' ].named_steps ['onehotencoder' ].categories_
241
+ assert any (('0' in categories ) or (0 in categories ) or ('missing_value' in categories ) for categories in
242
+ categories_ )
243
243
assert np .issubdtype (transformed_X .dtype , np .number )
244
244
assert validator ._is_fitted
245
245
assert isinstance (transformed_X , np .ndarray )
@@ -292,7 +292,6 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
292
292
else :
293
293
raise ValueError (type (input_data_featuretest ))
294
294
transformed_X = validator .transform (complementary_type )
295
- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
296
295
assert np .issubdtype (transformed_X .dtype , np .number )
297
296
assert validator ._is_fitted
298
297
@@ -436,36 +435,29 @@ def test_features_unsupported_calls_are_raised():
436
435
expected
437
436
"""
438
437
validator = TabularFeatureValidator ()
439
- with pytest .raises (ValueError , match = r"AutoPyTorch does not support time " ):
438
+ with pytest .raises (TypeError , match = r".*?Convert the time information to a numerical value " ):
440
439
validator .fit (
441
440
pd .DataFrame ({'datetime' : [pd .Timestamp ('20180310' )]})
442
441
)
442
+ validator = TabularFeatureValidator ()
443
443
with pytest .raises (ValueError , match = r"AutoPyTorch only supports.*yet, the provided input" ):
444
444
validator .fit ({'input1' : 1 , 'input2' : 2 })
445
- with pytest .raises (ValueError , match = r"has unsupported dtype string" ):
445
+ validator = TabularFeatureValidator ()
446
+ with pytest .raises (TypeError , match = r".*?but input column A has an invalid type `string`.*" ):
446
447
validator .fit (pd .DataFrame ([{'A' : 1 , 'B' : 2 }], dtype = 'string' ))
448
+ validator = TabularFeatureValidator ()
447
449
with pytest .raises (ValueError , match = r"The feature dimensionality of the train and test" ):
448
450
validator .fit (X_train = np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]),
449
451
X_test = np .array ([[1 , 2 , 3 , 4 ], [4 , 5 , 6 , 7 ]]),
450
452
)
453
+ validator = TabularFeatureValidator ()
451
454
with pytest .raises (ValueError , match = r"Cannot call transform on a validator that is not fit" ):
452
455
validator .transform (np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]))
453
456
454
457
455
458
@pytest .mark .parametrize (
456
459
'input_data_featuretest' ,
457
460
(
458
- 'numpy_numericalonly_nonan' ,
459
- 'numpy_numericalonly_nan' ,
460
- 'pandas_numericalonly_nonan' ,
461
- 'pandas_numericalonly_nan' ,
462
- 'list_numericalonly_nonan' ,
463
- 'list_numericalonly_nan' ,
464
- # Category in numpy is handled via feat_type
465
- 'numpy_categoricalonly_nonan' ,
466
- 'numpy_mixed_nonan' ,
467
- 'numpy_categoricalonly_nan' ,
468
- 'numpy_mixed_nan' ,
469
461
'sparse_bsr_nonan' ,
470
462
'sparse_bsr_nan' ,
471
463
'sparse_coo_nonan' ,
@@ -483,14 +475,14 @@ def test_features_unsupported_calls_are_raised():
483
475
),
484
476
indirect = True
485
477
)
486
- def test_no_encoder_created (input_data_featuretest ):
478
+ def test_no_column_transformer_created (input_data_featuretest ):
487
479
"""
488
480
Makes sure that for numerical only features, no column transformer is created
489
481
"""
490
482
validator = TabularFeatureValidator ()
491
483
validator .fit (input_data_featuretest )
492
484
validator .transform (input_data_featuretest )
493
- assert validator .encoder is None
485
+ assert validator .column_transformer is None
494
486
495
487
496
488
@pytest .mark .parametrize (
@@ -501,18 +493,18 @@ def test_no_encoder_created(input_data_featuretest):
501
493
),
502
494
indirect = True
503
495
)
504
- def test_encoder_created (input_data_featuretest ):
496
+ def test_column_transformer_created (input_data_featuretest ):
505
497
"""
506
- This test ensures an encoder is created if categorical data is provided
498
+ This test ensures a column transformer is created if categorical data is provided
507
499
"""
508
500
validator = TabularFeatureValidator ()
509
501
validator .fit (input_data_featuretest )
510
502
transformed_X = validator .transform (input_data_featuretest )
511
- assert validator .encoder is not None
503
+ assert validator .column_transformer is not None
512
504
513
505
# Make sure that the encoded features are actually encoded. Categorical columns are at
514
506
# the start after transformation. In our fixtures, this is also honored prior to encoding
515
- enc_columns , feature_types = validator ._get_columns_to_encode (input_data_featuretest )
507
+ cat_columns , _ , feature_types = validator ._get_columns_info (input_data_featuretest )
516
508
517
509
# At least one categorical
518
510
assert 'categorical' in validator .feat_type
@@ -521,20 +513,13 @@ def test_encoder_created(input_data_featuretest):
521
513
if np .any ([pd .api .types .is_numeric_dtype (input_data_featuretest [col ]
522
514
) for col in input_data_featuretest .columns ]):
523
515
assert 'numerical' in validator .feat_type
524
- for i , feat_type in enumerate (feature_types ):
525
- if 'numerical' in feat_type :
526
- np .testing .assert_array_equal (
527
- transformed_X [:, i ],
528
- input_data_featuretest [input_data_featuretest .columns [i ]].to_numpy ()
529
- )
530
- elif 'categorical' in feat_type :
531
- np .testing .assert_array_equal (
532
- transformed_X [:, i ],
533
- # Expect always 0, 1... because we use a ordinal encoder
534
- np .array ([0 , 1 ])
535
- )
536
- else :
537
- raise ValueError (feat_type )
516
+ # we expect this input to be the fixture 'pandas_mixed_nan'
517
+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , - 1. ], [0. , 1. , 1. ]]))
518
+ else :
519
+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , 1. , 0. ], [0. , 1. , 0. , 1. ]]))
520
+
521
+ if not all ([feat_type in ['numerical' , 'categorical' ] for feat_type in feature_types ]):
522
+ raise ValueError ("Expected only numerical and categorical feature types" )
538
523
539
524
540
525
def test_no_new_category_after_fit ():
@@ -566,13 +551,12 @@ def test_unknown_encode_value():
566
551
x ['c' ].cat .add_categories (['NA' ], inplace = True )
567
552
x .loc [0 , 'c' ] = 'NA' # unknown value
568
553
x_t = validator .transform (x )
569
- # The first row should have a -1 as we added a new categorical there
570
- expected_row = [- 1 , - 41 , - 3 , - 987.2 ]
554
+ # The first row should start with 0, 0 because we added a
555
+ # new category there, and the one-hot encoder marks an unknown
556
+ # category as all zeros in the transformed columns
557
+ expected_row = [0.0 , 0.0 , - 0.5584294383572701 , 0.5000000000000004 , - 1.5136598016833485 ]
571
558
assert expected_row == x_t [0 ].tolist ()
572
559
573
- # Notice how there is only one column 'c' to encode
574
- assert validator .categories == [list (range (2 )) for i in range (1 )]
575
-
576
560
577
561
# Actual checks for the features
578
562
@pytest .mark .parametrize (
@@ -624,19 +608,20 @@ def test_feature_validator_new_data_after_fit(
624
608
assert sparse .issparse (transformed_X )
625
609
else :
626
610
assert isinstance (transformed_X , np .ndarray )
627
- assert np .shape (X_test ) == np .shape (transformed_X )
628
611
629
612
# And then check proper error messages
630
613
if train_data_type == 'pandas' :
631
614
old_dtypes = copy .deepcopy (validator .dtypes )
632
615
validator .dtypes = ['dummy' for dtype in X_train .dtypes ]
633
- with pytest .raises (ValueError , match = r"Changing the dtype of the features after fit" ):
616
+ with pytest .raises (ValueError ,
617
+ match = r"The dtype of the features must not be changed after fit" ):
634
618
transformed_X = validator .transform (X_test )
635
619
validator .dtypes = old_dtypes
636
620
if test_data_type == 'pandas' :
637
621
columns = X_test .columns .tolist ()
638
622
X_test = X_test [reversed (columns )]
639
- with pytest .raises (ValueError , match = r"Changing the column order of the features" ):
623
+ with pytest .raises (ValueError ,
624
+ match = r"The column order of the features must not be changed after fit" ):
640
625
transformed_X = validator .transform (X_test )
641
626
642
627
0 commit comments