"""Functions to validate input and parameters within scikit-learn estimators."""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numbers
import operator
import sys
import warnings
from collections.abc import Sequence
from contextlib import suppress
from functools import reduce, wraps
from inspect import Parameter, isclass, signature
import joblib
import numpy as np
import scipy.sparse as sp
from .. import get_config as _get_config
from ..exceptions import DataConversionWarning, NotFittedError, PositiveSpectrumWarning
from ..utils._array_api import _asarray_with_order, _is_numpy_namespace, get_namespace
from ..utils.deprecation import _deprecate_force_all_finite
from ..utils.fixes import ComplexWarning, _preserve_dia_indices_dtype
from ._isfinite import FiniteStatus, cy_isfinite
from ._tags import get_tags
from .fixes import _object_dtype_isnan
FLOAT_DTYPES = (np.float64, np.float32, np.float16)
# This function is not used anywhere in the code base at the moment, but we
# keep it in case a new public function without keyword-only arguments is
# merged by mistake, which would require a deprecation cycle to fix.
def _deprecate_positional_args(func=None, *, version="1.3"):
"""Decorator for methods that issues warnings for positional arguments.
Using the keyword-only argument syntax in pep 3102, arguments after the
* will issue a warning when passed as a positional argument.
Parameters
----------
func : callable, default=None
Function to check arguments on.
version : callable, default="1.3"
The version when positional arguments will result in error.
"""
def _inner_deprecate_positional_args(f):
sig = signature(f)
kwonly_args = []
all_args = []
for name, param in sig.parameters.items():
if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
all_args.append(name)
elif param.kind == Parameter.KEYWORD_ONLY:
kwonly_args.append(name)
@wraps(f)
def inner_f(*args, **kwargs):
extra_args = len(args) - len(all_args)
if extra_args <= 0:
return f(*args, **kwargs)
# extra_args > 0
args_msg = [
"{}={}".format(name, arg)
for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:])
]
args_msg = ", ".join(args_msg)
warnings.warn(
(
f"Pass {args_msg} as keyword args. From version "
f"{version} passing these as positional arguments "
"will result in an error"
),
FutureWarning,
)
kwargs.update(zip(sig.parameters, args))
return f(**kwargs)
return inner_f
if func is not None:
return _inner_deprecate_positional_args(func)
return _inner_deprecate_positional_args
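

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module): it shows the FutureWarning emitted when a keyword-only argument is
# passed positionally. `_demo_deprecate_positional_args` is a hypothetical
# helper; it is defined but never called at import time.
def _demo_deprecate_positional_args():
    @_deprecate_positional_args(version="1.3")
    def f(a, *, b=1):
        return a + b

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        result = f(1, 2)  # `b` passed positionally -> FutureWarning
    assert result == 3
    assert any(issubclass(w.category, FutureWarning) for w in caught)
    assert f(1, b=2) == 3  # keyword use emits no warning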
def _assert_all_finite(
X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name=""
):
"""Like assert_all_finite, but only for ndarray."""
xp, is_array_api = get_namespace(X)
if _get_config()["assume_finite"]:
return
X = xp.asarray(X)
# for object dtype data, we only check for NaNs (GH-13254)
if not is_array_api and X.dtype == np.dtype("object") and not allow_nan:
if _object_dtype_isnan(X).any():
raise ValueError("Input contains NaN")
# We need only consider float arrays, hence can early return for all else.
if not xp.isdtype(X.dtype, ("real floating", "complex floating")):
return
# First try an O(n) time, O(1) space solution for the common case that
# everything is finite; fall back to O(n) space `np.isinf/isnan` or custom
# Cython implementation to prevent false positives and provide a detailed
# error message.
with np.errstate(over="ignore"):
first_pass_isfinite = xp.isfinite(xp.sum(X))
if first_pass_isfinite:
return
_assert_all_finite_element_wise(
X,
xp=xp,
allow_nan=allow_nan,
msg_dtype=msg_dtype,
estimator_name=estimator_name,
input_name=input_name,
)
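

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module) of the two-pass strategy used above: summing the array and checking
# that the result is finite detects any NaN/inf with O(1) extra memory; only
# on failure does the element-wise O(n)-memory check run. Hypothetical helper,
# never called.
def _demo_first_pass_isfinite():
    clean = np.ones(10)
    dirty = np.array([1.0, np.nan, 3.0])
    with np.errstate(over="ignore"):
        assert np.isfinite(np.sum(clean))  # fast path succeeds, no more work
        assert not np.isfinite(np.sum(dirty))  # would trigger element-wise pass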
def _assert_all_finite_element_wise(
X, *, xp, allow_nan, msg_dtype=None, estimator_name=None, input_name=""
):
# Cython implementation doesn't support FP16 or complex numbers
use_cython = (
xp is np and X.data.contiguous and X.dtype.type in {np.float32, np.float64}
)
if use_cython:
out = cy_isfinite(X.reshape(-1), allow_nan=allow_nan)
has_nan_error = False if allow_nan else out == FiniteStatus.has_nan
has_inf = out == FiniteStatus.has_infinite
else:
has_inf = xp.any(xp.isinf(X))
has_nan_error = False if allow_nan else xp.any(xp.isnan(X))
if has_inf or has_nan_error:
if has_nan_error:
type_err = "NaN"
else:
msg_dtype = msg_dtype if msg_dtype is not None else X.dtype
type_err = f"infinity or a value too large for {msg_dtype!r}"
padded_input_name = input_name + " " if input_name else ""
msg_err = f"Input {padded_input_name}contains {type_err}."
if estimator_name and input_name == "X" and has_nan_error:
# Improve the error message on how to handle missing values in
# scikit-learn.
msg_err += (
f"\n{estimator_name} does not accept missing values"
" encoded as NaN natively. For supervised learning, you might want"
" to consider sklearn.ensemble.HistGradientBoostingClassifier and"
" Regressor which accept missing values encoded as NaNs natively."
" Alternatively, it is possible to preprocess the data, for"
" instance by using an imputer transformer in a pipeline or drop"
" samples with missing values. See"
" https://scikit-learn.org/stable/modules/impute.html"
" You can find a list of all estimators that handle NaN values"
" at the following page:"
" https://scikit-learn.org/stable/modules/impute.html"
"#estimators-that-handle-nan-values"
)
raise ValueError(msg_err)
def assert_all_finite(
X,
*,
allow_nan=False,
estimator_name=None,
input_name="",
):
"""Throw a ValueError if X contains NaN or infinity.
Parameters
----------
X : {ndarray, sparse matrix}
The input data.
allow_nan : bool, default=False
If True, do not throw error when `X` contains NaN.
estimator_name : str, default=None
The estimator name, used to construct the error message.
input_name : str, default=""
The data name used to construct the error message. In particular
if `input_name` is "X" and the data has NaN values and
allow_nan is False, the error message will link to the imputer
documentation.
Examples
--------
>>> from sklearn.utils import assert_all_finite
>>> import numpy as np
>>> array = np.array([1, np.inf, np.nan, 4])
>>> try:
... assert_all_finite(array)
... print("Test passed: Array contains only finite values.")
... except ValueError:
... print("Test failed: Array contains non-finite values.")
Test failed: Array contains non-finite values.
"""
_assert_all_finite(
X.data if sp.issparse(X) else X,
allow_nan=allow_nan,
estimator_name=estimator_name,
input_name=input_name,
)
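

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module): for sparse input, `assert_all_finite` inspects only the stored
# `.data` buffer, so implicit zeros are never checked. Hypothetical helper,
# never called.
def _demo_assert_all_finite_sparse():
    X = sp.csr_matrix(np.array([[0.0, 1.0], [np.inf, 0.0]]))
    try:
        assert_all_finite(X)
    except ValueError:
        pass  # the stored inf in `X.data` is detected
    assert_all_finite(sp.csr_matrix(np.eye(2)))  # all stored values finite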
def as_float_array(
X, *, copy=True, force_all_finite="deprecated", ensure_all_finite=None
):
"""Convert an array-like to an array of floats.
The new dtype will be np.float32 or np.float64, depending on the original
    type. The function can create a copy or modify the argument depending
    on the ``copy`` argument.
Parameters
----------
X : {array-like, sparse matrix}
The input data.
copy : bool, default=True
If True, a copy of X will be created. If False, a copy may still be
returned if X's dtype is not a floating point type.
force_all_finite : bool or 'allow-nan', default=True
Whether to raise an error on np.inf, np.nan, pd.NA in X. The
possibilities are:
- True: Force all values of X to be finite.
- False: accepts np.inf, np.nan, pd.NA in X.
- 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
be infinite.
.. versionadded:: 0.20
``force_all_finite`` accepts the string ``'allow-nan'``.
.. versionchanged:: 0.23
Accepts `pd.NA` and converts it into `np.nan`
.. deprecated:: 1.6
`force_all_finite` was renamed to `ensure_all_finite` and will be removed
in 1.8.
ensure_all_finite : bool or 'allow-nan', default=True
Whether to raise an error on np.inf, np.nan, pd.NA in X. The
possibilities are:
- True: Force all values of X to be finite.
- False: accepts np.inf, np.nan, pd.NA in X.
- 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
be infinite.
.. versionadded:: 1.6
`force_all_finite` was renamed to `ensure_all_finite`.
Returns
-------
XT : {ndarray, sparse matrix}
An array of type float.
Examples
--------
>>> from sklearn.utils import as_float_array
>>> import numpy as np
>>> array = np.array([0, 0, 1, 2, 2], dtype=np.int64)
>>> as_float_array(array)
array([0., 0., 1., 2., 2.])
"""
ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite)
if isinstance(X, np.matrix) or (
not isinstance(X, np.ndarray) and not sp.issparse(X)
):
return check_array(
X,
accept_sparse=["csr", "csc", "coo"],
dtype=np.float64,
copy=copy,
ensure_all_finite=ensure_all_finite,
ensure_2d=False,
)
elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:
return X.copy() if copy else X
elif X.dtype in [np.float32, np.float64]: # is numpy array
return X.copy("F" if X.flags["F_CONTIGUOUS"] else "C") if copy else X
else:
if X.dtype.kind in "uib" and X.dtype.itemsize <= 4:
return_dtype = np.float32
else:
return_dtype = np.float64
return X.astype(return_dtype)
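

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module) of the dtype selection in the fallthrough branch above: narrow
# unsigned/signed/boolean dtypes (itemsize <= 4 bytes) map to float32, wider
# ones to float64. Hypothetical helper, never called.
def _demo_as_float_array_dtypes():
    assert as_float_array(np.arange(3, dtype=np.int32)).dtype == np.float32
    assert as_float_array(np.arange(3, dtype=np.int64)).dtype == np.float64
    assert as_float_array(np.ones(3, dtype=np.float32)).dtype == np.float32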
def _is_arraylike(x):
"""Returns whether the input is array-like."""
if sp.issparse(x):
return False
return hasattr(x, "__len__") or hasattr(x, "shape") or hasattr(x, "__array__")
def _is_arraylike_not_scalar(array):
"""Return True if array is array-like and not a scalar"""
return _is_arraylike(array) and not np.isscalar(array)
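

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module) of the two helpers above. Hypothetical helper, never called.
def _demo_is_arraylike():
    assert _is_arraylike([1, 2]) and _is_arraylike(np.zeros(2))
    assert not _is_arraylike(3.5)  # no __len__, shape, or __array__
    assert not _is_arraylike_not_scalar(3.5)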
def _use_interchange_protocol(X):
"""Use interchange protocol for non-pandas dataframes that follow the protocol.
    Note: at this point we chose not to use the interchange API on pandas
    dataframes to ensure strict behavioral backward compatibility with older
    versions of scikit-learn.
"""
return not _is_pandas_df(X) and hasattr(X, "__dataframe__")
def _num_features(X):
"""Return the number of features in an array-like X.
    This helper function tries hard to avoid materializing an array version
    of X unless necessary. For instance, if X is a list of lists, this
    function will return the length of the first element, assuming that
    subsequent elements are all lists of the same length, without checking.
Parameters
----------
X : array-like
array-like to get the number of features.
Returns
-------
features : int
Number of features
"""
type_ = type(X)
if type_.__module__ == "builtins":
type_name = type_.__qualname__
else:
type_name = f"{type_.__module__}.{type_.__qualname__}"
message = f"Unable to find the number of features from X of type {type_name}"
if not hasattr(X, "__len__") and not hasattr(X, "shape"):
if not hasattr(X, "__array__"):
raise TypeError(message)
# Only convert X to a numpy array if there is no cheaper, heuristic
# option.
X = np.asarray(X)
if hasattr(X, "shape"):
if not hasattr(X.shape, "__len__") or len(X.shape) <= 1:
message += f" with shape {X.shape}"
raise TypeError(message)
return X.shape[1]
first_sample = X[0]
# Do not consider an array-like of strings or dicts to be a 2D array
if isinstance(first_sample, (str, bytes, dict)):
message += f" where the samples are of type {type(first_sample).__qualname__}"
raise TypeError(message)
try:
# If X is a list of lists, for instance, we assume that all nested
# lists have the same length without checking or converting to
# a numpy array to keep this function call as cheap as possible.
return len(first_sample)
except Exception as err:
raise TypeError(message) from err
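

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module): `_num_features` trusts the first sample of a list of lists instead
# of materializing an array, and rejects 1D inputs and string samples.
# Hypothetical helper, never called.
def _demo_num_features():
    assert _num_features([[1, 2, 3], [4, 5, 6]]) == 3  # len of first sample
    assert _num_features(np.zeros((5, 4))) == 4  # shape[1]
    try:
        _num_features(["a", "b"])  # string samples are not features
    except TypeError:
        pass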
def _num_samples(x):
"""Return number of samples in array-like x."""
message = "Expected sequence or array-like, got %s" % type(x)
if hasattr(x, "fit") and callable(x.fit):
        # Don't get num_samples from an ensemble's length!
raise TypeError(message)
if _use_interchange_protocol(x):
return x.__dataframe__().num_rows()
if not hasattr(x, "__len__") and not hasattr(x, "shape"):
if hasattr(x, "__array__"):
x = np.asarray(x)
else:
raise TypeError(message)
if hasattr(x, "shape") and x.shape is not None:
if len(x.shape) == 0:
raise TypeError(
"Input should have at least 1 dimension i.e. satisfy "
f"`len(x.shape) > 0`, got scalar `{x!r}` instead."
)
# Check that shape is returning an integer or default to len
# Dask dataframes may not return numeric shape[0] value
if isinstance(x.shape[0], numbers.Integral):
return x.shape[0]
try:
return len(x)
except TypeError as type_error:
raise TypeError(message) from type_error
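

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module) of `_num_samples` on common inputs. Hypothetical helper, never
# called.
def _demo_num_samples():
    assert _num_samples([1, 2, 3]) == 3  # falls back to len()
    assert _num_samples(np.zeros((7, 2))) == 7  # uses shape[0]
    try:
        _num_samples(np.float64(1.0))  # 0-dimensional input
    except TypeError:
        pass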
def check_memory(memory):
"""Check that ``memory`` is joblib.Memory-like.
joblib.Memory-like means that ``memory`` can be converted into a
joblib.Memory instance (typically a str denoting the ``location``)
or has the same interface (has a ``cache`` method).
Parameters
----------
memory : None, str or object with the joblib.Memory interface
- If string, the location where to create the `joblib.Memory` interface.
- If None, no caching is done and the Memory object is completely transparent.
Returns
-------
memory : object with the joblib.Memory interface
A correct joblib.Memory object.
Raises
------
ValueError
If ``memory`` is not joblib.Memory-like.
Examples
--------
>>> from sklearn.utils.validation import check_memory
>>> check_memory("caching_dir")
Memory(location=caching_dir/joblib)
"""
if memory is None or isinstance(memory, str):
memory = joblib.Memory(location=memory, verbose=0)
elif not hasattr(memory, "cache"):
raise ValueError(
"'memory' should be None, a string or have the same"
" interface as joblib.Memory."
" Got memory='{}' instead.".format(memory)
)
return memory
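

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module): any object exposing a `cache` method is accepted by `check_memory`;
# everything else must be None or a string. `DummyMemory` is a hypothetical
# stand-in; the helper is never called.
def _demo_check_memory():
    class DummyMemory:
        def cache(self, func):
            return func  # no-op caching, enough to satisfy the interface

    assert check_memory(DummyMemory()).cache(len) is len
    try:
        check_memory(42)  # neither None, str, nor Memory-like
    except ValueError:
        pass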
def check_consistent_length(*arrays):
"""Check that all arrays have consistent first dimensions.
Checks whether all objects in arrays have the same shape or length.
Parameters
----------
*arrays : list or tuple of input objects.
Objects that will be checked for consistent length.
Examples
--------
>>> from sklearn.utils.validation import check_consistent_length
>>> a = [1, 2, 3]
>>> b = [2, 3, 4]
>>> check_consistent_length(a, b)
"""
lengths = [_num_samples(X) for X in arrays if X is not None]
if len(set(lengths)) > 1:
raise ValueError(
"Found input variables with inconsistent numbers of samples: %r"
% [int(l) for l in lengths]
)
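

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module): `check_consistent_length` skips None and compares first dimensions.
# Hypothetical helper, never called.
def _demo_check_consistent_length():
    check_consistent_length([1, 2, 3], np.zeros(3), None)  # passes silently
    try:
        check_consistent_length([1, 2, 3], [1, 2])
    except ValueError:
        pass  # inconsistent numbers of samples: [3, 2]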
def _make_indexable(iterable):
"""Ensure iterable supports indexing or convert to an indexable variant.
    Convert sparse matrices to csr and other non-indexable iterables to arrays.
Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.
Parameters
----------
iterable : {list, dataframe, ndarray, sparse matrix} or None
Object to be converted to an indexable iterable.
"""
if sp.issparse(iterable):
return iterable.tocsr()
elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
return iterable
elif iterable is None:
return iterable
return np.array(iterable)
def indexable(*iterables):
"""Make arrays indexable for cross-validation.
    Checks consistent length, passes through None, and ensures that everything
    can be indexed by converting sparse matrices to csr and converting
    non-indexable iterables to arrays.
Parameters
----------
*iterables : {lists, dataframes, ndarrays, sparse matrices}
List of objects to ensure sliceability.
Returns
-------
result : list of {ndarray, sparse matrix, dataframe} or None
Returns a list containing indexable arrays (i.e. NumPy array,
sparse matrix, or dataframe) or `None`.
Examples
--------
>>> from sklearn.utils import indexable
>>> from scipy.sparse import csr_matrix
>>> import numpy as np
>>> iterables = [
... [1, 2, 3], np.array([2, 3, 4]), None, csr_matrix([[5], [6], [7]])
... ]
>>> indexable(*iterables)
[[1, 2, 3], array([2, 3, 4]), None, <...Sparse...dtype 'int64'...shape (3, 1)>]
"""
result = [_make_indexable(X) for X in iterables]
check_consistent_length(*result)
return result
def _ensure_sparse_format(
sparse_container,
accept_sparse,
dtype,
copy,
ensure_all_finite,
accept_large_sparse,
estimator_name=None,
input_name="",
):
"""Convert a sparse container to a given format.
Checks the sparse format of `sparse_container` and converts if necessary.
Parameters
----------
sparse_container : sparse matrix or array
Input to validate and convert.
accept_sparse : str, bool or list/tuple of str
String[s] representing allowed sparse matrix formats ('csc',
'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
not in the allowed format, it will be converted to the first listed
format. True allows the input to be any format. False means
that a sparse matrix input will raise an error.
dtype : str, type or None
Data type of result. If None, the dtype of the input is preserved.
copy : bool
Whether a forced copy will be triggered. If copy=False, a copy might
be triggered by a conversion.
ensure_all_finite : bool or 'allow-nan'
Whether to raise an error on np.inf, np.nan, pd.NA in X. The
possibilities are:
- True: Force all values of X to be finite.
- False: accepts np.inf, np.nan, pd.NA in X.
- 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
be infinite.
.. versionadded:: 0.20
``ensure_all_finite`` accepts the string ``'allow-nan'``.
.. versionchanged:: 0.23
Accepts `pd.NA` and converts it into `np.nan`
estimator_name : str, default=None
The estimator name, used to construct the error message.
input_name : str, default=""
The data name used to construct the error message. In particular
if `input_name` is "X" and the data has NaN values and
allow_nan is False, the error message will link to the imputer
documentation.
Returns
-------
sparse_container_converted : sparse matrix or array
Sparse container (matrix/array) that is ensured to have an allowed type.
"""
if dtype is None:
dtype = sparse_container.dtype
changed_format = False
sparse_container_type_name = type(sparse_container).__name__
if isinstance(accept_sparse, str):
accept_sparse = [accept_sparse]
# Indices dtype validation
_check_large_sparse(sparse_container, accept_large_sparse)
if accept_sparse is False:
padded_input = " for " + input_name if input_name else ""
raise TypeError(
f"Sparse data was passed{padded_input}, but dense data is required. "
"Use '.toarray()' to convert to a dense numpy array."
)
elif isinstance(accept_sparse, (list, tuple)):
if len(accept_sparse) == 0:
raise ValueError(
"When providing 'accept_sparse' as a tuple or list, it must contain at "
"least one string value."
)
# ensure correct sparse format
if sparse_container.format not in accept_sparse:
# create new with correct sparse
sparse_container = sparse_container.asformat(accept_sparse[0])
changed_format = True
elif accept_sparse is not True:
# any other type
raise ValueError(
"Parameter 'accept_sparse' should be a string, boolean or list of strings."
f" You provided 'accept_sparse={accept_sparse}'."
)
if dtype != sparse_container.dtype:
# convert dtype
sparse_container = sparse_container.astype(dtype)
elif copy and not changed_format:
# force copy
sparse_container = sparse_container.copy()
if ensure_all_finite:
if not hasattr(sparse_container, "data"):
warnings.warn(
f"Can't check {sparse_container.format} sparse matrix for nan or inf.",
stacklevel=2,
)
else:
_assert_all_finite(
sparse_container.data,
allow_nan=ensure_all_finite == "allow-nan",
estimator_name=estimator_name,
input_name=input_name,
)
# TODO: Remove when the minimum version of SciPy supported is 1.12
# With SciPy sparse arrays, conversion from DIA format to COO, CSR, or BSR
# triggers the use of `np.int64` indices even if the data is such that it could
# be more efficiently represented with `np.int32` indices.
    # https://github.com/scipy/scipy/issues/19245
    # Since not all scikit-learn algorithms support large indices, the
    # following code downcasts to `np.int32` indices when it's safe to do so.
if changed_format:
# accept_sparse is specified to a specific format and a conversion occurred
requested_sparse_format = accept_sparse[0]
_preserve_dia_indices_dtype(
sparse_container, sparse_container_type_name, requested_sparse_format
)
return sparse_container
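

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module): a sparse container whose format is not in `accept_sparse` is
# converted to the first accepted format. Hypothetical helper, never called.
def _demo_ensure_sparse_format():
    X = sp.coo_matrix(np.eye(3))
    X_csr = _ensure_sparse_format(
        X,
        accept_sparse=["csr", "csc"],
        dtype=np.float64,
        copy=False,
        ensure_all_finite=True,
        accept_large_sparse=True,
    )
    assert X_csr.format == "csr"  # 'coo' was not accepted, converted to 'csr'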
def _ensure_no_complex_data(array):
if (
hasattr(array, "dtype")
and array.dtype is not None
and hasattr(array.dtype, "kind")
and array.dtype.kind == "c"
):
raise ValueError("Complex data not supported\n{}\n".format(array))
def _check_estimator_name(estimator):
if estimator is not None:
if isinstance(estimator, str):
return estimator
else:
return estimator.__class__.__name__
return None
def _pandas_dtype_needs_early_conversion(pd_dtype):
"""Return True if pandas extension pd_dtype need to be converted early."""
# Check these early for pandas versions without extension dtypes
from pandas import SparseDtype
from pandas.api.types import (
is_bool_dtype,
is_float_dtype,
is_integer_dtype,
)
if is_bool_dtype(pd_dtype):
# bool and extension booleans need early conversion because __array__
# converts mixed dtype dataframes into object dtypes
return True
if isinstance(pd_dtype, SparseDtype):
# Sparse arrays will be converted later in `check_array`
return False
try:
from pandas.api.types import is_extension_array_dtype
except ImportError:
return False
if isinstance(pd_dtype, SparseDtype) or not is_extension_array_dtype(pd_dtype):
# Sparse arrays will be converted later in `check_array`
# Only handle extension arrays for integer and floats
return False
elif is_float_dtype(pd_dtype):
# Float ndarrays can normally support nans. They need to be converted
# first to map pd.NA to np.nan
return True
elif is_integer_dtype(pd_dtype):
# XXX: Warn when converting from a high integer to a float
return True
return False
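

# A minimal illustrative sketch (editorial addition, not part of the upstream
# module), assuming pandas is installed: boolean and nullable integer
# extension dtypes need early conversion, plain numpy dtypes do not.
# Hypothetical helper, never called.
def _demo_pandas_early_conversion():
    import pandas as pd

    assert _pandas_dtype_needs_early_conversion(pd.BooleanDtype())
    assert _pandas_dtype_needs_early_conversion(pd.Int64Dtype())
    assert not _pandas_dtype_needs_early_conversion(np.dtype("float64"))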
def _is_extension_array_dtype(array):
# Pandas extension arrays have a dtype with an na_value
return hasattr(array, "dtype") and hasattr(array.dtype, "na_value")
def check_array(
array,
accept_sparse=False,
*,
accept_large_sparse=True,
dtype="numeric",
order=None,
copy=False,
force_writeable=False,
force_all_finite="deprecated",
ensure_all_finite=None,
ensure_non_negative=False,
ensure_2d=True,
allow_nd=False,
ensure_min_samples=1,
ensure_min_features=1,
estimator=None,
input_name="",
):
"""Input validation on an array, list, sparse matrix or similar.
By default, the input is checked to be a non-empty 2D array containing
only finite values. If the dtype of the array is object, attempt
converting to float, raising on failure.
Parameters
----------
array : object
Input object to check / convert.
accept_sparse : str, bool or list/tuple of str, default=False
String[s] representing allowed sparse matrix formats, such as 'csc',
'csr', etc. If the input is sparse but not in the allowed format,
it will be converted to the first listed format. True allows the input
to be any format. False means that a sparse matrix input will
raise an error.
accept_large_sparse : bool, default=True
If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
accept_sparse, accept_large_sparse=False will cause it to be accepted
only if its indices are stored with a 32-bit dtype.
.. versionadded:: 0.20
dtype : 'numeric', type, list of type or None, default='numeric'
Data type of result. If None, the dtype of the input is preserved.
If "numeric", dtype is preserved unless array.dtype is object.
If dtype is a list of types, conversion on the first type is only
performed if the dtype of the input is not in the list.
order : {'F', 'C'} or None, default=None
Whether an array will be forced to be fortran or c-style.
When order is None (default), then if copy=False, nothing is ensured
about the memory layout of the output array; otherwise (copy=True)
the memory layout of the returned array is kept as close as possible
to the original array.
copy : bool, default=False
Whether a forced copy will be triggered. If copy=False, a copy might
be triggered by a conversion.
force_writeable : bool, default=False
Whether to force the output array to be writeable. If True, the returned array
is guaranteed to be writeable, which may require a copy. Otherwise the
writeability of the input array is preserved.
.. versionadded:: 1.6
force_all_finite : bool or 'allow-nan', default=True
Whether to raise an error on np.inf, np.nan, pd.NA in array. The
possibilities are:
- True: Force all values of array to be finite.
- False: accepts np.inf, np.nan, pd.NA in array.
- 'allow-nan': accepts only np.nan and pd.NA values in array. Values
cannot be infinite.
.. versionadded:: 0.20
``force_all_finite`` accepts the string ``'allow-nan'``.
.. versionchanged:: 0.23
Accepts `pd.NA` and converts it into `np.nan`
.. deprecated:: 1.6
`force_all_finite` was renamed to `ensure_all_finite` and will be removed
in 1.8.
ensure_all_finite : bool or 'allow-nan', default=True
Whether to raise an error on np.inf, np.nan, pd.NA in array. The
possibilities are:
- True: Force all values of array to be finite.
- False: accepts np.inf, np.nan, pd.NA in array.
- 'allow-nan': accepts only np.nan and pd.NA values in array. Values
cannot be infinite.
.. versionadded:: 1.6
`force_all_finite` was renamed to `ensure_all_finite`.
ensure_non_negative : bool, default=False
Make sure the array has only non-negative values. If True, an array that
contains negative values will raise a ValueError.
.. versionadded:: 1.6
ensure_2d : bool, default=True
Whether to raise a value error if array is not 2D.
allow_nd : bool, default=False
Whether to allow array.ndim > 2.
ensure_min_samples : int, default=1
Make sure that the array has a minimum number of samples in its first
axis (rows for a 2D array). Setting to 0 disables this check.
ensure_min_features : int, default=1
Make sure that the 2D array has some minimum number of features
(columns). The default value of 1 rejects empty datasets.
This check is only enforced when the input data has effectively 2
dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
disables this check.
estimator : str or estimator instance, default=None
If passed, include the name of the estimator in warning messages.
input_name : str, default=""
The data name used to construct the error message. In particular
if `input_name` is "X" and the data has NaN values and
allow_nan is False, the error message will link to the imputer
documentation.
.. versionadded:: 1.1.0
Returns
-------
array_converted : object
The converted and validated array.
Examples
--------
>>> from sklearn.utils.validation import check_array
>>> X = [[1, 2, 3], [4, 5, 6]]
>>> X_checked = check_array(X)
>>> X_checked
array([[1, 2, 3], [4, 5, 6]])
"""
ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite)
if isinstance(array, np.matrix):
raise TypeError(
"np.matrix is not supported. Please convert to a numpy array with "
"np.asarray. For more information see: "
"https://numpy.org/doc/stable/reference/generated/numpy.matrix.html"
)
xp, is_array_api_compliant = get_namespace(array)
# store reference to original array to check if copy is needed when
# function returns
array_orig = array
# store whether originally we wanted numeric dtype
dtype_numeric = isinstance(dtype, str) and dtype == "numeric"
dtype_orig = getattr(array, "dtype", None)
if not is_array_api_compliant and not hasattr(dtype_orig, "kind"):
# not a data type (e.g. a column named dtype in a pandas DataFrame)
dtype_orig = None
# check if the object contains several dtypes (typically a pandas
# DataFrame), and store them. If not, store None.
dtypes_orig = None
pandas_requires_conversion = False
# track if we have a Series-like object to raise a better error message
type_if_series = None
if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
# throw warning if columns are sparse. If all columns are sparse, then
# array.sparse exists and sparsity will be preserved (later).
with suppress(ImportError):
from pandas import SparseDtype
def is_sparse(dtype):
return isinstance(dtype, SparseDtype)
if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
warnings.warn(
"pandas.DataFrame with sparse columns found."
"It will be converted to a dense numpy array."
)
dtypes_orig = list(array.dtypes)
pandas_requires_conversion = any(
_pandas_dtype_needs_early_conversion(i) for i in dtypes_orig
)
if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
dtype_orig = np.result_type(*dtypes_orig)
elif pandas_requires_conversion and any(d == object for d in dtypes_orig):
# Force object if any of the dtypes is an object
dtype_orig = object
elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr(
array, "dtype"
):
# array is a pandas series
type_if_series = type(array)
pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
if isinstance(array.dtype, np.dtype):
dtype_orig = array.dtype
else:
# Set to None to let array.astype work out the best dtype
dtype_orig = None
if dtype_numeric:
if (
dtype_orig is not None
and hasattr(dtype_orig, "kind")
and dtype_orig.kind == "O"
):
# if input is object, convert to float.
dtype = xp.float64
else:
dtype = None
if isinstance(dtype, (list, tuple)):
if dtype_orig is not None and dtype_orig in dtype:
# no dtype conversion required
dtype = None
else:
# dtype conversion required. Let's select the first element of the
# list of accepted types.
dtype = dtype[0]
if pandas_requires_conversion:
# pandas dataframe requires conversion earlier to handle extension dtypes with
# nans
# Use the original dtype for conversion if dtype is None
new_dtype = dtype_orig if dtype is None else dtype
array = array.astype(new_dtype)
# Since we converted here, we do not need to convert again later
dtype = None
if ensure_all_finite not in (True, False, "allow-nan"):
raise ValueError(
"ensure_all_finite should be a bool or 'allow-nan'. Got "
f"{ensure_all_finite!r} instead."
)
if dtype is not None and _is_numpy_namespace(xp):
        # convert to a dtype object to conform to the Array API, so that
        # `xp.isdtype` can be used later
dtype = np.dtype(dtype)
estimator_name = _check_estimator_name(estimator)
context = " by %s" % estimator_name if estimator is not None else ""
# When all dataframe columns are sparse, convert to a sparse array
if hasattr(array, "sparse") and array.ndim > 1:
with suppress(ImportError):
from pandas import SparseDtype # noqa: F811
def is_sparse(dtype):
return isinstance(dtype, SparseDtype)
if array.dtypes.apply(is_sparse).all():
# DataFrame.sparse only supports `to_coo`
array = array.sparse.to_coo()
if array.dtype == np.dtype("object"):
unique_dtypes = set([dt.subtype.name for dt in array_orig.dtypes])