Skip to content

Commit 5eb8d47

Browse files
eddiebergman and mfeurer committed
Clearup warnings (#1238)
* np.bool deprecation * Invalid escape sequence \_ * Series specify dtype * drop na requires keyword args deprecation * unspecified np.int size deprecated, use int instead * deprecated unspecified np.int precision * Element wise comparison failed, will raise error in the future * Specify explicit dtype for empty series * metric warnings for mismatch between y_pred and y_true label count * Quantile transformer n_quantiles larger than n_samples warning ignored * Silenced convergence warnings * pass sklearn args as keywords * np.bool deprecation * Invalid escape sequence \_ * Series specify dtype * drop na requires keyword args deprecation * unspecified np.int size deprecated, use int instead * deprecated unspecified np.int precision * Element wise comparison failed, will raise error in the future * Specify explicit dtype for empty series * metric warnings for mismatch between y_pred and y_true label count * Quantile transformer n_quantiles larger than n_samples warning ignored * Silenced convergence warnings * pass sklearn args as keywords * flake8'd * flake8'd * Fixed CategoricalImputation not accounting for sparse matrices * Updated to use distro for linux distribution * Ignore convergence warnings for gaussian process regressor * Averaging metrics now use zero_division parameter * Readded scorers to module scope * flake8'd * Fix * Fixed dtype for metalearner no run * Catch gaussian process iterative fit warning * Moved ignored warnings to tests * Correctly type pd.Series * Revert back to usual iterative fit * Readded missing iteration increment * Removed odd backslash * Fixed imputer for sparse matrices * Ignore warnings we are aware about in tests * Flake'd: * Revert "Fixed imputer for sparse matrices" This reverts commit 05675ad. * Revert "Revert "Fixed imputer for sparse matrices"" This reverts commit d031b0d.
* Back to default values * Reverted to default behaviour with comment * Added xfail test to document * flaked * Fixed test, moved to np.testing for assertion * Update autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py Co-authored-by: Matthias Feurer <feurerm@informatik.uni-freiburg.de> Co-authored-by: Matthias Feurer <feurerm@informatik.uni-freiburg.de>
1 parent 7cb249c commit 5eb8d47

File tree

24 files changed

+320
-116
lines changed

24 files changed

+320
-116
lines changed

autosklearn/automl.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# -*- encoding: utf-8 -*-
22
import copy
3+
import distro
34
import io
45
import json
56
import platform
@@ -690,11 +691,10 @@ def fit(
690691
self._logger.debug('Starting to print environment information')
691692
self._logger.debug(' Python version: %s', sys.version.split('\n'))
692693
try:
693-
self._logger.debug(' Distribution: %s', platform.linux_distribution())
694+
self._logger.debug(f'\tDistribution: {distro.id()}-{distro.version()}-{distro.name()}')
694695
except AttributeError:
695-
# platform.linux_distribution() was removed in Python3.8
696-
# We should move to the distro package as soon as it supports Windows and OSX
697696
pass
697+
698698
self._logger.debug(' System: %s', platform.system())
699699
self._logger.debug(' Machine: %s', platform.machine())
700700
self._logger.debug(' Platform: %s', platform.platform())

autosklearn/estimators.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,13 +234,13 @@ def __init__(
234234
Attributes
235235
----------
236236
237-
cv_results\_ : dict of numpy (masked) ndarrays
237+
cv_results_ : dict of numpy (masked) ndarrays
238238
A dict with keys as column headers and values as columns, that can be
239239
imported into a pandas ``DataFrame``.
240240
241241
Not all keys returned by scikit-learn are supported yet.
242242
243-
performance_over_time\_ : pandas.core.frame.DataFrame
243+
performance_over_time_ : pandas.core.frame.DataFrame
244244
A ``DataFrame`` containing the models performance over time data. Can be
245245
used for plotting directly. Please refer to the example
246246
:ref:`Train and Test Inputs <sphx_glr_examples_40_advanced_example_pandas_train_test.py>`.

autosklearn/metalearning/metafeatures/metafeatures.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def _calculate(self, X, y, logger, categorical):
184184
def _calculate_sparse(self, X, y, logger, categorical):
185185
data = [True if not np.isfinite(x) else False for x in X.data]
186186
missing = X.__class__((data, X.indices, X.indptr), shape=X.shape,
187-
dtype=np.bool)
187+
dtype=bool)
188188
return missing
189189

190190

autosklearn/metalearning/metalearning/meta_base.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from collections import OrderedDict
2+
13
import numpy as np
24
import pandas as pd
35

@@ -39,7 +41,7 @@ def __init__(self, configuration_space, aslib_directory, logger):
3941

4042
aslib_reader = aslib_simple.AlgorithmSelectionProblem(self.aslib_directory)
4143
self.metafeatures = aslib_reader.metafeatures
42-
self.algorithm_runs = aslib_reader.algorithm_runs
44+
self.algorithm_runs: OrderedDict[str, pd.DataFrame] = aslib_reader.algorithm_runs
4345
self.configurations = aslib_reader.configurations
4446

4547
configurations = dict()
@@ -65,7 +67,7 @@ def add_dataset(self, name, metafeatures):
6567
self.metafeatures.drop(name.lower(), inplace=True)
6668
self.metafeatures = self.metafeatures.append(metafeatures)
6769

68-
runs = pd.Series([], name=name)
70+
runs = pd.Series([], name=name, dtype=float)
6971
for metric in self.algorithm_runs.keys():
7072
self.algorithm_runs[metric].append(runs)
7173

autosklearn/metalearning/optimizers/metalearn_optimizer/metalearner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,8 @@ def _learn(self, exclude_double_configurations=True):
111111
except KeyError:
112112
# TODO should I really except this?
113113
self.logger.info("Could not find runs for instance %s" % task_id)
114-
runs[task_id] = pd.Series([], name=task_id)
114+
runs[task_id] = pd.Series([], name=task_id, dtype=np.float64)
115+
115116
runs = pd.DataFrame(runs)
116117

117118
kND.fit(all_other_metafeatures, runs)

autosklearn/metrics/__init__.py

Lines changed: 50 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from abc import ABCMeta, abstractmethod
22
from functools import partial
3+
from itertools import product
34
from typing import Any, Callable, Dict, List, Optional, Union, cast
45

56
import numpy as np
@@ -278,16 +279,14 @@ def make_scorer(
278279
optimum=0,
279280
worst_possible_result=MAXINT,
280281
greater_is_better=False)
281-
r2 = make_scorer('r2',
282-
sklearn.metrics.r2_score)
282+
283+
r2 = make_scorer('r2', sklearn.metrics.r2_score)
283284

284285
# Standard Classification Scores
285286
accuracy = make_scorer('accuracy',
286287
sklearn.metrics.accuracy_score)
287288
balanced_accuracy = make_scorer('balanced_accuracy',
288289
sklearn.metrics.balanced_accuracy_score)
289-
f1 = make_scorer('f1',
290-
sklearn.metrics.f1_score)
291290

292291
# Score functions that need decision values
293292
roc_auc = make_scorer('roc_auc',
@@ -297,10 +296,20 @@ def make_scorer(
297296
average_precision = make_scorer('average_precision',
298297
sklearn.metrics.average_precision_score,
299298
needs_threshold=True)
300-
precision = make_scorer('precision',
301-
sklearn.metrics.precision_score)
302-
recall = make_scorer('recall',
303-
sklearn.metrics.recall_score)
299+
300+
# NOTE: zero_division
301+
#
302+
# Specified as the explicit default, see sklearn docs:
303+
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn-metrics-precision-score
304+
precision = make_scorer(
305+
'precision', partial(sklearn.metrics.precision_score, zero_division=0)
306+
)
307+
recall = make_scorer(
308+
'recall', partial(sklearn.metrics.recall_score, zero_division=0)
309+
)
310+
f1 = make_scorer(
311+
'f1', partial(sklearn.metrics.f1_score, zero_division=0)
312+
)
304313

305314
# Score function for probabilistic classification
306315
log_loss = make_scorer('log_loss',
@@ -312,29 +321,39 @@ def make_scorer(
312321
# TODO what about Matthews correlation coefficient etc?
313322

314323

315-
REGRESSION_METRICS = dict()
316-
for scorer in [mean_absolute_error, mean_squared_error, root_mean_squared_error,
317-
mean_squared_log_error, median_absolute_error, r2]:
318-
REGRESSION_METRICS[scorer.name] = scorer
319-
320-
CLASSIFICATION_METRICS = dict()
321-
322-
for scorer in [accuracy, balanced_accuracy, roc_auc, average_precision,
323-
log_loss]:
324-
CLASSIFICATION_METRICS[scorer.name] = scorer
325-
326-
for name, metric in [('precision', sklearn.metrics.precision_score),
327-
('recall', sklearn.metrics.recall_score),
328-
('f1', sklearn.metrics.f1_score)]:
329-
globals()[name] = make_scorer(name, metric)
330-
CLASSIFICATION_METRICS[name] = globals()[name]
331-
for average in ['macro', 'micro', 'samples', 'weighted']:
332-
qualified_name = '{0}_{1}'.format(name, average)
333-
globals()[qualified_name] = make_scorer(qualified_name,
334-
partial(metric,
335-
pos_label=None,
336-
average=average))
337-
CLASSIFICATION_METRICS[qualified_name] = globals()[qualified_name]
324+
REGRESSION_METRICS = {
325+
scorer.name: scorer
326+
for scorer in [
327+
mean_absolute_error, mean_squared_error, root_mean_squared_error,
328+
mean_squared_log_error, median_absolute_error, r2
329+
]
330+
}
331+
332+
CLASSIFICATION_METRICS = {
333+
scorer.name: scorer
334+
for scorer in [
335+
accuracy, balanced_accuracy, roc_auc, average_precision, log_loss
336+
]
337+
}
338+
339+
# NOTE: zero_division
340+
#
341+
# Specified as the explicit default, see sklearn docs:
342+
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn-metrics-precision-score
343+
for (base_name, sklearn_metric), average in product(
344+
[
345+
('precision', sklearn.metrics.precision_score),
346+
('recall', sklearn.metrics.recall_score),
347+
('f1', sklearn.metrics.f1_score),
348+
],
349+
['macro', 'micro', 'samples', 'weighted']
350+
):
351+
name = f'{base_name}_{average}'
352+
scorer = make_scorer(
353+
name, partial(sklearn_metric, pos_label=None, average=average, zero_division=0)
354+
)
355+
globals()[name] = scorer # Adds scorer to the module scope
356+
CLASSIFICATION_METRICS[name] = scorer
338357

339358

340359
def calculate_score(

autosklearn/pipeline/components/base.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,13 +147,16 @@ def __str__(self):
147147

148148

149149
class IterativeComponent(AutoSklearnComponent):
150+
150151
def fit(self, X, y, sample_weight=None):
151152
self.iterative_fit(X, y, n_iter=2, refit=True)
153+
152154
iteration = 2
153155
while not self.configuration_fully_fitted():
154156
n_iter = int(2 ** iteration / 2)
155157
self.iterative_fit(X, y, n_iter=n_iter, refit=False)
156158
iteration += 1
159+
157160
return self
158161

159162
@staticmethod
@@ -165,15 +168,16 @@ def get_current_iter(self):
165168

166169

167170
class IterativeComponentWithSampleWeight(AutoSklearnComponent):
171+
168172
def fit(self, X, y, sample_weight=None):
169-
self.iterative_fit(
170-
X, y, n_iter=2, refit=True, sample_weight=sample_weight
171-
)
173+
self.iterative_fit(X, y, n_iter=2, refit=True, sample_weight=sample_weight)
174+
172175
iteration = 2
173176
while not self.configuration_fully_fitted():
174177
n_iter = int(2 ** iteration / 2)
175-
self.iterative_fit(X, y, n_iter=n_iter, sample_weight=sample_weight)
178+
self.iterative_fit(X, y, n_iter=n_iter, refit=False, sample_weight=sample_weight)
176179
iteration += 1
180+
177181
return self
178182

179183
@staticmethod

autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,14 @@ def fit(self, X: PIPELINE_DATA_DTYPE,
2727
categories='auto', handle_unknown='use_encoded_value', unknown_value=-1,
2828
)
2929
self.preprocessor.fit(X, y)
30-
return self
30+
return self
31+
else:
32+
# TODO sparse_encoding of negative labels
33+
#
34+
# The next step in the pipeline relies on positive labels
35+
# Given a categorical column [[0], [-1]], the next step will fail
36+
# unless we can fix this encoding
37+
return self
3138

3239
def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
3340
if scipy.sparse.issparse(X):

autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from ConfigSpace.configuration_space import ConfigurationSpace
44

55
import numpy as np
6+
from scipy.sparse import spmatrix
67

78
from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE
89
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
@@ -28,24 +29,32 @@ def fit(self, X: PIPELINE_DATA_DTYPE,
2829
y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'CategoricalImputation':
2930
import sklearn.impute
3031

31-
fill_value = None
3232
if hasattr(X, 'columns'):
3333
kind = X[X.columns[-1]].dtype.kind
3434
else:
3535
# Series, sparse and numpy have dtype
3636
# Only DataFrame does not
3737
kind = X.dtype.kind
38-
if kind in ("i", "u", "f"):
39-
# We do not want to impute a category with the default
40-
# value (0 is the default) in case such default is in the
41-
# train data already!
42-
fill_value = 0
43-
unique = np.unique(X)
44-
while fill_value in unique:
45-
fill_value -= 1
38+
39+
fill_value: Optional[int] = None
40+
41+
number_kinds = ("i", "u", "f")
42+
if kind in number_kinds:
43+
if isinstance(X, spmatrix):
44+
# TODO negative labels
45+
#
46+
# Previously this was the behaviour and went
47+
# unnoticed. Imputing negative labels results in
48+
# the categorical shift step failing as the ordinal
49+
# encoder can't fix negative labels.
50+
# This is here to document the behaviour explicitly
51+
fill_value = 0
52+
else:
53+
fill_value = min(np.unique(X)) - 1
4654

4755
self.preprocessor = sklearn.impute.SimpleImputer(
48-
strategy='constant', copy=False, fill_value=fill_value)
56+
strategy='constant', copy=False, fill_value=fill_value
57+
)
4958
self.preprocessor.fit(X)
5059
return self
5160

autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,27 @@ def __init__(
1919
) -> None:
2020
self.preprocessor: Optional[BaseEstimator] = None
2121

22-
def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
23-
) -> 'AutoSklearnPreprocessingAlgorithm':
22+
def fit(
23+
self,
24+
X: PIPELINE_DATA_DTYPE,
25+
y: Optional[PIPELINE_DATA_DTYPE] = None
26+
) -> 'AutoSklearnPreprocessingAlgorithm':
27+
2428
if self.preprocessor is None:
2529
raise NotFittedError()
30+
2631
self.preprocessor.fit(X)
32+
2733
return self
2834

2935
def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
36+
3037
if self.preprocessor is None:
31-
raise NotImplementedError()
32-
return self.preprocessor.transform(X)
38+
raise NotFittedError()
39+
40+
transformed_X = self.preprocessor.transform(X)
41+
42+
return transformed_X
3343

3444
@staticmethod
3545
def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None

autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from typing import Optional, Union
2+
3+
from numpy.random import RandomState
14
from ConfigSpace.configuration_space import ConfigurationSpace
25
from ConfigSpace.hyperparameters import UniformFloatHyperparameter, \
36
UniformIntegerHyperparameter
@@ -8,13 +11,23 @@
811

912
class RandomKitchenSinks(AutoSklearnPreprocessingAlgorithm):
1013

11-
def __init__(self, gamma, n_components, random_state=None):
12-
""" Parameters:
14+
def __init__(
15+
self,
16+
gamma: float,
17+
n_components: int,
18+
random_state: Optional[Union[int, RandomState]] = None
19+
) -> None:
20+
"""
21+
Parameters
22+
----------
1323
gamma: float
14-
Parameter of the rbf kernel to be approximated exp(-gamma * x^2)
24+
Parameter of the rbf kernel to be approximated exp(-gamma * x^2)
1525
1626
n_components: int
17-
Number of components (output dimensionality) used to approximate the kernel
27+
Number of components (output dimensionality) used to approximate the kernel
28+
29+
random_state: Optional[int | RandomState]
30+
The random state to pass to the underlying estimator
1831
"""
1932
self.gamma = gamma
2033
self.n_components = n_components
@@ -27,7 +40,10 @@ def fit(self, X, Y=None):
2740
self.gamma = float(self.gamma)
2841

2942
self.preprocessor = sklearn.kernel_approximation.RBFSampler(
30-
self.gamma, self.n_components, self.random_state)
43+
gamma=self.gamma,
44+
n_components=self.n_components,
45+
random_state=self.random_state
46+
)
3147
self.preprocessor.fit(X)
3248
return self
3349

0 commit comments

Comments
 (0)