Skip to content

Commit

Permalink
Add sparse support for Auto-sklearn 2.0 (automl#1245)
Browse files Browse the repository at this point in the history
  • Loading branch information
mfeurer authored Sep 13, 2021
1 parent ff11e5a commit a8effd8
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 7 deletions.
2 changes: 1 addition & 1 deletion autosklearn/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1187,7 +1187,7 @@ def refit(self, X, y):
budget_type=self._budget_type,
logger=self._logger,
model=model,
train_indices=np.arange(len(X), dtype=int),
train_indices=np.arange(X.shape[0], dtype=int),
task_type=self._task,
)
break
Expand Down
43 changes: 37 additions & 6 deletions autosklearn/experimental/askl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Any, Dict, List, Optional, Union

import dask.distributed
import scipy.sparse

from ConfigSpace import Configuration
import numpy as np
Expand Down Expand Up @@ -92,9 +93,14 @@ def __call__(

scenario = Scenario(scenario_dict)

initial_configurations = [
Configuration(configuration_space=scenario.cs, values=member)
for member in self.portfolio.values()]
initial_configurations = []
for member in self.portfolio.values():
try:
initial_configurations.append(
Configuration(configuration_space=scenario.cs, values=member)
)
except ValueError:
pass

rh2EPM = RunHistory2EPM4LogCost
return SMAC4AC(
Expand Down Expand Up @@ -134,9 +140,15 @@ def __call__(
from smac.scenario.scenario import Scenario

scenario = Scenario(scenario_dict)
initial_configurations = [
Configuration(configuration_space=scenario.cs, values=member)
for member in self.portfolio.values()]

initial_configurations = []
for member in self.portfolio.values():
try:
initial_configurations.append(
Configuration(configuration_space=scenario.cs, values=member)
)
except ValueError:
pass

rh2EPM = RunHistory2EPM4LogCost
ta_kwargs['budget_type'] = self.budget_type
Expand Down Expand Up @@ -341,6 +353,25 @@ def fit(self, X, y,
feat_type=None,
dataset_name=None):

# TODO
# Regularly check https://github.com/scikit-learn/scikit-learn/issues/15336 to see whether
# histogram gradient boosting in scikit-learn finally supports sparse data.
is_sparse = scipy.sparse.issparse(X)
if is_sparse:
include_estimators = [
'extra_trees', 'passive_aggressive', 'random_forest', 'sgd', 'mlp',
]
else:
include_estimators = [
'extra_trees',
'passive_aggressive',
'random_forest',
'sgd',
'gradient_boosting',
'mlp',
]
self.include['classifier'] = include_estimators

if self.metric is None:
if len(y.shape) == 1 or y.shape[1] == 1:
self.metric = accuracy
Expand Down
25 changes: 25 additions & 0 deletions test/test_automl/test_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,31 @@ def test_autosklearn2_classification_methods_returns_self(dask_client):
pickle.dumps(automl_fitted)


def test_autosklearn2_classification_methods_returns_self_sparse(dask_client):
    """Check that AutoSklearn2Classifier's fit-style methods return ``self`` on sparse input.

    The test loads the ``breast_cancer`` dataset with ``make_sparse=True`` and
    verifies that ``fit``, ``fit_ensemble`` and ``refit`` each return the very
    same estimator object. It then checks that the predictions on the test
    split reach an accuracy of at least 2/3, that the string form of the
    configuration space does not mention "boosting", and that the fitted
    estimator can be pickled.
    """
    X_train, y_train, X_test, y_test = putil.get_dataset(
        'breast_cancer', make_sparse=True,
    )
    estimator = AutoSklearn2Classifier(
        time_left_for_this_task=60,
        ensemble_size=0,
        delete_tmp_folder_after_terminate=False,
        dask_client=dask_client,
    )

    # Each of the three fitting entry points must hand back the estimator itself.
    fitted = estimator.fit(X_train, y_train)
    assert estimator is fitted

    with_ensemble = estimator.fit_ensemble(y_train, ensemble_size=5)
    assert estimator is with_ensemble

    refitted = estimator.refit(X_train.copy(), y_train.copy())
    assert estimator is refitted

    predictions = fitted.predict(X_test)
    accuracy = sklearn.metrics.accuracy_score(y_test, predictions)
    assert accuracy >= 2 / 3, print_debug_information(estimator)

    # No "boosting" component may appear in the search space built for sparse data.
    config_space = estimator.get_configuration_space(X=X_train, y=y_train)
    assert "boosting" not in str(config_space)

    pickle.dumps(fitted)


@pytest.mark.parametrize("class_", [AutoSklearnClassifier, AutoSklearnRegressor,
AutoSklearn2Classifier])
def test_check_estimator_signature(class_):
Expand Down

0 comments on commit a8effd8

Please sign in to comment.