Skip to content

Commit 7488955

Browse files
committed
update for rebase, add loading of X_data in ensemble builder
1 parent aa6e47a commit 7488955

File tree

8 files changed

+108
-20
lines changed

8 files changed

+108
-20
lines changed

autosklearn/ensemble_building/builder.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from sklearn.utils.validation import check_random_state
1919

2020
from autosklearn.automl_common.common.utils.backend import Backend
21+
from autosklearn.data.validation import SUPPORTED_FEAT_TYPES
2122
from autosklearn.data.xy_data_manager import XYDataManager
2223
from autosklearn.ensemble_building.run import Run, RunID
2324
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
@@ -169,7 +170,9 @@ def __init__(
169170

170171
# Data we may need
171172
datamanager: XYDataManager = self.backend.load_datamanager()
173+
self._X_test: SUPPORTED_FEAT_TYPES | None = datamanager.data.get("X_test", None)
172174
self._y_test: np.ndarray | None = datamanager.data.get("Y_test", None)
175+
self._X_ensemble: SUPPORTED_FEAT_TYPES | None = None
173176
self._y_ensemble: np.ndarray | None = None
174177

175178
@property
@@ -226,6 +229,29 @@ def targets(self, kind: str = "ensemble") -> np.ndarray | None:
226229
else:
227230
raise NotImplementedError(kind)
228231

232+
def X_data(self, kind: str = "ensemble") -> "SUPPORTED_FEAT_TYPES | None":
    """Return the input features (X data) for the requested split.

    For ``"ensemble"`` the features are loaded through the backend the first
    time their file exists on disk and are then cached on ``self._X_ensemble``;
    until then ``None`` is returned. For ``"test"`` the value stored on
    ``self._X_test`` is returned as-is, which may also be ``None``.

    Parameters
    ----------
    kind: str = "ensemble"
        Which features to return, either ``"ensemble"`` or ``"test"``.

    Returns
    -------
    SUPPORTED_FEAT_TYPES | None
        The requested features, or None if they are not available.

    Raises
    ------
    NotImplementedError
        If ``kind`` is neither ``"ensemble"`` nor ``"test"``.
    """
    if kind == "ensemble":
        if self._X_ensemble is None:
            # Only load once the file exists; the cache stays None otherwise
            # so that a later call checks the path again.
            if os.path.exists(self.backend._get_input_ensemble_filename()):
                self._X_ensemble = self.backend.load_input_ensemble()
        return self._X_ensemble

    elif kind == "test":
        return self._X_test

    else:
        raise NotImplementedError(kind)
254+
229255
def run(
230256
self,
231257
iteration: int,
@@ -424,7 +450,10 @@ def main(
424450
for run in requires_update:
425451
run.record_modified_times() # So we don't count as modified next time
426452
run.losses = {
427-
metric.name: self.loss(run, metric=metric) for metric in self.metrics
453+
metric.name: self.loss(
454+
run, metric=metric, X_data=self.X_data("ensemble")
455+
)
456+
for metric in self.metrics
428457
}
429458

430459
# Get the dummy and real runs
@@ -520,9 +549,11 @@ def main(
520549
return self.ensemble_history, self.ensemble_nbest
521550

522551
targets = cast(np.ndarray, self.targets("ensemble")) # Sure they exist
552+
X_data = self.X_data("ensemble")
523553

524554
ensemble = self.fit_ensemble(
525-
candidates,
555+
candidates=candidates,
556+
X_data=X_data,
526557
targets=targets,
527558
runs=runs,
528559
ensemble_class=self.ensemble_class,
@@ -556,12 +587,14 @@ def main(
556587

557588
run_preds = [r.predictions(kind, precision=self.precision) for r in models]
558589
pred = ensemble.predict(run_preds)
590+
X_data = self.X_data(kind)
559591

560592
scores = calculate_scores(
561593
solution=pred_targets,
562594
prediction=pred,
563595
task_type=self.task_type,
564596
metrics=self.metrics,
597+
X_data=X_data,
565598
scoring_functions=None,
566599
)
567600
performance_stamp[f"ensemble_{score_name}_score"] = scores[
@@ -773,6 +806,7 @@ def candidate_selection(
773806
def fit_ensemble(
774807
self,
775808
candidates: list[Run],
809+
X_data: SUPPORTED_FEAT_TYPES,
776810
targets: np.ndarray,
777811
*,
778812
runs: list[Run],
@@ -794,6 +828,9 @@ def fit_ensemble(
794828
candidates: list[Run]
795829
List of runs to build an ensemble from
796830
831+
X_data: SUPPORTED_FEAT_TYPES
832+
The base level data.
833+
797834
targets: np.ndarray
798835
The targets to build the ensemble with
799836
@@ -851,6 +888,7 @@ def fit_ensemble(
851888

852889
ensemble.fit(
853890
base_models_predictions=predictions_train,
891+
X_data=X_data,
854892
true_targets=targets,
855893
model_identifiers=[run.id for run in candidates],
856894
runs=runs,
@@ -953,7 +991,13 @@ def requires_deletion(
953991

954992
return keep, delete
955993

956-
def loss(self, run: Run, metric: Scorer, kind: str = "ensemble") -> float:
994+
def loss(
995+
self,
996+
run: Run,
997+
metric: Scorer,
998+
X_data: SUPPORTED_FEAT_TYPES,
999+
kind: str = "ensemble",
1000+
) -> float:
9571001
"""Calculate the loss for a run
9581002
9591003
Parameters
@@ -984,6 +1028,7 @@ def loss(self, run: Run, metric: Scorer, kind: str = "ensemble") -> float:
9841028
prediction=predictions,
9851029
task_type=self.task_type,
9861030
metric=metric,
1031+
X_data=X_data,
9871032
)
9881033
except Exception as e:
9891034
tb = traceback.format_exc()

autosklearn/ensembles/abstract_ensemble.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77

88
from autosklearn.automl_common.common.utils.backend import Backend
9+
from autosklearn.data.validation import SUPPORTED_FEAT_TYPES
910
from autosklearn.ensemble_building.run import Run
1011
from autosklearn.metrics import Scorer
1112
from autosklearn.pipeline.base import BasePipeline
@@ -26,6 +27,7 @@ def __init__(
2627
def fit(
2728
self,
2829
base_models_predictions: np.ndarray | List[np.ndarray],
30+
X_data: SUPPORTED_FEAT_TYPES,
2931
true_targets: np.ndarray,
3032
model_identifiers: List[Tuple[int, int, float]],
3133
runs: Sequence[Run],
@@ -45,6 +47,8 @@ def fit(
4547
Can be a list of 2d numpy arrays as well to prevent copying all
4648
predictions into a single, large numpy array.
4749
50+
X_data : list-like or sparse data
51+
4852
true_targets : array of shape [n_targets]
4953
5054
model_identifiers : identifier for each base model.

autosklearn/ensembles/ensemble_selection.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from autosklearn.automl_common.common.utils.backend import Backend
1313
from autosklearn.constants import TASK_TYPES
14+
from autosklearn.data.validation import SUPPORTED_FEAT_TYPES
1415
from autosklearn.ensemble_building.run import Run
1516
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
1617
from autosklearn.metrics import Scorer, calculate_losses
@@ -104,6 +105,7 @@ def __getstate__(self) -> Dict[str, Any]:
104105
def fit(
105106
self,
106107
base_models_predictions: List[np.ndarray],
108+
X_data: SUPPORTED_FEAT_TYPES,
107109
true_targets: np.ndarray,
108110
model_identifiers: List[Tuple[int, int, float]],
109111
runs: Sequence[Run],
@@ -127,25 +129,31 @@ def fit(
127129
if self.bagging:
128130
self._bagging(base_models_predictions, true_targets)
129131
else:
130-
self._fit(base_models_predictions, true_targets)
132+
self._fit(
133+
predictions=base_models_predictions,
134+
X_data=X_data,
135+
labels=true_targets,
136+
)
131137
self._calculate_weights()
132138
self.identifiers_ = model_identifiers
133139
return self
134140

135141
def _fit(
136142
self,
137143
predictions: List[np.ndarray],
144+
X_data: SUPPORTED_FEAT_TYPES,
138145
labels: np.ndarray,
139146
) -> EnsembleSelection:
140147
if self.mode == "fast":
141-
self._fast(predictions, labels)
148+
self._fast(predictions, X_data, labels)
142149
else:
143-
self._slow(predictions, labels)
150+
self._slow(predictions, X_data, labels)
144151
return self
145152

146153
def _fast(
147154
self,
148155
predictions: List[np.ndarray],
156+
X_data: SUPPORTED_FEAT_TYPES,
149157
labels: np.ndarray,
150158
) -> None:
151159
"""Fast version of Rich Caruana's ensemble selection method."""
@@ -200,6 +208,7 @@ def _fast(
200208
prediction=fant_ensemble_prediction,
201209
task_type=self.task_type,
202210
metrics=[self.metric],
211+
X_data=X_data,
203212
scoring_functions=None,
204213
)[self.metric.name]
205214

@@ -219,7 +228,12 @@ def _fast(
219228
self.trajectory_ = trajectory
220229
self.train_loss_ = trajectory[-1]
221230

222-
def _slow(self, predictions: List[np.ndarray], labels: np.ndarray) -> None:
231+
def _slow(
232+
self,
233+
predictions: List[np.ndarray],
234+
X_data: SUPPORTED_FEAT_TYPES,
235+
labels: np.ndarray,
236+
) -> None:
223237
"""Rich Caruana's ensemble selection method."""
224238
self.num_input_models_ = len(predictions)
225239

@@ -242,6 +256,7 @@ def _slow(self, predictions: List[np.ndarray], labels: np.ndarray) -> None:
242256
prediction=ensemble_prediction,
243257
task_type=self.task_type,
244258
metrics=[self.metric],
259+
X_data=X_data,
245260
scoring_functions=None,
246261
)[self.metric.name]
247262
ensemble.pop()

autosklearn/ensembles/singlebest_ensemble.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from smac.runhistory.runhistory import RunHistory
99

1010
from autosklearn.automl_common.common.utils.backend import Backend
11+
from autosklearn.data.validation import SUPPORTED_FEAT_TYPES
1112
from autosklearn.ensemble_building.run import Run
1213
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
1314
from autosklearn.metrics import Scorer
@@ -52,6 +53,7 @@ def __init__(
5253
def fit(
5354
self,
5455
base_models_predictions: np.ndarray | List[np.ndarray],
56+
X_data: SUPPORTED_FEAT_TYPES,
5557
true_targets: np.ndarray,
5658
model_identifiers: List[Tuple[int, int, float]],
5759
runs: Sequence[Run],

autosklearn/metrics/__init__.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __call__(
4848
self,
4949
y_true: np.ndarray,
5050
y_pred: np.ndarray,
51-
X_data: Optional[np.ndarray] = None,
51+
X_data: Optional[SUPPORTED_XDATA_TYPES] = None,
5252
sample_weight: Optional[List[float]] = None,
5353
) -> float:
5454
pass
@@ -62,7 +62,7 @@ def __call__(
6262
self,
6363
y_true: np.ndarray,
6464
y_pred: np.ndarray,
65-
X_data: Optional[np.ndarray] = None,
65+
X_data: Optional[SUPPORTED_XDATA_TYPES] = None,
6666
sample_weight: Optional[List[float]] = None,
6767
) -> float:
6868
"""Evaluate predicted target values for X relative to y_true.
@@ -129,7 +129,7 @@ def __call__(
129129
self,
130130
y_true: np.ndarray,
131131
y_pred: np.ndarray,
132-
X_data: Optional[np.ndarray] = None,
132+
X_data: Optional[SUPPORTED_XDATA_TYPES] = None,
133133
sample_weight: Optional[List[float]] = None,
134134
) -> float:
135135
"""Evaluate predicted probabilities for X relative to y_true.
@@ -189,7 +189,7 @@ def __call__(
189189
self,
190190
y_true: np.ndarray,
191191
y_pred: np.ndarray,
192-
X_data: Optional[np.ndarray] = None,
192+
X_data: Optional[SUPPORTED_XDATA_TYPES] = None,
193193
sample_weight: Optional[List[float]] = None,
194194
) -> float:
195195
"""Evaluate decision function output for X relative to y_true.
@@ -563,6 +563,7 @@ def calculate_loss(
563563
prediction: np.ndarray,
564564
task_type: int,
565565
metric: Scorer,
566+
X_data: Optional[SUPPORTED_XDATA_TYPES] = None,
566567
) -> float:
567568
"""Calculate the loss with a given metric
568569
@@ -579,12 +580,16 @@ def calculate_loss(
579580
580581
metric: Scorer
581582
The metric to use
583+
584+
X_data: Optional[SUPPORTED_XDATA_TYPES]
585+
X data used to obtain the predictions
582586
"""
583587
losses = calculate_losses(
584588
solution=solution,
585589
prediction=prediction,
586590
task_type=task_type,
587591
metrics=[metric],
592+
X_data=X_data,
588593
)
589594
return losses[metric.name]
590595

@@ -615,7 +620,7 @@ def calculate_losses(
615620
metrics: Sequence[Scorer]
616621
A list of objects that hosts a function to calculate how good the
617622
prediction is according to the solution.
618-
X_data: Optional[np.ndarray]
623+
X_data: Optional[SUPPORTED_XDATA_TYPES]
619624
X data used to obtain the predictions
620625
scoring_functions: List[Scorer]
621626
A list of metrics to calculate multiple losses
@@ -652,7 +657,7 @@ def compute_single_metric(
652657
prediction: np.ndarray,
653658
solution: np.ndarray,
654659
task_type: int,
655-
X_data: Optional[np.ndarray] = None,
660+
X_data: Optional[SUPPORTED_XDATA_TYPES] = None,
656661
) -> float:
657662
"""
658663
Returns a metric for the given Auto-Sklearn Scorer object.

test/test_ensemble_builder/test_ensemble_builder.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -607,9 +607,10 @@ def test_loss_with_no_ensemble_targets(
607607
* Should give a loss of np.inf if run has no predictions of a given kind
608608
"""
609609
run = make_run(predictions=None)
610+
X_data = builder.X_data()
610611
metric = builder.metrics[0]
611612

612-
assert builder.loss(run, metric=metric, kind=kind) == np.inf
613+
assert builder.loss(run, metric=metric, X_data=X_data, kind=kind) == np.inf
613614

614615

615616
@parametrize("kind", ["ensemble", "test"])
@@ -623,12 +624,13 @@ def test_loss_with_targets(
623624
-------
624625
* Should give a loss < np.inf if the predictions exist
625626
"""
627+
X_data = builder.X_data(kind)
626628
targets = builder.targets(kind)
627629
metric = builder.metrics[0]
628630

629631
run = make_run(predictions={kind: targets})
630632

631-
assert builder.loss(run, metric=metric, kind=kind) < np.inf
633+
assert builder.loss(run, metric=metric, X_data=X_data, kind=kind) < np.inf
632634

633635

634636
def test_delete_runs(builder: EnsembleBuilder, make_run: Callable[..., Run]) -> None:
@@ -680,13 +682,16 @@ def test_fit_ensemble_produces_ensemble(
680682
-------
681683
* Should produce an ensemble if all runs have predictions
682684
"""
685+
X_data = builder.X_data("ensemble")
683686
targets = builder.targets("ensemble")
684687
assert targets is not None
685688

686689
predictions = targets
687690
runs = [make_run(predictions={"ensemble": predictions}) for _ in range(10)]
688691

689-
ensemble = builder.fit_ensemble(candidates=runs, targets=targets, runs=runs)
692+
ensemble = builder.fit_ensemble(
693+
candidates=runs, X_data=X_data, targets=targets, runs=runs
694+
)
690695

691696
assert ensemble is not None
692697

@@ -701,6 +706,7 @@ def test_fit_with_error_gives_no_ensemble(
701706
* A run without predictions will raise an error, which will cause `fit_ensemble` to fail
702707
as it requires all runs to have valid predictions
703708
"""
709+
X_data = builder.X_data("ensemble")
704710
targets = builder.targets("ensemble")
705711
assert targets is not None
706712

@@ -712,7 +718,7 @@ def test_fit_with_error_gives_no_ensemble(
712718
runs.append(bad_run)
713719

714720
with pytest.raises(FileNotFoundError):
715-
builder.fit_ensemble(candidates=runs, targets=targets, runs=runs)
721+
builder.fit_ensemble(candidates=runs, X_data=X_data, targets=targets, runs=runs)
716722

717723

718724
@parametrize("time_buffer", [1, 5])

0 commit comments

Comments
 (0)