
Commit a4163d0

Carl Hvarfner authored and facebook-github-bot committed
Preserving train inputs and targets through transforms (#3044)
Summary:

This PR preserves BoTorch transforms (specifically outcome transforms, like Standardize) through state_dict loading. The fix also ensures that the train_targets of a leave-one-out model with outcome transforms will, in the default case, match the targets of the base model, minus the point left out.

__Longer explanation:__ Transforms, and specifically learnable outcome transforms like Standardize, currently:

a. learn their parameters at initialization of the GP, and
b. transform the train_Ys into the normalized space.

Then, when we load a state dict, we:

a. impose new standardization parameters on already-standardized data, and
b. potentially make the transforms re-learnable, nullifying the change made by the state dict.

This has undesired consequences for cross-validation, as all cross-validated models effectively end up with different training data. In essence, _we don't simply leave one point out; we leave one out and re-standardize_. When the data contains outliers, this leads to substantially different predictions when the outlier is left out, since the outlier substantially impacts the outcome transform parameters.

Notebook explaining the effect with some plots: N8342965

Reviewed By: Balandat

Differential Revision: D84571407
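To make the re-standardization effect concrete, here is a minimal sketch (not part of the diff) of how re-learning Standardize on leave-one-out data shifts the effective training targets; the tensor values and variable names are illustrative only.

import torch
from botorch.models.transforms.outcome import Standardize

# Targets with one outlier; standardizing with vs. without it yields very
# different transform parameters.
Y_full = torch.tensor([[0.1], [0.2], [0.3], [5.0]], dtype=torch.double)
Y_loo = Y_full[:-1]  # leave the outlier out

full_tf = Standardize(m=1)
full_tf(Y_full)  # train mode: learns means/stdvs from all four points
full_tf.eval()

loo_tf = Standardize(m=1)
loo_tf(Y_loo)  # train mode: learns different means/stdvs without the outlier
loo_tf.eval()

# The same three raw targets land in different standardized spaces, so a
# leave-one-out model is effectively trained on different data.
print(full_tf(Y_loo)[0].squeeze(-1))
print(loo_tf(Y_loo)[0].squeeze(-1))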
1 parent 09502f9 commit a4163d0

File tree

2 files changed (+306, -1 lines changed)


botorch/models/gpytorch.py

Lines changed: 110 additions & 1 deletion
@@ -17,7 +17,7 @@
 import warnings
 from abc import ABC
 from copy import deepcopy
-from typing import Any, TYPE_CHECKING
+from typing import Any, Mapping, TYPE_CHECKING
 
 import torch
 from botorch.acquisition.objective import PosteriorTransform
@@ -283,6 +283,115 @@ def condition_on_observations(
         ).detach()
         return fantasy_model
 
+    def load_state_dict(
+        self,
+        state_dict: Mapping[str, Any],
+        strict: bool = True,
+        keep_transforms: bool = True,
+    ) -> None:
+        r"""Load the model state.
+
+        Args:
+            state_dict: A dict containing the state of the model.
+            strict: A boolean indicating whether to strictly enforce that the keys
+                in `state_dict` match the keys of this module.
+            keep_transforms: A boolean indicating whether to keep the input and outcome
+                transforms. Doing so is useful when loading a model that was trained on
+                a full set of data, and is later loaded with a subset of the data.
+        """
+        # If `keep_transforms` is False, the transforms are reset to the default values
+        # and re-trained when the model is evaluated, which may lead to different
+        # behavior than when the initial model was trained, pre-loading.
+        if not keep_transforms:
+            super().load_state_dict(state_dict, strict)
+            return
+
+        # Checks that
+        # 1. the model has train targets (not necessarily true, e.g. for ApproximateGP),
+        # 2. the model accepts an outcome transform, and that it is not None.
+        should_outcome_transform = (
+            hasattr(self, "train_targets")
+            and getattr(self, "outcome_transform", None) is not None
+        )
+        with torch.no_grad():
+            untransformed_Yvar = None
+            # This becomes necessary when we have model batch_shapes,
+            # e.g. in FullyBayesianSingleTaskGP/MultiTaskGP. Then, we have a
+            # batch dimension in the noise, but not in the train_targets.
+            # Thus, we get this nested structure of if-statements to ensure
+            # train_targets and Yvar are of shape [batch_shape] x n x m,
+            # with batch_shape included only if the training data initially
+            # contained it.
+            if self.num_outputs > 1 and not isinstance(self, MultiTaskGPyTorchModel):
+                untransformed_Y = self.train_targets.transpose(-1, -2)
+                if isinstance(self.likelihood, FixedNoiseGaussianLikelihood):
+                    untransformed_Yvar = self.likelihood.noise_covar.noise.transpose(
+                        -1, -2
+                    )
+            else:
+                untransformed_Y = self.train_targets.unsqueeze(-1)
+                if isinstance(self.likelihood, FixedNoiseGaussianLikelihood):
+                    untransformed_Yvar = self.likelihood.noise_covar.noise.unsqueeze(-1)
+
+            # NOTE: Some outcome transforms require an X, but the untransformed X's cannot
+            # generally be extracted without transformations & adding batch dimensions
+            # (e.g. in Warp). Thus, we use the train inputs.
+            X = self.train_inputs[0]
+
+            # We obtain the untransformed Y (the train_Y's) by untransforming the train
+            # targets.
+            if should_outcome_transform:
+                try:
+                    untransformed_Y, untransformed_Yvar = (
+                        self.outcome_transform.untransform(
+                            Y=untransformed_Y,
+                            Yvar=untransformed_Yvar,
+                            X=X,
+                        )
+                    )
+                except NotImplementedError:
+                    # If the outcome transform does not support untransforming, we
+                    # fall back to loading without preserving the transforms.
+                    warnings.warn(
+                        "Outcome transform does not support untransforming. "
+                        "Cannot load the state dict with transforms preserved. "
+                        "Setting keep_transforms=False.",
+                        stacklevel=3,
+                    )
+                    super().load_state_dict(state_dict, strict)
+                    return
+
+            super().load_state_dict(state_dict, strict)
+
+            # If we want to keep the transforms, we cannot have them in train mode.
+            # If we do, the transforms will be re-trained when the model is evaluated.
+            if getattr(self, "input_transform", None) is not None:
+                self.input_transform.eval()
+
+            # Now, the outcome transform is identical to the state_dict'ed model, so we may
+            # once again transform the train targets.
+            if should_outcome_transform:
+                self.outcome_transform.eval()
+                retransformed_Y, retransformed_Yvar = self.outcome_transform(
+                    Y=untransformed_Y, Yvar=untransformed_Yvar, X=X
+                )
+
+                # Not all models have self._transform_tensor_args, so we do this instead.
+                if self.num_outputs > 1 and not isinstance(self, MultiTaskGPyTorchModel):
+                    retransformed_Y = retransformed_Y.transpose(-1, -2)
+                    if isinstance(self.likelihood, FixedNoiseGaussianLikelihood):
+                        retransformed_Yvar = retransformed_Yvar.transpose(-1, -2)
+                        self.likelihood.noise_covar.noise = retransformed_Yvar
+                else:
+                    retransformed_Y = retransformed_Y.squeeze(-1)
+                    if isinstance(self.likelihood, FixedNoiseGaussianLikelihood):
+                        retransformed_Yvar = retransformed_Yvar.squeeze(-1)
+                        self.likelihood.noise_covar.noise = retransformed_Yvar
+
+                self.set_train_data(
+                    targets=retransformed_Y,
+                    strict=strict,
+                )
 
 # pyre-fixme[13]: uninitialized attributes _num_outputs, _input_batch_shape,
 # _aug_batch_shape
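For context, here is a minimal usage sketch of the new keep_transforms argument in a leave-one-out setting, mirroring what the tests below exercise; model fitting is omitted and the data is illustrative only.

import torch
from botorch.models.gp_regression import SingleTaskGP
from botorch.models.transforms.input import Normalize
from botorch.models.transforms.outcome import Standardize

train_X = torch.rand(5, 2, dtype=torch.double)
train_Y = torch.sin(train_X).sum(dim=-1, keepdim=True)

# Base model on the full data; Standardize learns its parameters here.
base_model = SingleTaskGP(
    train_X=train_X,
    train_Y=train_Y,
    input_transform=Normalize(d=2),
    outcome_transform=Standardize(m=1),
)
state_dict = base_model.state_dict()

# Leave-one-out model: same transform classes, one point removed.
loo_model = SingleTaskGP(
    train_X=train_X[:-1],
    train_Y=train_Y[:-1],
    input_transform=Normalize(d=2),
    outcome_transform=Standardize(m=1),
)

# With keep_transforms=True (the default), the loaded transform parameters are
# preserved, so the leave-one-out train_targets match the base model's targets
# minus the held-out point instead of being re-standardized on the subset.
loo_model.load_state_dict(state_dict, keep_transforms=True)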

test/models/test_gpytorch.py

Lines changed: 196 additions & 0 deletions
@@ -17,6 +17,7 @@
 from botorch.exceptions.errors import DeprecationError, InputDataError
 from botorch.exceptions.warnings import InputDataWarning
 from botorch.fit import fit_gpytorch_mll
+from botorch.models.gp_regression import SingleTaskGP
 from botorch.models.gpytorch import (
     BatchedMultiOutputGPyTorchModel,
     GPyTorchModel,
@@ -28,6 +29,7 @@
 from botorch.models.transforms.input import (
     ChainedInputTransform,
     InputTransform,
+    Normalize,
     NumericToCategoricalEncoding,
 )
 from botorch.models.utils import fantasize
@@ -870,3 +872,197 @@ def test_condition_on_observations_train_input_shapes(self):
             fantasy_model._original_train_inputs.shape[0], original_size + 1
         )
         self.assertEqual(model2._original_train_inputs.shape[0], original_size)
+
+
+class NonUntransformableOutcomeTransform(Standardize):
+    def untransform(self, **kwargs):
+        raise NotImplementedError
+
+
+def _get_input_output_transform(
+    d: int, m: int, use_transforms: bool = True
+) -> dict[str, torch.nn.Module]:
+    return {
+        "input_transform": Normalize(d=d) if use_transforms else None,
+        "outcome_transform": Standardize(m=m) if use_transforms else None,
+    }
+
+
+class TestTransformWarnings(BotorchTestCase):
+    def test_set_transformed_inputs_warning_no_train_inputs(self):
+        from botorch.models.model import Model
+
+        class NotSoAbstractBaseModel(Model):
+            def posterior(self, X, output_indices, observation_noise, **kwargs):
+                pass
+
+        model = NotSoAbstractBaseModel()
+        model.input_transform = Normalize(d=2)
+
+        with self.assertWarnsRegex(
+            RuntimeWarning,
+            "Could not update `train_inputs` with transformed inputs "
+            "since NotSoAbstractBaseModel does not have a `train_inputs` "
+            "attribute. Make sure that the `input_transform` is applied to "
+            "both the train inputs and test inputs.",
+        ):
+            model._set_transformed_inputs()
+
+    def test_load_state_dict_output_warnings(self):
+        tkwargs = {"device": self.device, "dtype": torch.double}
+
+        train_X = torch.rand(3, 2, **tkwargs)
+        train_Y = torch.rand(3, 1, **tkwargs)
+
+        model = SingleTaskGP(
+            train_X=train_X,
+            train_Y=train_Y,
+            input_transform=Normalize(d=2),
+            outcome_transform=NonUntransformableOutcomeTransform(m=1),
+        )
+        state_dict = model.state_dict()
+
+        with self.assertWarnsRegex(
+            UserWarning,
+            "Outcome transform does not support untransforming.*",
+        ):
+            model.load_state_dict(state_dict, keep_transforms=True)
+
+
+class TestLoadStateDict(BotorchTestCase):
+    def _test_load_state_dict_base(
+        self, num_outputs: int, include_yvar: bool = True
+    ) -> None:
+        tkwargs = {"device": self.device, "dtype": torch.double}
+
+        train_X = torch.rand(3, 2, **tkwargs)
+        train_X = torch.cat(
+            [train_X, torch.tensor([[-0.02, 11.1], [17.1, -2.5]], **tkwargs)], dim=0
+        )
+        train_Y = torch.sin(train_X).sum(dim=1, keepdim=True).repeat(1, num_outputs)
+
+        model_kwargs = {
+            "train_X": train_X,
+            "train_Y": train_Y,
+        }
+
+        if include_yvar:
+            train_Yvar = 0.1 * torch.rand_like(train_Y)
+            model_kwargs["train_Yvar"] = train_Yvar
+
+        base_model = SingleTaskGP(
+            **model_kwargs, **_get_input_output_transform(d=2, m=num_outputs)
+        )
+
+        original_train_inputs = base_model.input_transform(base_model.train_inputs[0])
+        original_train_targets = base_model.train_targets.clone()
+        original_train_yvar = base_model.likelihood.noise_covar.noise.clone()
+
+        state_dict = base_model.state_dict()
+
+        cv_model_kwargs = model_kwargs.copy()
+        cv_model_kwargs["train_X"] = train_X[:-1]
+        cv_model_kwargs["train_Y"] = train_Y[:-1]
+        if include_yvar:
+            cv_model_kwargs["train_Yvar"] = train_Yvar[:-1]
+        cv_model = SingleTaskGP(
+            **cv_model_kwargs, **_get_input_output_transform(d=2, m=num_outputs)
+        )
+
+        cv_model.load_state_dict(state_dict, keep_transforms=True)
+
+        sd_mean = cv_model.outcome_transform.means
+        cv_model.outcome_transform(train_Y[:-1])
+        self.assertTrue(torch.all(cv_model.outcome_transform.means == sd_mean))
+
+        self.assertTrue(
+            torch.allclose(
+                cv_model.input_transform._offset,
+                state_dict["input_transform._offset"],
+            )
+        )
+        self.assertTrue(
+            torch.allclose(
+                cv_model.outcome_transform.means,
+                state_dict["outcome_transform.means"],
+            )
+        )
+
+        self.assertAllClose(cv_model.train_targets, original_train_targets[..., :-1])
+        self.assertTrue(
+            torch.equal(
+                cv_model.input_transform(cv_model.train_inputs[0]),
+                original_train_inputs[..., :-1, :],
+            )
+        )
+        if include_yvar:
+            self.assertAllClose(
+                cv_model.likelihood.noise_covar.noise, original_train_yvar[..., :-1]
+            )
+
+        cv_model = SingleTaskGP(
+            **cv_model_kwargs, **_get_input_output_transform(d=2, m=num_outputs)
+        )
+        cv_model.load_state_dict(state_dict, keep_transforms=False)
+
+        sd_mean = cv_model.outcome_transform.means
+        cv_model.outcome_transform(train_Y[:-1])
+        self.assertTrue(torch.all(cv_model.outcome_transform.means != sd_mean))
+
+        self.assertFalse(
+            torch.equal(
+                cv_model.input_transform(cv_model.train_inputs[0]),
+                original_train_inputs[..., :-1, :],
+            )
+        )
+        self.assertFalse(
+            torch.equal(cv_model.train_targets, original_train_targets[..., :-1])
+        )
+        self.assertFalse(
+            torch.equal(
+                cv_model.input_transform._offset,
+                state_dict["input_transform._offset"],
+            )
+        )
+        self.assertFalse(
+            torch.equal(
+                cv_model.outcome_transform.means,
+                state_dict["outcome_transform.means"],
+            )
+        )
+
+    def test_load_state_dict_with_transforms(self):
+        self._test_load_state_dict_base(num_outputs=1, include_yvar=True)
+
+    def test_load_state_dict_with_transforms_no_yvar(self):
+        self._test_load_state_dict_base(num_outputs=1, include_yvar=False)
+
+    def test_load_state_dict_multi_output_with_transforms(self):
+        self._test_load_state_dict_base(num_outputs=3, include_yvar=True)
+
+    def test_load_state_dict_multi_output_with_transforms_no_yvar(self):
+        self._test_load_state_dict_base(num_outputs=3, include_yvar=False)
+
+    def test_load_state_dict_no_transforms(self):
+        tkwargs = {"device": self.device, "dtype": torch.double}
+
+        train_X = torch.rand(3, 2, **tkwargs)
+        train_X = torch.cat(
+            [train_X, torch.tensor([[-0.02, 11.1], [17.1, -2.5]], **tkwargs)], dim=0
+        )
+        train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
+
+        base_model = SingleTaskGP(
+            train_X=train_X, train_Y=train_Y, outcome_transform=None
+        )
+        original_train_targets = base_model.train_targets.clone()
+        state_dict = base_model.state_dict()
+
+        cv_model = SingleTaskGP(
+            train_X=train_X[:-1], train_Y=train_Y[:-1], outcome_transform=None
+        )
+        cv_model.load_state_dict(state_dict, keep_transforms=False)
+
+        self.assertTrue(
+            torch.equal(cv_model.train_targets, original_train_targets[:-1])
+        )

0 commit comments
