Commit e17f003

Carl Hvarfner authored and facebook-github-bot committed
Preserving train inputs and targets through transforms (#3044)
Summary:
This PR preserves botorch transforms (specifically outcome_transforms, like Standardize) through state_dict loading. The fix also ensures that the train_targets of a leave-one-out model with outcome transforms will, in the default case, have the same targets as the base model, minus the point left out.

__Longer explanation:__
Transforms, and specifically learnable outcome transforms like Standardize, currently:
a. learn their parameters at initialization of the GP, and
b. transform the train_Ys into the normalized space.

Then, when we load a state dict, we:
a. impose new standardization parameters on already-standardized data, and
b. potentially leave the transforms re-learnable, nullifying the change made by the state dict.

This has undesired consequences for cross-validation, as all cross-validated models effectively end up with different training data. In essence, _we don't simply leave one point out; we leave one out and re-standardize_. When the data contains outliers, this leads to substantially different predictions when an outlier is left out, since the outlier substantially impacts the outcome transform parameters.

TODO:
- Account for non-invertible transforms

Differential Revision: D84571407
1 parent f122efc commit e17f003
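To make the failure mode concrete, here is a minimal sketch of the leave-one-out scenario the summary describes. It is not part of the diff; the data and the outlier value are made up, while `SingleTaskGP`, `Standardize`, and the new `keep_transforms` argument follow the API changed in this PR:

```python
import torch
from botorch.models.gp_regression import SingleTaskGP
from botorch.models.transforms.outcome import Standardize

# Toy data with one extreme outlier in Y (values are made up).
train_X = torch.rand(5, 2, dtype=torch.double)
train_Y = torch.randn(5, 1, dtype=torch.double)
train_Y[-1] = 100.0

base = SingleTaskGP(train_X, train_Y, outcome_transform=Standardize(m=1))
state_dict = base.state_dict()

# Leave the outlier out: this model's Standardize learns very different
# means/stdvs than the base model's, so its train_targets live in a
# different normalized space.
loo = SingleTaskGP(train_X[:-1], train_Y[:-1], outcome_transform=Standardize(m=1))

# With keep_transforms=True (the new default), loading un-standardizes the
# fold's targets, restores the base model's transform parameters, and
# re-standardizes, so loo.train_targets ~= base.train_targets[:-1].
loo.load_state_dict(state_dict, keep_transforms=True)
```

With `keep_transforms=False`, the fold's `Standardize` instead refits on the remaining four points, so every cross-validation fold effectively trains on differently scaled targets.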

File tree

2 files changed: +348 -1 lines changed

botorch/models/model.py
test/models/test_model.py

botorch/models/model.py

Lines changed: 110 additions & 1 deletion
@@ -34,7 +34,10 @@
 from botorch.utils.containers import BotorchContainer
 from botorch.utils.datasets import SupervisedDataset
 from botorch.utils.transforms import is_fully_bayesian
-from gpytorch.likelihoods.gaussian_likelihood import FixedNoiseGaussianLikelihood
+from gpytorch.likelihoods.gaussian_likelihood import (
+    _GaussianLikelihoodBase,
+    FixedNoiseGaussianLikelihood,
+)
 from torch import Tensor
 from torch.nn import Module, ModuleDict, ModuleList
 from typing_extensions import Self
@@ -268,6 +271,112 @@ def train(self, mode: bool = True) -> Model:
     def dtypes_of_buffers(self) -> set[torch.dtype]:
         return {t.dtype for t in self.buffers() if t is not None}
 
+    def load_state_dict(
+        self,
+        state_dict: Mapping[str, Any],
+        strict: bool = True,
+        keep_transforms: bool = True,
+    ) -> None:
+        r"""Load the model state.
+
+        Args:
+            state_dict: A dict containing the state of the model.
+            strict: A boolean indicating whether to strictly enforce that the keys
+                in `state_dict` match the keys returned by this module's
+                `state_dict()` function.
+            keep_transforms: A boolean indicating whether to keep the input and
+                outcome transforms. Doing so is useful when loading a model that
+                was trained on a full set of data and is later loaded with a
+                subset of the data. If `keep_transforms=False`, the transforms are
+                reset to their default values and re-trained when the model is
+                evaluated, which may lead to different behavior than when the
+                initial model was trained, pre-loading. Yvar does not need to be
+                transformed, since it is saved as part of the state dict and will
+                thus always be in the right format.
+        """
+        if not keep_transforms:
+            super().load_state_dict(state_dict, strict)
+            return
+        should_input_transform = (
+            hasattr(self, "input_transform") and self.input_transform is not None
+        )
+        should_outcome_transform = (
+            hasattr(self, "outcome_transform") and self.outcome_transform is not None
+        )
+        with torch.no_grad():
+            retransformed_Y = None
+            untransformed_Yvar = None
+            if self._num_outputs > 1:
+                untransformed_Y = self.train_targets.transpose(-1, -2)
+                if isinstance(self.likelihood, _GaussianLikelihoodBase):
+                    untransformed_Yvar = self.likelihood.noise_covar.noise.transpose(
+                        -1, -2
+                    )
+            else:
+                untransformed_Y = self.train_targets.unsqueeze(-1)
+                if isinstance(self.likelihood, _GaussianLikelihoodBase):
+                    untransformed_Yvar = self.likelihood.noise_covar.noise.unsqueeze(-1)
+
+            untransformed_X = self.train_inputs[0]
+            if should_input_transform:
+                try:
+                    untransformed_X = self.input_transform.untransform(untransformed_X)
+                except NotImplementedError:
+                    # If the input transform does not support untransforming, we
+                    # keep the (already transformed) train inputs as they are.
+                    warnings.warn(
+                        "Input transform does not support untransforming. Cannot load "
+                        "the state dict with input transforms preserved. If the outcome "
+                        "transform requires the inputs to be computed, it will be "
+                        "computed on the transformed inputs."
+                    )
+
+            # We obtain the untransformed Y (the train_Y's) by untransforming the
+            # train targets.
+            if should_outcome_transform:
+                try:
+                    untransformed_Y, untransformed_Yvar = (
+                        self.outcome_transform.untransform(
+                            Y=untransformed_Y,
+                            Yvar=untransformed_Yvar,
+                            X=untransformed_X,
+                        )
+                    )
+                except NotImplementedError:
+                    # If the outcome transform does not support untransforming, we
+                    # fall back to loading the state dict without preserving the
+                    # transforms.
+                    warnings.warn(
+                        "Outcome transform does not support untransforming. Cannot "
+                        "load the state dict with transforms preserved. Setting "
+                        "keep_transforms=False.",
+                    )
+                    super().load_state_dict(state_dict, strict)
+                    return
+
+        super().load_state_dict(state_dict, strict)
+        # If we want to keep the transforms, we cannot have them in train mode when
+        # the state dict is loaded.
+        if should_input_transform:
+            self.input_transform.eval()
+
+        # Now, the outcome transform is identical to the state_dict'ed model, so we
+        # may once again transform the train targets.
+        if should_outcome_transform:
+            self.outcome_transform.eval()
+            retransformed_Y, retransformed_Yvar = self.outcome_transform(
+                Y=untransformed_Y, Yvar=untransformed_Yvar, X=untransformed_X
+            )
+
+            # Not all models have self._transform_tensor_args, so we do this instead.
+            if self._num_outputs > 1:
+                retransformed_Y = retransformed_Y.transpose(-1, -2)
+                retransformed_Yvar = retransformed_Yvar.transpose(-1, -2)
+            else:
+                retransformed_Y = retransformed_Y.squeeze(-1)
+                retransformed_Yvar = retransformed_Yvar.squeeze(-1)
+            self.set_train_data(
+                targets=retransformed_Y,
+                strict=strict,
+            )
+            if isinstance(self.likelihood, _GaussianLikelihoodBase):
+                self.likelihood.noise_covar.noise = retransformed_Yvar
 
 class FantasizeMixin(ABC):
     """

test/models/test_model.py

Lines changed: 238 additions & 0 deletions
@@ -9,12 +9,21 @@
 from botorch.acquisition.objective import PosteriorTransform
 from botorch.exceptions.errors import InputDataError
 from botorch.models.deterministic import GenericDeterministicModel
+from botorch.models.gp_regression import SingleTaskGP
 from botorch.models.model import Model, ModelDict, ModelList
+from botorch.models.transforms.input import Normalize, Round
+from botorch.models.transforms.outcome import Standardize
 from botorch.posteriors.ensemble import EnsemblePosterior
 from botorch.posteriors.posterior_list import PosteriorList
 from botorch.utils.datasets import SupervisedDataset
 from botorch.utils.testing import BotorchTestCase, MockModel, MockPosterior
 from torch import rand
+from torch.nn import Module
+
+
+class NonUntransformableOutcomeTransform(Standardize):
+    def untransform(self, **kwargs):
+        raise NotImplementedError
 
 
 class NotSoAbstractBaseModel(Model):
@@ -138,6 +147,235 @@ def test_posterior_transform(self):
         )
 
 
+def _get_input_output_transform(
+    d: int, m: int, use_transforms: bool = True
+) -> dict[str, Module]:
+    return {
+        "input_transform": Normalize(d=d) if use_transforms else None,
+        "outcome_transform": Standardize(m=m) if use_transforms else None,
+    }
+
+
+class TestTransformWarnings(BotorchTestCase):
+    def test_set_transformed_inputs_warning_no_train_inputs(self):
+        """Test warning when model has input_transform but no train_inputs."""
+        # Setup: Create a model with input_transform but without a train_inputs
+        # attribute.
+        model = NotSoAbstractBaseModel()
+        model.input_transform = Normalize(d=2)
+
+        # Execute: Call _set_transformed_inputs, which should trigger a warning.
+        # Assert: Verify the warning is raised.
+        with self.assertWarnsRegex(
+            RuntimeWarning,
+            "Could not update `train_inputs` with transformed inputs "
+            "since NotSoAbstractBaseModel does not have a `train_inputs` "
+            "attribute. Make sure that the `input_transform` is applied to "
+            "both the train inputs and test inputs.",
+        ):
+            model._set_transformed_inputs()
+
+    def test_load_state_dict_input_warnings(self):
+        """Test warning when the input transform doesn't support untransforming."""
+        tkwargs = {"device": self.device, "dtype": torch.double}
+
+        train_X = torch.rand(3, 2, **tkwargs)
+        train_Y = torch.rand(3, 1, **tkwargs)
+
+        # Setup: Create a model with an untransformable input transform.
+        model = SingleTaskGP(
+            train_X=train_X,
+            train_Y=train_Y,
+            input_transform=Round(),
+            outcome_transform=Standardize(m=1),
+        )
+        state_dict = model.state_dict()
+
+        # Execute: Load the state dict with keep_transforms=True.
+        # Assert: Verify a warning is raised for the untransformable input
+        # transform.
+        with self.assertWarnsRegex(
+            UserWarning,
+            "Input transform does not support untransforming.*",
+        ):
+            model.load_state_dict(state_dict, keep_transforms=True)
+
+    def test_load_state_dict_output_warnings(self):
+        """Test warning when the outcome transform doesn't support untransforming."""
+        tkwargs = {"device": self.device, "dtype": torch.double}
+
+        train_X = torch.rand(3, 2, **tkwargs)
+        train_Y = torch.rand(3, 1, **tkwargs)
+
+        # Setup: Create a model with an untransformable outcome transform.
+        model = SingleTaskGP(
+            train_X=train_X,
+            train_Y=train_Y,
+            input_transform=Normalize(d=2),
+            outcome_transform=NonUntransformableOutcomeTransform(m=1),
+        )
+        state_dict = model.state_dict()
+
+        # Assert: Verify a warning is raised for the untransformable outcome
+        # transform.
+        with self.assertWarnsRegex(
+            UserWarning,
+            "Outcome transform does not support untransforming.*",
+        ):
+            model.load_state_dict(state_dict, keep_transforms=True)
+
+
+class TestLoadStateDict(BotorchTestCase):
+    def _test_load_state_dict_base(
+        self, num_outputs: int, include_yvar: bool = True
+    ) -> None:
+        """Base test helper for load_state_dict with transforms."""
+        tkwargs = {"device": self.device, "dtype": torch.double}
+
+        train_X = torch.rand(3, 2, **tkwargs)
+        train_X = torch.cat(
+            [train_X, torch.tensor([[-0.02, 11.1], [17.1, -2.5]], **tkwargs)], dim=0
+        )
+        train_Y = torch.sin(train_X).sum(dim=1, keepdim=True).repeat(1, num_outputs)
+
+        model_kwargs = {
+            "train_X": train_X,
+            "train_Y": train_Y,
+        }
+
+        if include_yvar:
+            train_Yvar = 0.1 * torch.rand_like(train_Y)
+            model_kwargs["train_Yvar"] = train_Yvar
+
+        base_model = SingleTaskGP(
+            **model_kwargs, **_get_input_output_transform(d=2, m=num_outputs)
+        )
+
+        original_train_inputs = base_model.input_transform(base_model.train_inputs[0])
+        original_train_targets = base_model.train_targets.clone()
+        original_train_yvar = base_model.likelihood.noise_covar.noise.clone()
+
+        state_dict = base_model.state_dict()
+
+        cv_model_kwargs = model_kwargs.copy()
+        cv_model_kwargs["train_X"] = train_X[:-1]
+        cv_model_kwargs["train_Y"] = train_Y[:-1]
+        if include_yvar:
+            cv_model_kwargs["train_Yvar"] = train_Yvar[:-1]
+        cv_model = SingleTaskGP(
+            **cv_model_kwargs, **_get_input_output_transform(d=2, m=num_outputs)
+        )
+
+        # Test keep_transforms=True
+        cv_model.load_state_dict(state_dict, keep_transforms=True)
+
+        # Ensure the outcome transform is in eval mode and doesn't change parameters
+        sd_mean = cv_model.outcome_transform.means
+        cv_model.outcome_transform(train_Y[:-1])
+        self.assertTrue(torch.all(cv_model.outcome_transform.means == sd_mean))
+
+        # Check that the transform parameters match the state_dict
+        self.assertTrue(
+            torch.allclose(
+                cv_model.input_transform._offset,
+                state_dict["input_transform._offset"],
+            )
+        )
+        self.assertTrue(
+            torch.allclose(
+                cv_model.outcome_transform.means,
+                state_dict["outcome_transform.means"],
+            )
+        )
+
+        # Verify train data preservation in the transformed space
+        self.assertAllClose(cv_model.train_targets, original_train_targets[..., :-1])
+        self.assertTrue(
+            torch.equal(
+                cv_model.input_transform(cv_model.train_inputs[0]),
+                original_train_inputs[..., :-1, :],
+            )
+        )
+        if include_yvar:
+            self.assertAllClose(
+                cv_model.likelihood.noise_covar.noise, original_train_yvar[..., :-1]
+            )
+
+        # Test keep_transforms=False (allows refitting)
+        cv_model = SingleTaskGP(
+            **cv_model_kwargs, **_get_input_output_transform(d=2, m=num_outputs)
+        )
+        cv_model.load_state_dict(state_dict, keep_transforms=False)
+
+        # The transforms should refit on the new data
+        sd_mean = cv_model.outcome_transform.means
+        cv_model.outcome_transform(train_Y[:-1])
+        self.assertTrue(torch.all(cv_model.outcome_transform.means != sd_mean))
+
+        self.assertFalse(
+            torch.equal(
+                cv_model.input_transform(cv_model.train_inputs[0]),
+                original_train_inputs[..., :-1, :],
+            )
+        )
+        self.assertFalse(
+            torch.equal(cv_model.train_targets, original_train_targets[..., :-1])
+        )
+        self.assertFalse(
+            torch.equal(
+                cv_model.input_transform._offset,
+                state_dict["input_transform._offset"],
+            )
+        )
+        self.assertFalse(
+            torch.equal(
+                cv_model.outcome_transform.means,
+                state_dict["outcome_transform.means"],
+            )
+        )
+
+    def test_load_state_dict_with_transforms(self):
+        """Test load_state_dict with input and outcome transforms."""
+        self._test_load_state_dict_base(num_outputs=1, include_yvar=True)
+
+    def test_load_state_dict_with_transforms_no_yvar(self):
+        """Test load_state_dict with input and outcome transforms without Yvar."""
+        self._test_load_state_dict_base(num_outputs=1, include_yvar=False)
+
+    def test_load_state_dict_multi_output_with_transforms(self):
+        """Test load_state_dict with a multi-output model and transforms."""
+        self._test_load_state_dict_base(num_outputs=3, include_yvar=True)
+
+    def test_load_state_dict_multi_output_with_transforms_no_yvar(self):
+        """Test load_state_dict with a multi-output model and transforms, no Yvar."""
+        self._test_load_state_dict_base(num_outputs=3, include_yvar=False)
+
+    def test_load_state_dict_no_transforms(self):
+        """Test load_state_dict without any transforms."""
+        tkwargs = {"device": self.device, "dtype": torch.double}
+
+        train_X = torch.rand(3, 2, **tkwargs)
+        train_X = torch.cat(
+            [train_X, torch.tensor([[-0.02, 11.1], [17.1, -2.5]], **tkwargs)], dim=0
+        )
+        train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
+
+        base_model = SingleTaskGP(
+            train_X=train_X, train_Y=train_Y, outcome_transform=None
+        )
+        original_train_targets = base_model.train_targets.clone()
+        state_dict = base_model.state_dict()
+
+        cv_model = SingleTaskGP(
+            train_X=train_X[:-1], train_Y=train_Y[:-1], outcome_transform=None
+        )
+        cv_model.load_state_dict(state_dict, keep_transforms=False)
+
+        # Verify the train targets are preserved
+        self.assertTrue(
+            torch.equal(cv_model.train_targets, original_train_targets[:-1])
+        )
+
+
 class TestModelDict(BotorchTestCase):
     def test_model_dict(self):
         models = {"m1": MockModel(MockPosterior()), "m2": MockModel(MockPosterior())}
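As a usage sketch of the behavior these tests pin down (hypothetical data; the explicit fold loop below is not part of this PR, which only changes `load_state_dict`), leave-one-out cross-validation can now share the base model's transform parameters across all folds:

```python
import torch
from botorch.models.gp_regression import SingleTaskGP
from botorch.models.transforms.outcome import Standardize

train_X = torch.rand(8, 2, dtype=torch.double)
train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)

base = SingleTaskGP(train_X, train_Y, outcome_transform=Standardize(m=1))
state_dict = base.state_dict()

loo_models = []
for i in range(train_X.shape[0]):
    mask = torch.arange(train_X.shape[0]) != i
    loo = SingleTaskGP(
        train_X[mask], train_Y[mask], outcome_transform=Standardize(m=1)
    )
    # keep_transforms=True: every fold reuses the base model's Standardize
    # parameters instead of re-learning them on the n-1 remaining points.
    loo.load_state_dict(state_dict, keep_transforms=True)
    loo_models.append(loo)
```

This assumes each fold's model has the same architecture as the base model, so that strict state-dict loading succeeds; per the TODO above, non-invertible transforms still fall back to `keep_transforms=False` with a warning.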
