
Commit 9460560

Carl Hvarfner authored and facebook-github-bot committed
Preserving train inputs and targets through transforms (#3044)
Summary: This PR preserves BoTorch transforms (specifically outcome transforms, like Standardize) through state_dict loading. The fix also ensures that the train_targets of a leave-one-out model with outcome transforms will, in the default case, be the same as those of the base model, minus the point left out.

__Longer explanation:__ Transforms, and specifically learnable outcome transforms like Standardize, currently:

a. learn their parameters at initialization of the GP, and
b. transform the train_Ys into the normalized space.

Then, when we load a state dict, we:

a. impose new standardization parameters on already-standardized data, and
b. potentially make the transforms re-learnable, nullifying the change made by the state dict.

This has undesired consequences for cross-validation, as all cross-validated models effectively end up with different training data. In essence, _we don't simply leave one point out, but instead leave one out and re-standardize_. When the data contain outliers, this leads to substantially different predictions when the outlier is left out, since the outlier substantially impacts the outcome transform parameters.

Notebook explaining the effect with some plots: N8342965

Reviewed By: Balandat

Differential Revision: D84571407
1 parent f122efc commit 9460560
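To make the failure mode concrete, here is a minimal sketch (not part of the commit; the data and model sizes are illustrative) of how the keep_transforms=True path added by this PR keeps a leave-one-out model in the base model's standardized space:

import torch
from botorch.models.gp_regression import SingleTaskGP
from botorch.models.transforms.outcome import Standardize

# Toy data with one outlier; the outlier dominates the mean/std that
# Standardize learns when the model is constructed.
train_X = torch.rand(10, 2, dtype=torch.double)
train_Y = torch.randn(10, 1, dtype=torch.double)
train_Y[-1] = 50.0  # outlier

base = SingleTaskGP(train_X, train_Y, outcome_transform=Standardize(m=1))
state_dict = base.state_dict()

# A leave-one-out model built without the outlier. At construction, its
# Standardize learns different parameters, so its train_targets live in a
# different normalized space than the base model's.
loo = SingleTaskGP(train_X[:-1], train_Y[:-1], outcome_transform=Standardize(m=1))

# With keep_transforms=True, loading the state dict untransforms and then
# re-transforms the targets, so they match the base model's targets minus
# the left-out point, per the tests added in this commit.
loo.load_state_dict(state_dict, keep_transforms=True)
assert torch.allclose(loo.train_targets, base.train_targets[:-1])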

File tree

3 files changed (+328, −2)

botorch/models/gpytorch.py

Lines changed: 106 additions & 2 deletions
@@ -17,7 +17,7 @@
 import warnings
 from abc import ABC
 from copy import deepcopy
-from typing import Any, TYPE_CHECKING
+from typing import Any, Mapping, TYPE_CHECKING
 
 import torch
 from botorch.acquisition.objective import PosteriorTransform
@@ -45,7 +45,10 @@
 from botorch.utils.multitask import separate_mtmvn
 from botorch.utils.transforms import is_ensemble
 from gpytorch.distributions import MultitaskMultivariateNormal, MultivariateNormal
-from gpytorch.likelihoods.gaussian_likelihood import FixedNoiseGaussianLikelihood
+from gpytorch.likelihoods.gaussian_likelihood import (
+    _GaussianLikelihoodBase,
+    FixedNoiseGaussianLikelihood,
+)
 from linear_operator.operators import BlockDiagLinearOperator, CatLinearOperator
 from torch import broadcast_shapes, Tensor
 
@@ -283,6 +286,107 @@ def condition_on_observations(
         ).detach()
         return fantasy_model
 
+    def load_state_dict(
+        self,
+        state_dict: Mapping[str, Any],
+        strict: bool = True,
+        keep_transforms: bool = True,
+    ) -> None:
+        r"""Load the model state.
+
+        Args:
+            state_dict: A dict containing the state of the model.
+            strict: A boolean indicating whether to strictly enforce that the
+                keys in ``state_dict`` match the keys of this module.
+            keep_transforms: A boolean indicating whether to keep the input and
+                outcome transforms. Doing so is useful when loading a model that
+                was trained on a full set of data, and is later loaded with a
+                subset of the data.
+        """
+        # If `keep_transforms` is False, the transforms are reset to the default
+        # values and re-trained when the model is evaluated, which may lead to
+        # different behavior than when the initial model was trained, pre-loading.
+        if not keep_transforms:
+            super().load_state_dict(state_dict, strict)
+            return
+
+        # Checks that
+        # 1. the model has train targets (not necessarily true, e.g. for
+        #    ApproximateGP), and
+        # 2. the model accepts an outcome transform, and that it is not None.
+        should_outcome_transform = (
+            hasattr(self, "train_targets")
+            and getattr(self, "outcome_transform", None) is not None
+        )
+        with torch.no_grad():
+            retransformed_Y = None
+            untransformed_Yvar = None
+            # Multi-output models store the train targets with the output
+            # dimension first, so transpose them back to `n x m` here.
+            if self.num_outputs > 1:
+                untransformed_Y = self.train_targets.transpose(-1, -2)
+                if isinstance(self.likelihood, _GaussianLikelihoodBase):
+                    untransformed_Yvar = self.likelihood.noise_covar.noise.transpose(
+                        -1, -2
+                    )
+            else:
+                untransformed_Y = self.train_targets.unsqueeze(-1)
+                if isinstance(self.likelihood, _GaussianLikelihoodBase):
+                    untransformed_Yvar = self.likelihood.noise_covar.noise.unsqueeze(-1)
+
+            # NOTE: Some outcome transforms require an X, but the untransformed
+            # X's cannot generally be extracted without transformations & adding
+            # batch dimensions (e.g. in Warp). Thus, we use the train inputs.
+            X = self.train_inputs[0]
+
+            # We obtain the untransformed Y (the train_Y's) by untransforming
+            # the train targets.
+            if should_outcome_transform:
+                try:
+                    untransformed_Y, untransformed_Yvar = (
+                        self.outcome_transform.untransform(
+                            Y=untransformed_Y,
+                            Yvar=untransformed_Yvar,
+                            X=X,
+                        )
+                    )
+                except NotImplementedError:
+                    # If the outcome transform does not support untransforming,
+                    # we fall back to loading without preserving the transforms.
+                    warnings.warn(
+                        "Outcome transform does not support untransforming. "
+                        "Cannot load the state dict with transforms preserved. "
+                        "Setting keep_transforms=False.",
+                        stacklevel=3,
+                    )
+                    super().load_state_dict(state_dict, strict)
+                    return
+
+            super().load_state_dict(state_dict, strict)
+
+            # If we want to keep the transforms, we cannot have them in train
+            # mode. If we do, the transforms will be re-trained when the model
+            # is evaluated.
+            if getattr(self, "input_transform", None) is not None:
+                self.input_transform.eval()
+
+            # Now, the outcome transform is identical to the state_dict'ed
+            # model, so we may once again transform the train targets.
+            if should_outcome_transform:
+                self.outcome_transform.eval()
+                retransformed_Y, retransformed_Yvar = self.outcome_transform(
+                    Y=untransformed_Y, Yvar=untransformed_Yvar, X=X
+                )
+
+                # Not all models have self._transform_tensor_args, so we do
+                # this instead.
+                if self.num_outputs > 1:
+                    retransformed_Y = retransformed_Y.transpose(-1, -2)
+                    retransformed_Yvar = retransformed_Yvar.transpose(-1, -2)
+                else:
+                    retransformed_Y = retransformed_Y.squeeze(-1)
+                    retransformed_Yvar = retransformed_Yvar.squeeze(-1)
+                self.set_train_data(
+                    targets=retransformed_Y,
+                    strict=strict,
+                )
+                if isinstance(self.likelihood, _GaussianLikelihoodBase):
+                    self.likelihood.noise_covar.noise = retransformed_Yvar
+
 
 # pyre-fixme[13]: uninitialized attributes _num_outputs, _input_batch_shape,
 # _aug_batch_shape
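The transpose/unsqueeze bookkeeping above exists because BoTorch's batched multi-output models store train targets with the output dimension first, while outcome transforms operate on `n x m` arrays. A small sketch of that convention (shapes are illustrative, not from the commit):

import torch
from botorch.models.gp_regression import SingleTaskGP

train_X = torch.rand(5, 2, dtype=torch.double)
train_Y = torch.rand(5, 3, dtype=torch.double)  # n=5 points, m=3 outputs
model = SingleTaskGP(train_X, train_Y)

# The batched representation puts the output dimension first ...
print(model.train_targets.shape)  # torch.Size([3, 5])
# ... so load_state_dict transposes back to `n x m` before calling
# outcome_transform.untransform, and transposes again afterwards.
print(model.train_targets.transpose(-1, -2).shape)  # torch.Size([5, 3])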

botorch/models/likelihoods/sparse_outlier_noise.py

Lines changed: 8 additions & 0 deletions
@@ -479,3 +479,11 @@ def _optimal_rhos(self, mll: ExactMarginalLogLikelihood) -> Tensor:
         loo_error = loo_mean - Y
         optimal_rho_deltas = loo_error.square() - loo_var
         return (optimal_rho_deltas - self.rho).clamp(0)[~self.is_active]
+
+    @property
+    def noise(self) -> Tensor:
+        return self.base_noise.noise
+
+    @noise.setter
+    def noise(self, value: Tensor) -> None:
+        self.base_noise.initialize(noise=value)
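This property is needed because load_state_dict above assigns to likelihood.noise_covar.noise, and SparseOutlierNoise wraps a base noise module rather than holding the noise itself. A stripped-down sketch of the delegation pattern (the NoiseWrapper class is hypothetical, not the actual SparseOutlierNoise):

import torch
from torch import Tensor
from gpytorch.likelihoods.noise_models import HomoskedasticNoise

class NoiseWrapper(torch.nn.Module):
    """Forwards the `noise` attribute to a wrapped base noise module, so that
    generic code like `likelihood.noise_covar.noise = value` keeps working."""

    def __init__(self, base_noise: HomoskedasticNoise) -> None:
        super().__init__()
        self.base_noise = base_noise

    @property
    def noise(self) -> Tensor:
        return self.base_noise.noise

    @noise.setter
    def noise(self, value: Tensor) -> None:
        # initialize() is the standard GPyTorch way to set a constrained
        # parameter from a (transformed) value.
        self.base_noise.initialize(noise=value)

wrapper = NoiseWrapper(HomoskedasticNoise())
wrapper.noise = torch.tensor([0.1])
print(wrapper.noise)  # tensor([0.1000], ...)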

test/models/test_model.py

Lines changed: 214 additions & 0 deletions
@@ -9,12 +9,21 @@
 from botorch.acquisition.objective import PosteriorTransform
 from botorch.exceptions.errors import InputDataError
 from botorch.models.deterministic import GenericDeterministicModel
+from botorch.models.gp_regression import SingleTaskGP
 from botorch.models.model import Model, ModelDict, ModelList
+from botorch.models.transforms.input import Normalize, Round
+from botorch.models.transforms.outcome import Standardize
 from botorch.posteriors.ensemble import EnsemblePosterior
 from botorch.posteriors.posterior_list import PosteriorList
 from botorch.utils.datasets import SupervisedDataset
 from botorch.utils.testing import BotorchTestCase, MockModel, MockPosterior
 from torch import rand
+from torch.nn import Module
+
+
+class NonUntransformableOutcomeTransform(Standardize):
+    def untransform(self, **kwargs):
+        raise NotImplementedError
 
 
 class NotSoAbstractBaseModel(Model):
@@ -138,6 +147,211 @@ def test_posterior_transform(self):
         )
 
 
+def _get_input_output_transform(
+    d: int, m: int, use_transforms: bool = True
+) -> dict[str, Module]:
+    return {
+        "input_transform": Normalize(d=d) if use_transforms else None,
+        "outcome_transform": Standardize(m=m) if use_transforms else None,
+    }
+
+
+class TestTransformWarnings(BotorchTestCase):
+    def test_set_transformed_inputs_warning_no_train_inputs(self):
+        """Test warning when model has input_transform but no train_inputs."""
+        # Setup: Create a model with input_transform but without a train_inputs
+        # attribute.
+        model = NotSoAbstractBaseModel()
+        model.input_transform = Normalize(d=2)
+
+        # Execute: Call _set_transformed_inputs, which should trigger a warning.
+        with self.assertWarnsRegex(
+            RuntimeWarning,
+            "Could not update `train_inputs` with transformed inputs "
+            "since NotSoAbstractBaseModel does not have a `train_inputs` "
+            "attribute. Make sure that the `input_transform` is applied to "
+            "both the train inputs and test inputs.",
+        ):
+            model._set_transformed_inputs()
+
+    def test_load_state_dict_output_warnings(self):
+        """Test warning when outcome transform doesn't support untransforming."""
+        tkwargs = {"device": self.device, "dtype": torch.double}
+
+        train_X = torch.rand(3, 2, **tkwargs)
+        train_Y = torch.rand(3, 1, **tkwargs)
+
+        # Setup: Create a model with an untransformable outcome transform.
+        model = SingleTaskGP(
+            train_X=train_X,
+            train_Y=train_Y,
+            input_transform=Normalize(d=2),
+            outcome_transform=NonUntransformableOutcomeTransform(m=1),
+        )
+        state_dict = model.state_dict()
+
+        # Assert: Verify a warning is raised for the untransformable outcome
+        # transform.
+        with self.assertWarnsRegex(
+            UserWarning,
+            "Outcome transform does not support untransforming.*",
+        ):
+            model.load_state_dict(state_dict, keep_transforms=True)
+
+
+class TestLoadStateDict(BotorchTestCase):
+    def _test_load_state_dict_base(
+        self, num_outputs: int, include_yvar: bool = True
+    ) -> None:
+        """Base test helper for load_state_dict with transforms."""
+        tkwargs = {"device": self.device, "dtype": torch.double}
+
+        train_X = torch.rand(3, 2, **tkwargs)
+        train_X = torch.cat(
+            [train_X, torch.tensor([[-0.02, 11.1], [17.1, -2.5]], **tkwargs)], dim=0
+        )
+        train_Y = torch.sin(train_X).sum(dim=1, keepdim=True).repeat(1, num_outputs)
+
+        model_kwargs = {
+            "train_X": train_X,
+            "train_Y": train_Y,
+        }
+
+        if include_yvar:
+            train_Yvar = 0.1 * torch.rand_like(train_Y)
+            model_kwargs["train_Yvar"] = train_Yvar
+
+        base_model = SingleTaskGP(
+            **model_kwargs, **_get_input_output_transform(d=2, m=num_outputs)
+        )
+
+        original_train_inputs = base_model.input_transform(base_model.train_inputs[0])
+        original_train_targets = base_model.train_targets.clone()
+        original_train_yvar = base_model.likelihood.noise_covar.noise.clone()
+
+        state_dict = base_model.state_dict()
+
+        cv_model_kwargs = model_kwargs.copy()
+        cv_model_kwargs["train_X"] = train_X[:-1]
+        cv_model_kwargs["train_Y"] = train_Y[:-1]
+        if include_yvar:
+            cv_model_kwargs["train_Yvar"] = train_Yvar[:-1]
+        cv_model = SingleTaskGP(
+            **cv_model_kwargs, **_get_input_output_transform(d=2, m=num_outputs)
+        )
+
+        # Test keep_transforms=True.
+        cv_model.load_state_dict(state_dict, keep_transforms=True)
+
+        # Ensure the outcome transform is in eval mode and doesn't change its
+        # parameters.
+        sd_mean = cv_model.outcome_transform.means
+        cv_model.outcome_transform(train_Y[:-1])
+        self.assertTrue(torch.all(cv_model.outcome_transform.means == sd_mean))
+
+        # Check that the transform parameters match the state_dict.
+        self.assertTrue(
+            torch.allclose(
+                cv_model.input_transform._offset,
+                state_dict["input_transform._offset"],
+            )
+        )
+        self.assertTrue(
+            torch.allclose(
+                cv_model.outcome_transform.means,
+                state_dict["outcome_transform.means"],
+            )
+        )
+
+        # Verify that the train data is preserved in the transformed space.
+        self.assertAllClose(cv_model.train_targets, original_train_targets[..., :-1])
+        self.assertTrue(
+            torch.equal(
+                cv_model.input_transform(cv_model.train_inputs[0]),
+                original_train_inputs[..., :-1, :],
+            )
+        )
+        if include_yvar:
+            self.assertAllClose(
+                cv_model.likelihood.noise_covar.noise, original_train_yvar[..., :-1]
+            )
+
+        # Test keep_transforms=False (allows refitting).
+        cv_model = SingleTaskGP(
+            **cv_model_kwargs, **_get_input_output_transform(d=2, m=num_outputs)
+        )
+        cv_model.load_state_dict(state_dict, keep_transforms=False)
+
+        # The transforms should refit on the new data.
+        sd_mean = cv_model.outcome_transform.means
+        cv_model.outcome_transform(train_Y[:-1])
+        self.assertTrue(torch.all(cv_model.outcome_transform.means != sd_mean))
+
+        self.assertFalse(
+            torch.equal(
+                cv_model.input_transform(cv_model.train_inputs[0]),
+                original_train_inputs[..., :-1, :],
+            )
+        )
+        self.assertFalse(
+            torch.equal(cv_model.train_targets, original_train_targets[..., :-1])
+        )
+        self.assertFalse(
+            torch.equal(
+                cv_model.input_transform._offset,
+                state_dict["input_transform._offset"],
+            )
+        )
+        self.assertFalse(
+            torch.equal(
+                cv_model.outcome_transform.means,
+                state_dict["outcome_transform.means"],
+            )
+        )
+
+    def test_load_state_dict_with_transforms(self):
+        """Test load_state_dict with input and outcome transforms."""
+        self._test_load_state_dict_base(num_outputs=1, include_yvar=True)
+
+    def test_load_state_dict_with_transforms_no_yvar(self):
+        """Test load_state_dict with input and outcome transforms without Yvar."""
+        self._test_load_state_dict_base(num_outputs=1, include_yvar=False)
+
+    def test_load_state_dict_multi_output_with_transforms(self):
+        """Test load_state_dict with a multi-output model and transforms."""
+        self._test_load_state_dict_base(num_outputs=3, include_yvar=True)
+
+    def test_load_state_dict_multi_output_with_transforms_no_yvar(self):
+        """Test load_state_dict with a multi-output model and transforms, no Yvar."""
+        self._test_load_state_dict_base(num_outputs=3, include_yvar=False)
+
+    def test_load_state_dict_no_transforms(self):
+        """Test load_state_dict without any transforms."""
+        tkwargs = {"device": self.device, "dtype": torch.double}
+
+        train_X = torch.rand(3, 2, **tkwargs)
+        train_X = torch.cat(
+            [train_X, torch.tensor([[-0.02, 11.1], [17.1, -2.5]], **tkwargs)], dim=0
+        )
+        train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
+
+        base_model = SingleTaskGP(
+            train_X=train_X, train_Y=train_Y, outcome_transform=None
+        )
+        original_train_targets = base_model.train_targets.clone()
+        state_dict = base_model.state_dict()
+
+        cv_model = SingleTaskGP(
+            train_X=train_X[:-1], train_Y=train_Y[:-1], outcome_transform=None
+        )
+        cv_model.load_state_dict(state_dict, keep_transforms=False)
+
+        # Verify that the train targets are preserved.
+        self.assertTrue(
+            torch.equal(cv_model.train_targets, original_train_targets[:-1])
+        )
+
+
 class TestModelDict(BotorchTestCase):
     def test_model_dict(self):
         models = {"m1": MockModel(MockPosterior()), "m2": MockModel(MockPosterior())}
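Taken together, the workflow these tests exercise is the cross-validation pattern from the commit message. A sketch of what that might look like outside the test suite (the loo_models helper and the data are illustrative, not part of BoTorch):

import torch
from botorch.models.gp_regression import SingleTaskGP
from botorch.models.transforms.outcome import Standardize

def loo_models(train_X, train_Y, state_dict):
    """Yield leave-one-out models that share the base model's transforms."""
    n = train_X.shape[0]
    for i in range(n):
        mask = torch.arange(n) != i
        model = SingleTaskGP(
            train_X[mask], train_Y[mask], outcome_transform=Standardize(m=1)
        )
        # keep_transforms=True keeps each fold's targets in the base model's
        # standardized space instead of re-standardizing per fold.
        model.load_state_dict(state_dict, keep_transforms=True)
        yield i, model

train_X = torch.rand(8, 2, dtype=torch.double)
train_Y = torch.randn(8, 1, dtype=torch.double)
base = SingleTaskGP(train_X, train_Y, outcome_transform=Standardize(m=1))
for i, model in loo_models(train_X, train_Y, base.state_dict()):
    with torch.no_grad():
        # Prediction at the held-out point, in the original outcome space.
        loo_mean = model.posterior(train_X[i : i + 1]).mean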
