Pathwise Thompson sampling for ensemble models (#2877)

SebastianAment · facebook-github-bot · commit ac56c93e77e8 · 2025-06-11T15:37:08.000-07:00
Summary: Pull Request resolved: #2877 This commit adds support for pathwise Thompson sampling for ensemble models, including fully Bayesian SAAS models. Differential Revision: D75990595
diff --git a/botorch/acquisition/thompson_sampling.py b/botorch/acquisition/thompson_sampling.py
@@ -9,7 +9,7 @@
 from botorch.acquisition.objective import PosteriorTransform
 from botorch.models.model import Model
 from botorch.sampling.pathwise.posterior_samplers import get_matheron_path_model
-from botorch.utils.transforms import t_batch_mode_transform
+from botorch.utils.transforms import is_ensemble, t_batch_mode_transform
 from torch import Tensor
 
 
@@ -42,45 +42,88 @@ def __init__(
                 a PosteriorTransform that transforms the multi-output posterior into a
                 single-output posterior is required.
         """
-        if model._is_fully_bayesian:
-            raise NotImplementedError(
-                "PathwiseThompsonSampling is not supported for fully Bayesian models",
-            )
 
         super().__init__(model=model)
         self.batch_size: int | None = None
 
-    def redraw(self) -> None:
+    def redraw(self, batch_size: int) -> None:
+        sample_shape = (batch_size,)
         self.samples = get_matheron_path_model(
-            model=self.model, sample_shape=torch.Size([self.batch_size])
+            model=self.model, sample_shape=torch.Size(sample_shape)
         )
+        if is_ensemble(self.model):
+            # the ensembling dimension is assumed to be part of the batch shape
+            # could add a dedicated property to keep track of the ensembling dimension
+            # i.e. generalizing num_mcmc_samples in AbstractFullyBayesianSingleTaskGP
+            model_batch_shape = self.model.batch_shape
+            if len(model_batch_shape) > 1:
+                raise NotImplementedError(
+                    "Ensemble models with more than one ensemble dimension are not "
+                    "yet supported."
+                )
+            num_ensemble = model_batch_shape[0]
+            self.ensemble_indices = torch.randint(
+                0, num_ensemble, (*sample_shape, 1, self.model.num_outputs)
+            )
 
     @t_batch_mode_transform()
     def forward(self, X: Tensor) -> Tensor:
         r"""Evaluate the pathwise posterior sample draws on the candidate set X.
 
         Args:
-            X: A `(b1 x ... bk) x 1 x d`-dim batched tensor of `d`-dim design points.
+            X: A `batch_shape x q x d`-dim batched tensor of `d`-dim design points.
 
         Returns:
-            A `(b1 x ... bk) x [num_models for fully bayesian]`-dim tensor of
-            evaluations on the posterior sample draws.
+            A `batch_shape [x m]`-dim tensor of evaluations on the posterior sample
+            draws, where `m` is the number of outputs of the model.
         """
         batch_size = X.shape[-2]
         q_dim = -2
-
         # batch_shape x q x 1 x d
         X = X.unsqueeze(-2)
         if self.batch_size is None:
             self.batch_size = batch_size
-            self.redraw()
+            self.redraw(batch_size=batch_size)
         elif self.batch_size != batch_size:
             raise ValueError(
                 BATCH_SIZE_CHANGE_ERROR.format(self.batch_size, batch_size)
             )
-
-        # posterior_values.shape post-squeeze:
+        # batch_shape x q [x num_ensembles] x 1 x m
+        posterior_values = self.samples(X)
+        # batch_shape x q [x num_ensembles] x m
+        posterior_values = posterior_values.squeeze(-2)
         # batch_shape x q x m
-        posterior_values = self.samples(X).squeeze(-2)
-        # sum over batch dim and squeeze num_objectives dim (-1)
-        return posterior_values.sum(q_dim).squeeze(-1)
+        posterior_values = self.select_from_ensemble_models(values=posterior_values)
+        # NOTE: can leverage batched L-BFGS computation instead of summing in the future
+        # sum over batch dim and squeeze num_objectives dim (-1): batch_shape [x m]
+        acqf_vals = posterior_values.sum(q_dim).squeeze(-1)
+        return acqf_vals
+
+    def select_from_ensemble_models(self, values: Tensor):
+        """Subselecting a value associated with a single sample in the ensemble for each
+        element of samples that is not associated with an ensemble dimension. NOTE: uses
+        `self.model` and `is_ensemble` to determine whether or not an ensembling
+        dimension is present.
+
+        Args:
+            values: A `batch_shape x num_draws x q [x num_ensemble] x m`-dim Tensor.
+
+        Returns:
+            A `batch_shape x num_draws x q x m`-dim Tensor where each element was
+            chosen independently at random from the ensemble dimension.
+        """
+        if not is_ensemble(self.model):
+            return values
+
+        ensemble_dim = -2
+        # `ensemble_indices` are fixed so that the acquisition function becomes
+        # deterministic for the same input and can be optimized with LBFGS.
+        # ensemble indices have shape num_paths x 1 x m
+        index = self.ensemble_indices
+        input_batch_shape = values.shape[:-3]
+        index = index.expand(*input_batch_shape, *index.shape)
+        # samples is batch_shape x q x num_ensemble x m
+        values_wo_ensemble = torch.gather(values, dim=ensemble_dim, index=index)
+        return values_wo_ensemble.squeeze(
+            ensemble_dim
+        )  # removing the ensemble dimension
diff --git a/botorch/sampling/pathwise/paths.py b/botorch/sampling/pathwise/paths.py
@@ -147,6 +147,7 @@ def __init__(
         bias_module: Module | None = None,
         input_transform: TInputTransform | None = None,
         output_transform: TOutputTransform | None = None,
+        is_ensemble: bool = False,
     ):
         r"""Initializes a GeneralizedLinearPath instance.
 
@@ -161,6 +162,7 @@ def __init__(
             bias_module: An optional module used to define additive offsets.
             input_transform: An optional input transform for the module.
             output_transform: An optional output transform for the module.
+            is_ensemble: Whether the associated model is an ensemble model or not.
         """
         super().__init__()
         self.feature_map = feature_map
@@ -170,8 +172,13 @@ def __init__(
         self.bias_module = bias_module
         self.input_transform = input_transform
         self.output_transform = output_transform
+        self.is_ensemble = is_ensemble
 
     def forward(self, x: Tensor, **kwargs) -> Tensor:
+        if self.is_ensemble:
+            # assuming that the ensembling dimension is added after (n, d), but
+            # before the other batch dimensions, starting from the left.
+            x = x.unsqueeze(-3)
         feat = self.feature_map(x, **kwargs)
         out = (feat @ self.weight.unsqueeze(-1)).squeeze(-1)
         return out if self.bias_module is None else out + self.bias_module(x)
diff --git a/botorch/sampling/pathwise/prior_samplers.py b/botorch/sampling/pathwise/prior_samplers.py
@@ -24,6 +24,7 @@
 )
 from botorch.utils.dispatcher import Dispatcher
 from botorch.utils.sampling import draw_sobol_normal_samples
+from botorch.utils.transforms import is_ensemble
 from gpytorch.kernels import Kernel
 from gpytorch.models import ApproximateGP, ExactGP, GP
 from gpytorch.variational import _VariationalStrategy
@@ -61,6 +62,7 @@ def _draw_kernel_feature_paths_fallback(
     input_transform: TInputTransform | None = None,
     output_transform: TOutputTransform | None = None,
     weight_generator: Callable[[Size], Tensor] | None = None,
+    is_ensemble: bool = False,
 ) -> GeneralizedLinearPath:
     # Generate a kernel feature map
     feature_map = map_generator(
@@ -89,6 +91,7 @@ def _draw_kernel_feature_paths_fallback(
         bias_module=mean_module,
         input_transform=input_transform,
         output_transform=output_transform,
+        is_ensemble=is_ensemble,
     )
 
 
@@ -103,6 +106,7 @@ def _draw_kernel_feature_paths_ExactGP(
         covar_module=model.covar_module,
         input_transform=get_input_transform(model),
         output_transform=get_output_transform(model),
+        is_ensemble=is_ensemble(model),
         **kwargs,
     )
 
@@ -150,5 +154,6 @@ def _draw_kernel_feature_paths_ApproximateGP_fallback(
         num_inputs=num_inputs,
         mean_module=model.mean_module,
         covar_module=model.covar_module,
+        is_ensemble=is_ensemble(model),
         **kwargs,
     )
diff --git a/botorch/sampling/pathwise/update_strategies.py b/botorch/sampling/pathwise/update_strategies.py
@@ -13,6 +13,7 @@
 from typing import Any
 
 import torch
+
 from botorch.models.approximate_gp import ApproximateGPyTorchModel
 from botorch.models.transforms.input import InputTransform
 from botorch.sampling.pathwise.features import KernelEvaluationMap
@@ -24,6 +25,7 @@
     TInputTransform,
 )
 from botorch.utils.dispatcher import Dispatcher
+from botorch.utils.transforms import is_ensemble
 from botorch.utils.types import DEFAULT
 from gpytorch.kernels.kernel import Kernel
 from gpytorch.likelihoods import _GaussianLikelihoodBase, Likelihood
@@ -79,6 +81,7 @@ def _gaussian_update_exact(
     noise_covariance: Tensor | LinearOperator | None = None,
     scale_tril: Tensor | LinearOperator | None = None,
     input_transform: TInputTransform | None = None,
+    is_ensemble: bool = False,
 ) -> GeneralizedLinearPath:
     # Prepare Cholesky factor of `Cov(y, y)` and noise sample values as needed
     if isinstance(noise_covariance, (NoneType, ZeroLinearOperator)):
@@ -103,7 +106,9 @@ def _gaussian_update_exact(
         points=points,
         input_transform=input_transform,
     )
-    return GeneralizedLinearPath(feature_map=feature_map, weight=weight.squeeze(-1))
+    return GeneralizedLinearPath(
+        feature_map=feature_map, weight=weight.squeeze(-1), is_ensemble=is_ensemble
+    )
 
 
 @GaussianUpdate.register(ExactGP, _GaussianLikelihoodBase)
@@ -134,6 +139,7 @@ def _gaussian_update_ExactGP(
         noise_covariance=noise_covariance,
         scale_tril=scale_tril,
         input_transform=get_input_transform(model),
+        is_ensemble=is_ensemble(model),
     )
 
 
@@ -194,4 +200,5 @@ def _gaussian_update_ApproximateGP_VariationalStrategy(
         sample_values=sample_values,
         scale_tril=L,
         input_transform=input_transform,
+        is_ensemble=is_ensemble(model),
     )
diff --git a/botorch/utils/transforms.py b/botorch/utils/transforms.py
@@ -293,6 +293,7 @@ def decorated(
                     f"Expected X to be `batch_shape x q={expected_q} x d`, but"
                     f" got X with shape {X.shape}."
                 )
+            X_original_shape = X.shape
             # add t-batch dim
             X = X if X.dim() > 2 else X.unsqueeze(0)
             output = method(acqf, X, *args, **kwargs)
@@ -306,6 +307,8 @@ def decorated(
                     "X, or the `model.batch_shape` in the case of acquisition "
                     "functions using batch models; but got output with shape "
                     f"{output.shape} for X with shape {X.shape}."
+                    f"The original X shape was {X_original_shape} before the "
+                    "t_batch_mode_transform decorator modified it."
                 )
             return output
 
diff --git a/test/acquisition/test_thompson_sampling.py b/test/acquisition/test_thompson_sampling.py
@@ -6,6 +6,9 @@
 
 from itertools import product
 
+from unittest import mock
+from unittest.mock import PropertyMock
+
 import torch
 from botorch.acquisition.thompson_sampling import PathwiseThompsonSampling
 from botorch.models.fully_bayesian import SaasFullyBayesianSingleTaskGP
@@ -30,7 +33,7 @@ def get_fully_bayesian_model(
     train_Y,
     num_models,
     **tkwargs,
-):
+) -> SaasFullyBayesianSingleTaskGP:
     model = SaasFullyBayesianSingleTaskGP(
         train_X=train_X,
         train_Y=train_Y,
@@ -59,7 +62,7 @@ def _test_thompson_sampling_base(self, model: Model):
 
         acq_pass1 = acq(test_X)
         self.assertAllClose(acq_pass1, acq(test_X))
-        acq.redraw()
+        acq.redraw(batch_size=acq.batch_size)
         acq_pass2 = acq(test_X)
         self.assertFalse(torch.allclose(acq_pass1, acq_pass2))
 
@@ -109,10 +112,27 @@ def test_thompson_sampling_fully_bayesian(self):
         tkwargs = {"device": self.device, "dtype": torch.float64}
         train_X = torch.rand(4, input_dim, **tkwargs)
         train_Y = 10 * torch.rand(4, num_objectives, **tkwargs)
-
         fb_model = get_fully_bayesian_model(train_X, train_Y, num_models=3, **tkwargs)
-        with self.assertRaisesRegex(
-            NotImplementedError,
-            "PathwiseThompsonSampling is not supported for fully Bayesian models",
-        ):
-            PathwiseThompsonSampling(model=fb_model)
+        acqf = PathwiseThompsonSampling(model=fb_model)
+        acqf_vals = acqf(train_X)
+
+        acqf_vals_2 = acqf(train_X)
+
+        self.assertAllClose(acqf_vals, acqf_vals_2)
+
+        batch_shape = (2, 5)
+        test_X = torch.randn(*batch_shape, *train_X.shape)
+        batched_output = acqf(test_X)
+        self.assertEqual(batched_output.shape, batch_shape)
+        batched_output_2 = acqf(test_X)
+        self.assertAllClose(batched_output, batched_output_2)
+
+        with mock.patch.object(
+            type(acqf.model), "batch_shape", new_callable=PropertyMock
+        ) as mock_batch_shape:
+            mock_batch_shape.return_value = (2, 3)
+            with self.assertRaisesRegex(
+                NotImplementedError,
+                "Ensemble models with more than one ensemble dimension",
+            ):
+                acqf.redraw(batch_size=2)