
Commit 1e66b0d

Carl Hvarfner authored and facebook-github-bot committed
Refactor of MultiTask / FullyBayesianMultiTaskGP to use ProductKernel & IndexKernel (#2908)
Summary:
X-link: facebook/Ax#3992
X-link: facebookexternal/botorch_fb#23

Modified MultiTaskGP and FullyBayesianMultiTaskGP to use an IndexKernel instead of two separate covar modules. For large matrices, this constitutes a significant speed-up (2-3x anecdotally) and an even larger decrease in memory use. In addition, this makes MultiTaskFBGP and SingleTaskFBGPs share a lot of code; I'll enable more code sharing between them in a subsequent diff. With some additional functionality in IndexKernel (i.e., structured learning of the covar_matrix elements), this change would apply to other MTGPs as well.

NOTE: Providing negative indices to an IndexKernel is not supported: pytorch/pytorch#76347

Reviewed By: saitcakmak

Differential Revision: D76317553
1 parent: 223656a

File tree

6 files changed: +170 -90 lines
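For context on the structure this refactor adopts, here is a minimal, self-contained sketch (not part of the commit; the column layout, shapes, and variable names are illustrative assumptions) of how a GPyTorch ProductKernel of a data kernel and an IndexKernel evaluates an ICM-style multi-task covariance from a single input tensor whose last column holds the task index:

import torch
from gpytorch.kernels import IndexKernel, MaternKernel

# Assumed layout: 2 data columns followed by 1 task column, 3 tasks.
d, num_tasks, task_feature = 2, 3, 2

# The data kernel only sees the non-task columns ...
data_kernel = MaternKernel(nu=2.5, ard_num_dims=d, active_dims=[0, 1])
# ... and the IndexKernel only sees the (non-negative) task column.
task_kernel = IndexKernel(num_tasks=num_tasks, rank=2, active_dims=[task_feature])

# `*` builds a ProductKernel: K((x, i), (x', j)) = K_data(x, x') * K_task(i, j).
covar_module = data_kernel * task_kernel

X = torch.rand(5, d)
tasks = torch.randint(num_tasks, (5, 1)).to(X)  # task ids stored as floats, as in train_X
K = covar_module(torch.cat([X, tasks], dim=-1)).to_dense()  # 5 x 5 ICM covariance

A single kernel call on the combined tensor replaces the previous pattern of evaluating a data covar module and a task covar module separately and multiplying the results by hand in `forward`.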

botorch/models/contextual_multioutput.py

Lines changed: 27 additions & 0 deletions

@@ -19,9 +19,11 @@
 from botorch.models.multitask import MultiTaskGP
 from botorch.models.transforms.input import InputTransform
 from botorch.models.transforms.outcome import OutcomeTransform
+from botorch.models.utils.gpytorch_modules import get_covar_module_with_dim_scaled_prior
 from botorch.utils.datasets import MultiTaskDataset, SupervisedDataset
 from botorch.utils.types import _DefaultType, DEFAULT
 from gpytorch.constraints import Interval
+from gpytorch.distributions import MultivariateNormal
 from gpytorch.kernels.rbf_kernel import RBFKernel
 from gpytorch.likelihoods.likelihood import Likelihood
 from gpytorch.module import Module
@@ -107,6 +109,13 @@ def __init__(
             outcome_transform=outcome_transform,
             input_transform=input_transform,
         )
+        # Overwriting the covar_module created in the parent class
+        if covar_module is None:
+            self.covar_module = get_covar_module_with_dim_scaled_prior(
+                ard_num_dims=self.num_non_task_features
+            )
+        else:
+            self.covar_module = covar_module
         self.device = train_X.device
         if all_tasks is None:
             all_tasks_tensor = train_X[:, task_feature].unique()
@@ -188,6 +197,10 @@ def task_covar_module(self, task_idcs: Tensor) -> Tensor:
         Returns:
             Task covariance matrix of shape (b x n x n).
         """
+        # NOTE: This can probably be re-written more efficiently using
+        # IndexKernel (or an IndexKernel subclass) and `evaluate_task_covar`,
+        # and then have the forward pass evaluate a ProductKernel of the two.
+
         # This is a tensor of shape (num_tasks x num_tasks).
         covar_matrix = self._eval_context_covar().to_dense()
         # Here, we index into the base covar matrix to extract
@@ -208,6 +221,20 @@
             covar_matrix[base_idx].transpose(-1, -2).gather(index=expanded_idx, dim=-2)
         )

+    def forward(self, x: Tensor) -> MultivariateNormal:
+        if self.training:
+            x = self.transform_inputs(x)
+        x_basic_lead, task_idcs, x_basic_trail = self._split_inputs(x)
+        x_basic = torch.cat([x_basic_lead, x_basic_trail], dim=-1)
+        # Compute base mean and covariance
+        mean_x = self.mean_module(x_basic)
+        covar_x = self.covar_module(x_basic)
+        # Compute task covariances
+        covar_i = self.task_covar_module(task_idcs)
+        # Combine the two in an ICM fashion
+        covar = covar_x.mul(covar_i)
+        return MultivariateNormal(mean_x, covar)
+
     @classmethod
     def construct_inputs(
         cls,
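The gather-based lookup in `task_covar_module` above can be hard to parse. Here is a standalone illustration (assuming a dense `t x t` context covariance; variable names are hypothetical, not from the file) of the operation it performs:

import torch

t = 4  # number of tasks/contexts
A = torch.randn(t, t)
covar_matrix = A @ A.T  # stand-in for _eval_context_covar().to_dense()

task_idcs = torch.tensor([0, 2, 2, 3, 1])  # task of each of n = 5 points
n = task_idcs.shape[0]

# What the method computes: K[i, j] = covar_matrix[task_idcs[i], task_idcs[j]].
rows = covar_matrix.index_select(0, task_idcs)    # (n x t)
K_task = rows.gather(1, task_idcs.expand(n, n))   # (n x n)

# Equivalent direct advanced indexing:
assert torch.allclose(K_task, covar_matrix[task_idcs][:, task_idcs])

The gather formulation keeps the operation batch-compatible, which is why the model prefers it over direct indexing.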

botorch/models/fully_bayesian_multitask.py

Lines changed: 27 additions & 16 deletions

@@ -14,6 +14,7 @@
 from botorch.acquisition.objective import PosteriorTransform
 from botorch.models.fully_bayesian import (
     matern52_kernel,
+    MCMC_DIM,
     MIN_INFERRED_NOISE_LEVEL,
     reshape_and_detach,
     SaasPyroModel,
@@ -22,7 +23,7 @@
 from botorch.models.multitask import MultiTaskGP
 from botorch.models.transforms.input import InputTransform
 from botorch.models.transforms.outcome import OutcomeTransform
-from botorch.posteriors.fully_bayesian import GaussianMixturePosterior, MCMC_DIM
+from botorch.posteriors.fully_bayesian import GaussianMixturePosterior
 from gpytorch.distributions import MultivariateNormal
 from gpytorch.kernels import MaternKernel
 from gpytorch.kernels.index_kernel import IndexKernel
@@ -66,6 +67,10 @@ def set_inputs(
             task_rank: The num of learned task embeddings to be used in the task kernel.
                 If omitted, use a full rank (i.e. number of tasks) kernel.
         """
+        # NOTE: PyTorch does not support negative indexing for tensors in index_select
+        # (https://github.com/pytorch/pytorch/issues/76347), so we have to make sure
+        # that the task feature is positive.
+        task_feature = task_feature % train_X.shape[-1]
         super().set_inputs(train_X, train_Y, train_Yvar)
         # obtain a list of task indicies
         all_tasks = train_X[:, task_feature].unique().to(dtype=torch.long).tolist()
@@ -140,15 +145,19 @@ def load_mcmc_samples(
         num_mcmc_samples = len(mcmc_samples["mean"])
         batch_shape = torch.Size([num_mcmc_samples])

-        mean_module, covar_module, likelihood, _ = super().load_mcmc_samples(
+        mean_module, data_covar_module, likelihood, _ = super().load_mcmc_samples(
             mcmc_samples=mcmc_samples
         )
+        data_indices = torch.arange(self.train_X.shape[-1] - 1)
+        data_indices[self.task_feature :] += 1  # exclude task feature

+        data_covar_module.active_dims = data_indices  # .to(tkwargs["device"])
         latent_covar_module = MaternKernel(
             nu=2.5,
             ard_num_dims=self.task_rank,
             batch_shape=batch_shape,
         ).to(**tkwargs)
+
         latent_covar_module.lengthscale = reshape_and_detach(
             target=latent_covar_module.lengthscale,
             new_value=mcmc_samples["task_lengthscale"],
@@ -159,22 +168,27 @@
             num_tasks=self.num_tasks,
             rank=self.task_rank,
             batch_shape=latent_features.shape[:-2],
-        ).to(**tkwargs)
+            active_dims=torch.tensor([self.task_feature]).to(tkwargs["device"]),
+        )
         task_covar_module.covar_factor = Parameter(
             task_covar.cholesky().to_dense().detach()
         )

-        # NOTE: 'var' is implicitly assumed to be zero from the sampling procedure in
-        # the FBMTGP model but not in the regular MTGP. I dont how if the var parameter
-        # affects predictions in practice, but setting it to zero is consistent with the
-        # previous implementation.
+        # NOTE: The IndexKernel has a learnable 'var' parameter in addition to the
+        # task covariances, corresponding to task-specific variances along the diagonal
+        # of the task covariance matrix. As this parameter is not sampled in `sample()`,
+        # we implicitly assume it to be zero. This is consistent with the previous
+        # SAASFBMTGP implementation, but means that the non-fully Bayesian and fully
+        # Bayesian models run on slightly different task covar modules.
+
+        # We set the aforementioned task covar module var parameter to zero here.
         task_covar_module.var = torch.zeros_like(task_covar_module.var)
-        return mean_module, covar_module, likelihood, task_covar_module
+        covar_module = data_covar_module * task_covar_module
+        return mean_module, covar_module, likelihood, None


 class SaasFullyBayesianMultiTaskGP(MultiTaskGP):
     r"""A fully Bayesian multi-task GP model with the SAAS prior.
-
     This model assumes that the inputs have been normalized to [0, 1]^d and that the
     output has been stratified standardized to have zero mean and unit variance for
     each task. The SAAS model [Eriksson2021saasbo]_ with a Matern-5/2 is used as data
@@ -286,8 +300,6 @@ def __init__(
         self.mean_module = None
         self.covar_module = None
         self.likelihood = None
-        self.task_covar_module = None
-        self.register_buffer("latent_features", None)
         if pyro_model is None:
             pyro_model = MultitaskSaasPyroModel()
         pyro_model.set_inputs(
@@ -321,21 +333,20 @@ def train(
         self.mean_module = None
         self.covar_module = None
         self.likelihood = None
-        self.task_covar_module = None
         return self

     @property
     def median_lengthscale(self) -> Tensor:
         r"""Median lengthscales across the MCMC samples."""
         self._check_if_fitted()
-        lengthscale = self.covar_module.base_kernel.lengthscale.clone()
+        lengthscale = self.covar_module.kernels[0].base_kernel.lengthscale.clone()
         return lengthscale.median(0).values.squeeze(0)

     @property
     def num_mcmc_samples(self) -> int:
         r"""Number of MCMC samples in the model."""
         self._check_if_fitted()
-        return len(self.covar_module.outputscale)
+        return self.covar_module.kernels[0].batch_shape[0]

     @property
     def batch_shape(self) -> torch.Size:
@@ -367,7 +378,7 @@ def load_mcmc_samples(self, mcmc_samples: dict[str, Tensor]) -> None:
             self.mean_module,
             self.covar_module,
             self.likelihood,
-            self.task_covar_module,
+            _,
         ) = self.pyro_model.load_mcmc_samples(mcmc_samples=mcmc_samples)

     def posterior(
@@ -438,7 +449,7 @@ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
             self.mean_module,
             self.covar_module,
             self.likelihood,
-            self.task_covar_module,
+            _,  # Possibly space for input transform
         ) = self.pyro_model.load_mcmc_samples(mcmc_samples=mcmc_samples)
         # Load the actual samples from the state dict
         super().load_state_dict(state_dict=state_dict, strict=strict)
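To make the new `load_mcmc_samples` flow concrete, here is a rough sketch (hypothetical shapes and a simplified construction, not the model's actual code path) of how batched task covariances drawn via MCMC can be baked into a batched IndexKernel and combined with a batched data kernel:

import torch
from torch.nn import Parameter
from gpytorch.kernels import IndexKernel, MaternKernel

num_mcmc_samples, num_tasks, d = 16, 3, 2
batch_shape = torch.Size([num_mcmc_samples])

# Hypothetical per-sample task covariances (num_mcmc_samples x num_tasks x num_tasks).
L = torch.randn(*batch_shape, num_tasks, num_tasks)
task_covar = L @ L.transpose(-1, -2) + 0.1 * torch.eye(num_tasks)

task_covar_module = IndexKernel(
    num_tasks=num_tasks,
    rank=num_tasks,  # full rank here; the model uses its task_rank
    batch_shape=batch_shape,
    active_dims=[d],  # task column assumed to be last
)
# Overwrite the factor with the sampled (batched) Cholesky factors, and pin the
# unsampled 'var' diagonal to zero, as the diff does for consistency.
task_covar_module.covar_factor = Parameter(torch.linalg.cholesky(task_covar).detach())
task_covar_module.var = torch.zeros_like(task_covar_module.var)

data_covar_module = MaternKernel(
    nu=2.5, ard_num_dims=d, batch_shape=batch_shape, active_dims=[0, 1]
)
covar_module = data_covar_module * task_covar_module  # one batched ProductKernel

Each MCMC sample then corresponds to one batch entry of the product kernel, which is what `num_mcmc_samples` reads back via `covar_module.kernels[0].batch_shape[0]`.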

botorch/models/multitask.py

Lines changed: 47 additions & 33 deletions

@@ -172,6 +172,11 @@ def __init__(
             X=train_X, input_transform=input_transform
         )
         self._validate_tensor_args(X=transformed_X, Y=train_Y, Yvar=train_Yvar)
+
+        # IndexKernel cannot work with negative task features, so we shift them to
+        # be positive here.
+        if task_feature < 0:
+            task_feature += transformed_X.shape[-1]
         (
             all_tasks_inferred,
             task_feature,
@@ -220,16 +225,29 @@
         )
         self.mean_module = mean_module or ConstantMean()
         if covar_module is None:
-            self.covar_module = get_covar_module_with_dim_scaled_prior(
-                ard_num_dims=self.num_non_task_features
+            data_covar_module = get_covar_module_with_dim_scaled_prior(
+                ard_num_dims=self.num_non_task_features,
+                active_dims=self._base_idxr,
             )
         else:
-            self.covar_module = covar_module
+            data_covar_module = covar_module
+            # This check enables models which don't adhere to the convention (e.g.
+            # adding additional feature dimensions, like HeteroMTGP) to be used.
+            if covar_module.active_dims is None:
+                # Since we no longer use the custom indexing which derived the
+                # task indexing in the forward pass, we need to explicitly set
+                # the active dims here to ensure that the forward pass works.
+                data_covar_module.active_dims = self._base_idxr

         self._rank = rank if rank is not None else self.num_tasks
-        self.task_covar_module = IndexKernel(
-            num_tasks=self.num_tasks, rank=self._rank, prior=task_covar_prior
+        task_covar_module = IndexKernel(
+            num_tasks=self.num_tasks,
+            rank=self._rank,
+            prior=task_covar_prior,
+            active_dims=[task_feature],
         )
+
+        self.covar_module = data_covar_module * task_covar_module
         task_mapper = get_task_value_remapping(
             task_values=torch.tensor(
                 all_tasks, dtype=torch.long, device=train_X.device
@@ -244,45 +262,41 @@
         self.outcome_transform = outcome_transform
         self.to(train_X)

-    def _split_inputs(self, x: Tensor) -> tuple[Tensor, Tensor]:
-        r"""Extracts base features and task indices from input data.
+    def _split_inputs(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
+        r"""Extracts features before the task feature, task indices, and features
+        after the task feature.

         Args:
             x: The full input tensor with trailing dimension of size `d + 1`.
                 Should be of float/double data type.

         Returns:
-            2-element tuple containing
-
-            - A `q x d` or `b x q x d` (batch mode) tensor with trailing
-              dimension made up of the `d` non-task-index columns of `x`, arranged
-              in the order as specified by the indexer generated during model
-              instantiation.
-            - A `q` or `b x q` (batch mode) tensor of long data type containing
-              the task indices.
+            3-element tuple containing
+
+            - A `q x d` or `b x q x d` tensor with features before the task feature.
+            - A `q` or `b x q` tensor with mapped task indices.
+            - A `q x d` or `b x q x d` tensor with features after the task feature.
         """
-        batch_shape, d = x.shape[:-2], x.shape[-1]
-        x_basic = x[..., self._base_idxr].view(batch_shape + torch.Size([-1, d - 1]))
-        task_idcs = (
-            x[..., self._task_feature]
-            .view(batch_shape + torch.Size([-1, 1]))
-            .to(dtype=torch.long)
-        )
-        task_idcs = self._map_tasks(task_values=task_idcs)
-        return x_basic, task_idcs
+        batch_shape = x.shape[:-2]
+        # Extract task indices and convert to long
+        task_idcs = x[..., self._task_feature].view(batch_shape + torch.Size([-1, 1]))
+        task_idcs = self._map_tasks(task_values=task_idcs.to(dtype=torch.long))
+
+        # Extract features before and after the task feature
+        x_before = x[..., : self._task_feature]
+        x_after = x[..., (self._task_feature + 1) :]
+        return x_before, task_idcs, x_after

     def forward(self, x: Tensor) -> MultivariateNormal:
         if self.training:
             x = self.transform_inputs(x)
-        x_basic, task_idcs = self._split_inputs(x)
-        # Compute base mean and covariance
-        mean_x = self.mean_module(x_basic)
-        covar_x = self.covar_module(x_basic)
-        # Compute task covariances
-        covar_i = self.task_covar_module(task_idcs)
-        # Combine the two in an ICM fashion
-        covar = covar_x.mul(covar_i)
-        return MultivariateNormal(mean_x, covar)
+
+        # Get features before the task feature, task indices, and features after the
+        # task feature, with the task mapping applied to the task indices.
+        x = torch.cat(self._split_inputs(x), dim=-1)
+        mean_x = self.mean_module(x)
+        covar_x = self.covar_module(x)
+        return MultivariateNormal(mean_x, covar_x)

     @classmethod
     def get_all_tasks(
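One subtlety in the new forward pass above: `_split_inputs` returns three pieces that are concatenated back in their original column order, so the `active_dims` fixed at construction time still select the right columns. A toy check (column positions assumed; the model additionally casts the task column to long and remaps task values, which is omitted here):

import torch

x = torch.tensor([[0.10, 2.0, 0.50],
                  [0.30, 0.0, 0.90]])  # task ids (integral floats) in column 1
task_feature = 1

x_before = x[..., :task_feature]                     # columns before the task column
task_idcs = x[..., task_feature : task_feature + 1]  # the task column itself
x_after = x[..., task_feature + 1 :]                 # columns after the task column

recombined = torch.cat([x_before, task_idcs, x_after], dim=-1)
assert torch.equal(recombined, x)  # column order is preserved

Because positions are preserved, the data kernel's `active_dims` (here `[0, 2]`) and the IndexKernel's `active_dims` (here `[1]`) pick out the intended columns of the concatenated tensor.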

test/models/test_contextual_multioutput.py

Lines changed: 11 additions & 0 deletions

@@ -13,6 +13,7 @@
 from botorch.utils.test_helpers import gen_multi_task_dataset
 from botorch.utils.testing import BotorchTestCase
 from gpytorch.distributions import MultitaskMultivariateNormal, MultivariateNormal
+from gpytorch.kernels import MaternKernel
 from gpytorch.mlls.exact_marginal_log_likelihood import ExactMarginalLogLikelihood
 from linear_operator.operators import LinearOperator
 from linear_operator.operators.interpolated_linear_operator import (
@@ -101,6 +102,16 @@ def test_LCEMGP(self):
             right_interp_indices=task_idcs,
         ).to_dense()
         self.assertAllClose(previous_covar, model.task_covar_module(task_idcs))
+        custom_covar_module = MaternKernel()
+        model_custom_covar = LCEMGP(
+            train_X=train_x,
+            train_Y=train_y,
+            task_feature=task_feature,
+            embs_dim_list=[2],  # increase dim from 1 to 2
+            context_emb_feature=torch.tensor([[0.2], [0.3]]),
+            covar_module=custom_covar_module,
+        )
+        self.assertIsInstance(model_custom_covar.covar_module, MaternKernel)

     def test_construct_inputs(self) -> None:
         for with_embedding_inputs, yvar, skip_task_features_in_datasets in zip(
