
Commit 054cd5c

Merge branch 'main' into entropy_search
2 parents fb459a1 + 706e2a1

File tree

6 files changed, +293 -36 lines changed


botorch/models/transforms/input.py

Lines changed: 57 additions & 18 deletions
@@ -12,19 +12,19 @@
 rounding functions, and log transformations. The input transformation
 is typically part of a Model and applied within the model.forward()
 method.
-
 """
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
 from collections import OrderedDict
 from typing import Any, Callable, Dict, List, Optional, Union
+from warnings import warn
 
 import torch
 from botorch.exceptions.errors import BotorchTensorDimensionError
 from botorch.models.transforms.utils import subset_transform
 from botorch.models.utils import fantasize
-from botorch.utils.rounding import approximate_round
+from botorch.utils.rounding import approximate_round, OneHotArgmaxSTE, RoundSTE
 from gpytorch import Module as GPyTorchModule
 from gpytorch.constraints import GreaterThan
 from gpytorch.priors import Prior
@@ -649,10 +649,10 @@ def _update_coefficients(self, X: Tensor) -> None:
 
 
 class Round(InputTransform, Module):
-    r"""A rounding transformation for integer inputs.
+    r"""A discretization transformation for discrete inputs.
 
-    This will typically be used in conjunction with normalization as
-    follows:
+    For integers, this will typically be used in conjunction
+    with normalization as follows:
 
     In eval() mode (i.e. after training), the inputs pass
     would typically be normalized to the unit cube (e.g. during candidate
@@ -667,19 +667,26 @@ class Round(InputTransform, Module):
     should be set to False, so that the raw inputs are rounded and then
     normalized to the unit cube.
 
-    This transformation uses differentiable approximate rounding by default.
-    The rounding function is approximated with a piece-wise function where
-    each piece is a hyperbolic tangent function.
+    By default, the straight through estimators are used for the gradients as
+    proposed in [Daulton2022bopr]_. This transformation supports differentiable
+    approximate rounding (currently only for integers). The rounding function
+    is approximated with a piece-wise function where each piece is a hyperbolic
+    tangent function.
+
+    For categorical parameters, the input must be one-hot encoded.
 
     Example:
+        >>> bounds = torch.tensor([[0, 5], [0, 1], [0, 1]]).t()
+        >>> integer_indices = [0]
+        >>> categorical_features = {1: 2}
         >>> unnormalize_tf = Normalize(
         >>>     d=d,
         >>>     bounds=bounds,
         >>>     transform_on_eval=True,
         >>>     transform_on_train=True,
         >>>     reverse=True,
         >>> )
-        >>> round_tf = Round(integer_indices)
+        >>> round_tf = Round(integer_indices, categorical_features)
         >>> normalize_tf = Normalize(d=d, bounds=bounds)
         >>> tf = ChainedInputTransform(
         >>>     tf1=unnormalize_tf, tf2=round_tf, tf3=normalize_tf
@@ -688,46 +695,76 @@ class Round(InputTransform, Module):
 
     def __init__(
         self,
-        indices: List[int],
+        integer_indices: Optional[List[int]] = None,
+        categorical_features: Optional[Dict[int, int]] = None,
         transform_on_train: bool = True,
        transform_on_eval: bool = True,
         transform_on_fantasize: bool = True,
-        approximate: bool = True,
+        approximate: bool = False,
         tau: float = 1e-3,
+        **kwargs,
     ) -> None:
         r"""Initialize transform.
 
         Args:
-            indices: The indices of the integer inputs.
+            integer_indices: The indices of the integer inputs.
+            categorical_features: A dictionary mapping the starting index of each
+                categorical feature to its cardinality. This assumes that categoricals
+                are one-hot encoded.
             transform_on_train: A boolean indicating whether to apply the
                 transforms in train() mode. Default: True.
             transform_on_eval: A boolean indicating whether to apply the
                 transform in eval() mode. Default: True.
             transform_on_fantasize: A boolean indicating whether to apply the
                 transform when called from within a `fantasize` call. Default: True.
             approximate: A boolean indicating whether approximate or exact
-                rounding should be used. Default: approximate.
+                rounding should be used. Default: False.
             tau: The temperature parameter for approximate rounding.
         """
+        indices = kwargs.get("indices")
+        if indices is not None:
+            warn(
+                "`indices` is marked for deprecation in favor of `integer_indices`.",
+                DeprecationWarning,
+            )
+            integer_indices = indices
+        if approximate and categorical_features is not None:
+            raise NotImplementedError
         super().__init__()
         self.transform_on_train = transform_on_train
         self.transform_on_eval = transform_on_eval
         self.transform_on_fantasize = transform_on_fantasize
-        self.register_buffer("indices", torch.tensor(indices, dtype=torch.long))
+        integer_indices = integer_indices or []
+        self.register_buffer(
+            "integer_indices", torch.tensor(integer_indices, dtype=torch.long)
+        )
+        self.categorical_features = categorical_features or {}
         self.approximate = approximate
         self.tau = tau
 
-    @subset_transform
     def transform(self, X: Tensor) -> Tensor:
-        r"""Round the inputs.
+        r"""Discretize the inputs.
 
         Args:
             X: A `batch_shape x n x d`-dim tensor of inputs.
 
         Returns:
-            A `batch_shape x n x d`-dim tensor of rounded inputs.
+            A `batch_shape x n x d`-dim tensor of discretized inputs.
         """
-        return approximate_round(X, tau=self.tau) if self.approximate else X.round()
+        X_rounded = X.clone()
+        # round integers
+        X_int = X_rounded[..., self.integer_indices]
+        if self.approximate:
+            X_int = approximate_round(X_int, tau=self.tau)
+        else:
+            X_int = RoundSTE.apply(X_int)
+        X_rounded[..., self.integer_indices] = X_int
+        # discrete categoricals to the category with the largest value
+        # in the continuous relaxation of the one-hot encoding
+        for start, card in self.categorical_features.items():
+            end = start + card
+            X_rounded[..., start:end] = OneHotArgmaxSTE.apply(X[..., start:end])
+        return X_rounded
 
     def equals(self, other: InputTransform) -> bool:
        r"""Check if another input transform is equivalent.
@@ -740,6 +777,8 @@ def equals(self, other: InputTransform) -> bool:
         """
         return (
             super().equals(other=other)
+            and (self.integer_indices == other.integer_indices).all()
+            and self.categorical_features == other.categorical_features
             and self.approximate == other.approximate
             and self.tau == other.tau
         )
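To make the docstring example in this diff concrete, here is a minimal sketch of how the new transform chain could be assembled. This snippet is not part of the commit: it assumes a 3-dimensional input with one integer parameter at index 0 (range 0 to 5) and one two-category one-hot categorical at indices 1-2, and the candidate tensor `X` and the names `tf` and `X_discretized` are made up for illustration.

    import torch
    from botorch.models.transforms.input import ChainedInputTransform, Normalize, Round

    d = 3  # integer dim at index 0, one-hot categorical block at indices 1:3
    bounds = torch.tensor([[0, 5], [0, 1], [0, 1]], dtype=torch.double).t()
    integer_indices = [0]
    categorical_features = {1: 2}  # block starts at index 1, cardinality 2

    # unit cube -> raw space -> discretize -> back to the unit cube
    unnormalize_tf = Normalize(
        d=d, bounds=bounds, transform_on_eval=True, transform_on_train=True, reverse=True
    )
    round_tf = Round(
        integer_indices=integer_indices, categorical_features=categorical_features
    )
    normalize_tf = Normalize(d=d, bounds=bounds)
    tf = ChainedInputTransform(tf1=unnormalize_tf, tf2=round_tf, tf3=normalize_tf).eval()

    X = torch.rand(4, d, dtype=torch.double)  # hypothetical candidates in the unit cube
    X_discretized = tf(X)  # integer dim rounded; categorical block hard one-hot encoded

Note that exact straight-through rounding is now the default (`approximate=False`), so gradients still flow through `tf` during gradient-based candidate optimization.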

botorch/optim/initializers.py

Lines changed: 0 additions & 1 deletion
@@ -114,7 +114,6 @@ def gen_batch_initial_conditions(
     batch_limit: Optional[int] = options.get(
         "init_batch_limit", options.get("batch_limit")
     )
-    batch_initial_arms: Tensor
     factor, max_factor = 1, 5
     init_kwargs = {}
     device = bounds.device

botorch/test_functions/multi_objective.py

Lines changed: 2 additions & 1 deletion
@@ -11,7 +11,8 @@
 
 .. [Daulton2022]
     S. Daulton, S. Cakmak, M. Balandat, M. A. Osborne, E. Zhou, and E. Bakshy.
-    Robust Multi-Objective Bayesian Optimization Under Input Noise. 2022.
+    Robust Multi-Objective Bayesian Optimization Under Input Noise.
+    Proceedings of the 39th International Conference on Machine Learning, 2022.
 
 .. [Deb2005dtlz]
     K. Deb, L. Thiele, M. Laumanns, E. Zitzler, A. Abraham, L. Jain, and

botorch/utils/rounding.py

Lines changed: 79 additions & 0 deletions
@@ -4,10 +4,24 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+r"""
+Discretization (rounding) functions for acquisition optimization.
+
+References
+
+.. [Daulton2022bopr]
+    S. Daulton, X. Wan, D. Eriksson, M. Balandat, M. A. Osborne, E. Bakshy.
+    Bayesian Optimization over Discrete and Mixed Spaces via Probabilistic
+    Reparameterization. Advances in Neural Information Processing Systems
+    35, 2022.
+"""
+
 from __future__ import annotations
 
 import torch
 from torch import Tensor
+from torch.autograd import Function
+from torch.nn.functional import one_hot
 
 
 def approximate_round(X: Tensor, tau: float = 1e-3) -> Tensor:
@@ -27,3 +41,68 @@ def approximate_round(X: Tensor, tau: float = 1e-3) -> Tensor:
     scaled_remainder = (X - offset - 0.5) / tau
     rounding_component = (torch.tanh(scaled_remainder) + 1) / 2
     return offset + rounding_component
+
+
+class IdentitySTEFunction(Function):
+    """Base class for functions using straight through gradient estimators.
+
+    This class approximates the gradient with the identity function.
+    """
+
+    @staticmethod
+    def backward(ctx, grad_output: Tensor) -> Tensor:
+        r"""Use a straight-through estimator of the gradient.
+
+        This uses the identity function.
+
+        Args:
+            grad_output: A tensor of gradients.
+
+        Returns:
+            The provided tensor.
+        """
+        return grad_output
+
+
+class RoundSTE(IdentitySTEFunction):
+    r"""Round the input tensor and use a straight-through gradient estimator.
+
+    [Daulton2022bopr]_ proposes using this in acquisition optimization.
+    """
+
+    @staticmethod
+    def forward(ctx, X: Tensor) -> Tensor:
+        r"""Round the input tensor element-wise.
+
+        Args:
+            X: The tensor to be rounded.
+
+        Returns:
+            A tensor where each element is rounded to the nearest integer.
+        """
+        return X.round()
+
+
+class OneHotArgmaxSTE(IdentitySTEFunction):
+    r"""Discretize a continuous relaxation of a one-hot encoded categorical.
+
+    This returns a one-hot encoded categorical and uses a straight-through
+    gradient estimator via an identity function.
+
+    [Daulton2022bopr]_ proposes using this in acquisition optimization.
+    """
+
+    @staticmethod
+    def forward(ctx, X: Tensor) -> Tensor:
+        r"""Discretize the input tensor.
+
+        This applies an argmax along the last dimension of the input tensor
+        and one-hot encodes the result.
+
+        Args:
+            X: The tensor to be discretized.
+
+        Returns:
+            A one-hot encoded tensor of the argmax along the last dimension.
+        """
+        return one_hot(X.argmax(dim=-1), num_classes=X.shape[-1]).to(X)
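As a quick sanity check of the straight-through behavior these classes implement: the forward pass is a hard, non-differentiable discretization, while the backward pass is the identity, so the continuous relaxation keeps receiving gradient signal. A hypothetical sketch, not part of the commit; the tensor values are illustrative only.

    import torch
    from botorch.utils.rounding import OneHotArgmaxSTE, RoundSTE

    X = torch.tensor([0.2, 1.7, 3.5], requires_grad=True)
    X_round = RoundSTE.apply(X)  # forward: exact rounding -> tensor([0., 2., 4.])
    X_round.sum().backward()
    print(X.grad)  # identity STE passes gradients through -> tensor([1., 1., 1.])

    Z = torch.tensor([[0.1, 0.6, 0.3]], requires_grad=True)
    Z_onehot = OneHotArgmaxSTE.apply(Z)  # forward: hard one-hot -> tensor([[0., 1., 0.]])
    Z_onehot.sum().backward()
    print(Z.grad)  # identity STE again -> tensor([[1., 1., 1.]])

By contrast, `approximate_round` (used when `approximate=True`) is smooth: each piece is a scaled tanh, so it yields ordinary nonzero gradients but returns values that are only approximately integral.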
