
Commit cd77978

jduerholt authored and facebook-github-bot committed
NumericToCategoricalEncoding Input Transform. (#2907)
Summary:
## Motivation
This PR refers to #2879. It adds a new input transform that converts a categorical degree of freedom encoded as an integer into a vector-based representation. This could be, for example, a one-hot encoding, but also a descriptor encoding as is often used in chemistry. It makes it possible to use the alternating acqf optimizer with surrogates that do not treat categoricals as integer-valued; for example, one could then use a SAAS GP with the mixed alternating acqf optimizer and treat the categoricals under the hood as one-hots.

### Have you read the [Contributing Guidelines on pull requests](https://github.com/pytorch/botorch/blob/main/CONTRIBUTING.md#pull-requests)?
Yes.

Pull Request resolved: #2907

Test Plan: Unit tests. Most of them are implemented (also to demonstrate the functionality); the ones that check equality between transforms, correct behavior of the transform in train mode, etc., are still missing. The plan is to add them after a first round of review feedback.

Reviewed By: esantorella

Differential Revision: D80088225

Pulled By: Balandat

fbshipit-source-id: 5b5a4c3aa2e9d4b7eabfe94deeafb9a485fe4214
1 parent 6621d77 commit cd77978
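
The descriptor encoding mentioned in the motivation is not limited to one-hot vectors. Below is a minimal sketch of such an encoder; the descriptor table and feature layout are invented for illustration. Note that, as written in this revision, each encoder's output width must match the cardinality declared in `categorical_features`, since that is how many output columns the transform allocates per categorical feature.

```python
import torch
from botorch.models.transforms.input import NumericToCategoricalEncoding

# Hypothetical descriptor table for a categorical feature with three levels,
# e.g. three solvents described by three made-up physicochemical descriptors.
descriptors = torch.tensor(
    [
        [0.20, 65.0, 1.0],   # category 0
        [0.65, 80.0, 0.5],   # category 1
        [0.90, 100.0, 0.1],  # category 2
    ]
)


def descriptor_encoder(x: torch.Tensor) -> torch.Tensor:
    # Map integer category labels to the corresponding descriptor rows.
    return descriptors.to(x.device)[x.long()]


# Feature 1 of a 3-dimensional input is categorical with cardinality 3 and is
# replaced by its 3-dimensional descriptor representation (output dim: 5).
tf = NumericToCategoricalEncoding(
    dim=3,
    categorical_features={1: 3},
    encoders={1: descriptor_encoder},
)
X = torch.tensor([[0.5, 2.0, 1.2], [1.1, 0.0, 0.8]])
X_desc = tf.transform(X)  # shape: 2 x 5
```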

File tree

2 files changed (+402, -4 lines)

botorch/models/transforms/input.py

Lines changed: 152 additions & 4 deletions
@@ -1625,6 +1625,158 @@ def _expanded_perturbations(self, X: Tensor) -> Tensor:
         return p.transpose(-3, -2)  # p is batch_shape x n_p x n x d
 
 
+class NumericToCategoricalEncoding(InputTransform):
+    """Transform categorical parameters from an integer/numeric representation
+    to a vector-based representation such as a one-hot encoding or a descriptor
+    encoding.
+
+    The vector encoding is inserted at the position of the categorical feature
+    in the input tensor. This is demonstrated in the example below, in which a
+    categorical feature of cardinality 3 at position 1 in the original
+    representation is one-hot encoded.
+
+    Example:
+
+        >>> import torch
+        >>> from torch.nn.functional import one_hot
+        >>> from functools import partial
+        >>> from botorch.models.transforms.input import NumericToCategoricalEncoding
+        >>> tf = NumericToCategoricalEncoding(
+        ...     dim=3,
+        ...     categorical_features={1: 3},
+        ...     encoders={1: partial(one_hot, num_classes=3)},
+        ... )
+        >>> X = torch.tensor([[0.5, 2, 1.2], [1.1, 0, 0.8]])
+        >>> tf.transform(X)
+        tensor([[0.5000, 0.0000, 0.0000, 1.0000, 1.2000],
+                [1.1000, 1.0000, 0.0000, 0.0000, 0.8000]])
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        categorical_features: dict[int, int],
+        encoders: dict[int, Callable[[Tensor], Tensor]],
+        transform_on_train: bool = True,
+        transform_on_eval: bool = True,
+        transform_on_fantasize: bool = True,
+    ) -> None:
+        r"""Initialize.
+
+        Args:
+            dim: The dimension of the numerically encoded input.
+            categorical_features: A dictionary mapping the index of each
+                categorical feature to its cardinality, which has to be
+                greater than 1. This assumes that categoricals are
+                integer-encoded.
+            encoders: A dictionary mapping the index of each categorical feature
+                to a callable that encodes the categorical feature into a vector
+                representation.
+            transform_on_train: A boolean indicating whether to apply the
+                transform in train() mode. Default: True.
+            transform_on_eval: A boolean indicating whether to apply the
+                transform in eval() mode. Default: True.
+            transform_on_fantasize: A boolean indicating whether to apply the
+                transform when called from within a `fantasize` call. Default: True.
+        """
+        super().__init__()
+        self.transform_on_train = transform_on_train
+        self.transform_on_eval = transform_on_eval
+        self.transform_on_fantasize = transform_on_fantasize
+
+        self.encoders = encoders
+        self.categorical_features = categorical_features
+
+        if len(self.categorical_features) == 0:
+            raise ValueError(
+                "At least one categorical feature for encoding must be provided."
+            )
+
+        if (num_cat := len(self.categorical_features)) > dim:
+            raise ValueError(
+                f"The number of categorical features ({num_cat}) exceeds the "
+                f"provided dimension ({dim})."
+            )
+
+        for idx, card in self.categorical_features.items():
+            if card <= 1 or not isinstance(card, int):
+                raise ValueError(
+                    f"Categorical feature at index {idx} has cardinality {card}. "
+                    "All cardinalities must be integers greater than 1."
+                )
+
+        # check that the encoders match the categorical features
+        if (enc_keys := set(self.encoders)) != (
+            cf_keys := set(self.categorical_features)
+        ):
+            raise ValueError(
+                f"The keys of `encoders` ({enc_keys}) must match the keys "
+                f"of `categorical_features` ({cf_keys})."
+            )
+
+        self.ordinal_idx = list(
+            self.categorical_features.keys()
+        )  # indices of categorical features before encoding
+
+        self.numerical_idx = list(
+            set(range(dim)) - set(self.ordinal_idx)
+        )  # indices of numerical features before encoding
+
+        self.new_numerical_idx = []  # indices of numerical features after encoding
+        self.encoded_idx = []  # indices of categorical features after encoding
+
+        offset = 0
+        for idx in range(dim):
+            if idx in self.numerical_idx:
+                self.new_numerical_idx.append(idx + offset)
+            else:
+                card = self.categorical_features[idx]
+                self.encoded_idx.append(
+                    np.arange(
+                        idx + offset, idx + offset + card
+                    ).tolist()  # indices of this categorical feature after encoding
+                )
+                offset += card - 1  # adjust offset for the next categorical feature
+
+    def transform(self, X: Tensor) -> Tensor:
+        r"""Transform the categorical inputs into a vector representation.
+
+        Args:
+            X: A `batch_shape x n x d`-dim tensor of inputs.
+
+        Returns:
+            A `batch_shape x n x d'`-dim tensor with
+            `d' = d - len(categorical_features) + sum(categorical_features.values())`
+            in which the integer-encoded categoricals are transformed to a
+            vector representation.
+        """
+        s = list(X.shape)
+        s[-1] = len(self.numerical_idx) + len(np.concatenate(self.encoded_idx))
+        X_encoded = torch.zeros(size=s, device=X.device, dtype=X.dtype)
+        X_encoded[..., self.new_numerical_idx] = X[..., self.numerical_idx]
+        for i, idx in enumerate(self.categorical_features.keys()):
+            encoded_val = self.encoders[idx](X[..., idx].long()).to(X_encoded)
+            X_encoded[..., self.encoded_idx[i]] = encoded_val
+        return X_encoded
+
+    def equals(self, other: InputTransform) -> bool:
+        r"""Check if another input transform is equivalent.
+
+        Args:
+            other: Another input transform.
+
+        Returns:
+            A boolean indicating if the other transform is equivalent.
+        """
+        return (
+            type(self) is type(other)
+            and (self.transform_on_train == other.transform_on_train)
+            and (self.transform_on_eval == other.transform_on_eval)
+            and (self.transform_on_fantasize == other.transform_on_fantasize)
+            and self.categorical_features == other.categorical_features
+        )
+
+
 class OneHotToNumeric(InputTransform):
     r"""Transform categorical parameters from a one-hot to a numeric representation."""
 
@@ -1649,10 +1801,6 @@ def __init__(
                 transform in eval() mode. Default: True.
             transform_on_fantasize: A boolean indicating whether to apply the
                 transform when called from within a `fantasize` call. Default: False.
-
-        Returns:
-            A `batch_shape x n x d'`-dim tensor of where the one-hot encoded
-            categoricals are transformed to integer representation.
         """
         super().__init__()
         self.transform_on_train = transform_on_train
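
To connect this to the surrogate-model use case from the motivation, the following is a minimal sketch of attaching the new transform as a model input transform, so that the GP operates on the one-hot representation while the training data stay integer-encoded. Toy data and a plain `SingleTaskGP` stand in for the SAAS GP, and the mixed alternating acquisition optimizer is omitted.

```python
import torch
from functools import partial
from torch.nn.functional import one_hot

from botorch.models import SingleTaskGP
from botorch.models.transforms.input import NumericToCategoricalEncoding

# Toy data: feature 1 is categorical with three levels; features 0 and 2
# are continuous. The categorical is stored as an integer in the raw data.
train_X = torch.rand(10, 3, dtype=torch.double)
train_X[:, 1] = torch.randint(0, 3, (10,), dtype=torch.double)
train_Y = torch.rand(10, 1, dtype=torch.double)

tf = NumericToCategoricalEncoding(
    dim=3,
    categorical_features={1: 3},
    encoders={1: partial(one_hot, num_classes=3)},
)
# The GP internally sees the 5-dimensional one-hot representation, while
# candidate generation can keep working in the 3-dimensional integer encoding.
model = SingleTaskGP(train_X, train_Y, input_transform=tf)
```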
