Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a Bayesian Gaussian mixture model transformer #329

Merged
merged 33 commits into from
Dec 17, 2021
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 137 additions & 0 deletions rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import pandas as pd
import scipy
from sklearn.mixture import BayesianGaussianMixture

from rdt.transformers.base import BaseTransformer
from rdt.transformers.null import NullTransformer
Expand Down Expand Up @@ -487,3 +488,139 @@ def _reverse_transform(self, data):
data = self._univariate.ppf(scipy.stats.norm.cdf(data))

return super()._reverse_transform(data)


class BayesGMMTransformer(NumericalTransformer):
    """Transformer for numerical data based on a Bayesian Gaussian mixture model.

    A ``BayesianGaussianMixture`` with up to ``max_clusters`` components is fitted
    to the column. Components whose fitted weight does not exceed
    ``weight_threshold`` are discarded. Each value is then represented by two
    outputs:

    * ``continuous``: the value normalized with the mean and standard deviation
      of a randomly selected valid component, clipped to ``[-0.99, 0.99]``.
    * ``discrete``: the index of that component among the valid components.

    Args:
        dtype (data type):
            Forwarded unchanged to ``NumericalTransformer.__init__``.
        nan (int, str or None):
            Forwarded unchanged to ``NumericalTransformer.__init__``.
        null_column (bool):
            Forwarded unchanged to ``NumericalTransformer.__init__``.
        max_clusters (int):
            Number of mixture components given to the Bayesian GMM.
            Defaults to 10.
        weight_threshold (float):
            Minimum weight a fitted component must exceed to be kept.
            Defaults to 0.005.
    """

    # A value is divided by ``STD_MULTIPLIER`` standard deviations of its
    # component, so values within that many deviations land in ``[-1, 1]``.
    STD_MULTIPLIER = 4
    # ``_transform`` samples the component at random, so it is not deterministic.
    DETERMINISTIC_TRANSFORM = False
    DETERMINISTIC_REVERSE = True
    COMPOSITION_IS_IDENTITY = True

    def __init__(self, dtype=None, nan='mean', null_column=None, max_clusters=10,
                 weight_threshold=0.005):
        super().__init__(dtype=dtype, nan=nan, null_column=null_column)
        self._max_clusters = max_clusters
        self._weight_threshold = weight_threshold
        # The attributes below are only populated by ``_fit``.
        self._number_of_modes = None
        self._column_raw_dtypes = None
        self._bgm_transformer = None
        self._valid_component_indicator = None

    def get_output_types(self):
        """Return the output types supported by the transformer.

        Returns:
            dict:
                Mapping from the transformed column names to supported data types.
        """
        output_types = {
            'continuous': 'float',
            'discrete': 'categorical'
        }
        if self.null_transformer and self.null_transformer.creates_null_column():
            output_types['is_null'] = 'float'

        return self._add_prefix(output_types)

    def _fit(self, data):
        """Fit the transformer to the data.

        Fits the Bayesian GMM, records which components have a weight above
        ``weight_threshold`` and remembers the dtype of the input column so it
        can be restored by ``_reverse_transform``.

        Args:
            data (pandas.Series):
                Data to fit to.
        """
        self._bgm_transformer = BayesianGaussianMixture(
            n_components=self._max_clusters,
            weight_concentration_prior_type='dirichlet_process',
            weight_concentration_prior=0.001,
            n_init=1
        )

        # ``to_numpy`` mirrors how ``_transform`` feeds values to the model.
        self._bgm_transformer.fit(data.to_numpy().reshape(-1, 1))
        self._valid_component_indicator = self._bgm_transformer.weights_ > self._weight_threshold
        self._number_of_modes = self._valid_component_indicator.sum()
        self._column_raw_dtypes = data.infer_objects().dtypes

    def _transform(self, data):
        """Transform numerical data.

        Args:
            data (pandas.Series):
                Data to transform.

        Returns:
            pandas.DataFrame:
                Frame with a ``continuous`` column (normalized values in
                ``[-0.99, 0.99]``) and a ``discrete`` column (index of the
                selected component among the valid components).
        """
        values = data.to_numpy().reshape(-1, 1)
        means = self._bgm_transformer.means_.reshape((1, self._max_clusters))
        stds = np.sqrt(self._bgm_transformer.covariances_).reshape((1, self._max_clusters))

        # Normalize every value against every component, then keep only the
        # components that passed the weight threshold.
        normalized_values = (values - means) / (self.STD_MULTIPLIER * stds)
        normalized_values = normalized_values[:, self._valid_component_indicator]
        component_probs = self._bgm_transformer.predict_proba(values)
        component_probs = component_probs[:, self._valid_component_indicator]

        modes = np.arange(self._number_of_modes)
        selected_component = np.zeros(len(values), dtype=np.intp)
        for row, probs in enumerate(component_probs):
            # The small offset keeps every probability strictly positive
            # before the row is renormalized to sum to one.
            probs = probs + 1e-6
            selected_component[row] = np.random.choice(modes, p=probs / probs.sum())

        rows = np.arange(len(values))
        normalized = np.clip(normalized_values[rows, selected_component], -.99, .99)

        return pd.DataFrame({
            'continuous': normalized,
            'discrete': selected_component
        })

    def _reverse_transform_helper(self, data, sigma):
        """Recover the original values from normalized values and component scores.

        Args:
            data (numpy.ndarray):
                2-D array whose first column holds the normalized values and
                whose remaining columns hold one score per valid component; the
                component with the highest score is used for each row.
            sigma (float or None):
                If not ``None``, the normalized values are replaced by samples
                from a normal distribution centered on them with this standard
                deviation.

        Returns:
            numpy.ndarray:
                1-D array with the recovered values.
        """
        normalized = data[:, 0]
        selected_component_probs = data[:, 1:]

        if sigma is not None:
            normalized = np.random.normal(normalized, sigma)

        normalized = np.clip(normalized, -1, 1)

        # Discarded components get ``-inf`` so that ``argmax`` can never pick them.
        component_probs = np.full((len(data), self._max_clusters), -np.inf)
        component_probs[:, self._valid_component_indicator] = selected_component_probs

        means = self._bgm_transformer.means_.reshape([-1])
        stds = np.sqrt(self._bgm_transformer.covariances_).reshape([-1])
        selected_component = np.argmax(component_probs, axis=1)

        std_t = stds[selected_component]
        mean_t = means[selected_component]

        return normalized * self.STD_MULTIPLIER * std_t + mean_t

    def _reverse_transform(self, data, sigma=None):
        """Convert data back into the original format.

        Args:
            data (pandas.DataFrame or numpy.ndarray):
                Data to transform. Expected to hold the normalized values in
                its first column and the component indices in its second one,
                which is the layout produced by ``_transform``. A NumPy array
                of shape ``(2, n_rows)`` is also accepted and is transposed.
            sigma (float or None):
                Optional standard deviation of the noise added to the
                normalized values. Defaults to ``None`` (no noise).

        Returns:
            pandas.Series
        """
        if isinstance(data, pd.DataFrame):
            values = data.to_numpy()
        else:
            values = np.asarray(data)
            # Arrays with exactly two rows are read as the legacy
            # ``(2, n_rows)`` layout so existing callers keep working.
            if values.ndim == 2 and values.shape[0] == 2:
                values = values.T

        n_rows = len(values)
        normalized = values[:, 0].astype(float)
        discrete_column = values[:, 1].astype(int)

        # The helper expects one score column per valid component.
        one_hot = np.zeros(shape=(n_rows, self._number_of_modes))
        one_hot[np.arange(n_rows), discrete_column] = 1.0
        helper_input = np.concatenate([normalized[:, None], one_hot], axis=1)

        recovered_data = self._reverse_transform_helper(helper_input, sigma)

        return pd.Series(recovered_data).astype(self._column_raw_dtypes)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
'pandas>=1.1.3,<2',
'scipy>=1.5.4,<2',
'psutil>=5.7,<6',
'scikit-learn>=0.24,<1',
]

copulas_requires = [
Expand All @@ -36,7 +37,6 @@
'jupyter>=1.0.0,<2',
'rundoc>=0.4.3,<0.5',
'pytest-subtests>=0.5,<1.0',
'scikit-learn>=0.24,<1',
]

addons_require = []
Expand Down
21 changes: 20 additions & 1 deletion tests/integration/transformers/test_numerical.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import numpy as np
import pandas as pd

from rdt.transformers.numerical import GaussianCopulaTransformer, NumericalTransformer
from rdt.transformers.numerical import (
BayesGMMTransformer, GaussianCopulaTransformer, NumericalTransformer)


class TestNumericalTransformer:
Expand Down Expand Up @@ -135,3 +136,21 @@ def test_int_nan(self):

reverse = ct.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=2)


class TestBayesGMMTransformer:
    """Integration tests for the ``BayesGMMTransformer``."""

    def test_simple(self):
        """Fit, transform and reverse a single normally distributed column.

        The transformed output must be a two-column frame holding Python floats
        and ints, and reversing it must recover the input to one decimal place.
        """
        # Setup
        samples = np.random.normal(loc=4, scale=4, size=123)
        data = pd.DataFrame(samples, columns=['col'])
        transformer = BayesGMMTransformer()

        # Run
        transformer.fit(data, data.columns.tolist())
        transformed = transformer.transform(data)

        # Assert
        assert isinstance(transformed, pd.DataFrame)
        assert transformed.shape == (123, 2)

        continuous = transformed['col.continuous']
        discrete = transformed['col.discrete']
        assert all(isinstance(value, float) for value in continuous)
        assert all(isinstance(value, int) for value in discrete)

        reverse = transformer.reverse_transform(transformed)
        np.testing.assert_array_almost_equal(reverse, data, decimal=1)
Loading