Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a Bayesian Gaussian mixture model transformer #329

Merged
merged 33 commits into from
Dec 17, 2021
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 158 additions & 1 deletion rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import pandas as pd
import scipy
from sklearn.mixture import BayesianGaussianMixture

from rdt.transformers.base import BaseTransformer
from rdt.transformers.null import NullTransformer
Expand Down Expand Up @@ -156,7 +157,9 @@ def _transform(self, data):
Returns:
numpy.ndarray
"""
return self.null_transformer.transform(data)
if self.nan is not None:
fealho marked this conversation as resolved.
Show resolved Hide resolved
return self.null_transformer.transform(data)
return data.to_numpy()

def _reverse_transform(self, data):
"""Convert data back into the original format.
Expand Down Expand Up @@ -487,3 +490,157 @@ def _reverse_transform(self, data):
data = self._univariate.ppf(scipy.stats.norm.cdf(data))

return super()._reverse_transform(data)


class BayesGMMTransformer(NumericalTransformer):
    """Transformer for numerical data based on a Bayesian Gaussian mixture model.

    During fit, a ``sklearn.mixture.BayesianGaussianMixture`` with ``max_clusters``
    components is fitted to the column. Components whose weight is above
    ``weight_threshold`` are considered valid. During transform, every value is
    assigned to one valid component (sampled at random from the component
    probabilities) and expressed as a normalized offset from that component:
    ``(value - mean) / (STD_MULTIPLIER * std)``, clipped to ``[-0.99, 0.99]``.

    Args:
        dtype, nan, null_column, rounding, min_value, max_value:
            Forwarded unchanged to ``NumericalTransformer.__init__``.
        max_clusters (int):
            Number of components given to the ``BayesianGaussianMixture``.
            Defaults to 10.
        weight_threshold (float):
            Components with a fitted weight that is not strictly greater than this
            value are discarded. Defaults to 0.005.
    """

    # The normalized value is the distance to the component mean expressed in
    # units of ``STD_MULTIPLIER`` standard deviations.
    STD_MULTIPLIER = 4
    # ``_transform`` samples the component with ``np.random.choice``.
    DETERMINISTIC_TRANSFORM = False
    DETERMINISTIC_REVERSE = True
    COMPOSITION_IS_IDENTITY = False

    def __init__(self, dtype=None, nan='mean', null_column=None, max_clusters=10,
                 weight_threshold=0.005, rounding=None, min_value=None, max_value=None):
        super().__init__(
            dtype=dtype, nan=nan, null_column=null_column, rounding=rounding,
            min_value=min_value, max_value=max_value
        )
        self._max_clusters = max_clusters
        self._weight_threshold = weight_threshold

        # Learned state, populated by ``_fit``.
        self._number_of_modes = None
        self._bgm_transformer = None
        self._valid_component_indicator = None

    def get_output_types(self):
        """Return the output types supported by the transformer.

        Returns:
            dict:
                Mapping from the transformed column names to supported data types.
        """
        output_types = {
            'continuous': 'float',
            'discrete': 'categorical'
        }
        if self.null_transformer and self.null_transformer.creates_null_column():
            output_types['is_null'] = 'float'

        return self._add_prefix(output_types)

    def _fit(self, data):
        """Fit the transformer to the data.

        Args:
            data (pandas.Series):
                Data to fit to.
        """
        self._bgm_transformer = BayesianGaussianMixture(
            n_components=self._max_clusters,
            weight_concentration_prior_type='dirichlet_process',
            weight_concentration_prior=0.001,
            n_init=1
        )

        super()._fit(data)
        data = super()._transform(data)
        if data.ndim > 1:
            # Only the first column is used to fit the mixture; any further
            # column produced by the parent transform is ignored here.
            data = data[:, 0]

        self._bgm_transformer.fit(data.reshape(-1, 1))
        self._valid_component_indicator = self._bgm_transformer.weights_ > self._weight_threshold
        self._number_of_modes = self._valid_component_indicator.sum()

    def _transform(self, data):
        """Transform numerical data.

        Args:
            data (pandas.Series):
                Data to transform.

        Returns:
            numpy.ndarray:
                2-D array whose columns are the normalized value, the index of the
                selected component among the valid components and, when the null
                transformer creates a null column, that null column.
        """
        data = super()._transform(data)
        null_column = None
        if data.ndim > 1:
            data, null_column = data[:, 0], data[:, 1]

        data = data.reshape((len(data), 1))
        means = self._bgm_transformer.means_.reshape((1, self._max_clusters))
        stds = np.sqrt(self._bgm_transformer.covariances_).reshape((1, self._max_clusters))

        # Normalize every value against every component, then keep valid components only.
        normalized_values = (data - means) / (self.STD_MULTIPLIER * stds)
        normalized_values = normalized_values[:, self._valid_component_indicator]
        component_probs = self._bgm_transformer.predict_proba(data)
        component_probs = component_probs[:, self._valid_component_indicator]

        # Sample one component per row. The small constant keeps every probability
        # strictly positive before re-normalizing. The draws are made row by row so
        # the sequence of random numbers consumed stays the same.
        modes = np.arange(self._number_of_modes)
        selected_component = np.zeros(len(data), dtype='int')
        for i, probs in enumerate(component_probs):
            probs = probs + 1e-6
            selected_component[i] = np.random.choice(modes, p=probs / probs.sum())

        row_index = np.arange(len(data))
        normalized = np.clip(normalized_values[row_index, selected_component], -.99, .99)

        rows = [normalized, selected_component]
        if self.null_transformer and self.null_transformer.creates_null_column():
            rows.append(null_column)

        return np.stack(rows, axis=1)  # noqa: PD013

    def _reverse_transform_helper(self, data, sigma):
        """Recover the original scale from normalized values and one-hot components.

        Args:
            data (numpy.ndarray):
                2-D array whose first column holds the normalized values and whose
                remaining columns hold one score per valid component; the component
                with the highest score is the one used for each row.
            sigma (float or None):
                If not ``None``, the normalized values are replaced by samples from a
                normal distribution centered on them with this standard deviation.

        Returns:
            numpy.ndarray:
                1-D array with the values mapped back through the selected component.
        """
        normalized = data[:, 0]
        selected_component_probs = data[:, 1:]

        if sigma is not None:
            normalized = np.random.normal(normalized, sigma)

        normalized = np.clip(normalized, -1, 1)

        # Spread the valid-component scores over all fitted components; discarded
        # components get -inf so that ``argmax`` can never select them.
        component_probs = np.ones((len(data), self._max_clusters)) * -np.inf
        component_probs[:, self._valid_component_indicator] = selected_component_probs

        means = self._bgm_transformer.means_.reshape([-1])
        stds = np.sqrt(self._bgm_transformer.covariances_).reshape([-1])
        selected_component = np.argmax(component_probs, axis=1)

        std_t = stds[selected_component]
        mean_t = means[selected_component]
        reversed_data = normalized * self.STD_MULTIPLIER * std_t + mean_t

        return reversed_data

    def _reverse_transform(self, data, sigma=None):
        """Convert data back into the original format.

        Args:
            data (pd.DataFrame or numpy.ndarray):
                Data to transform. The first column holds the normalized values, the
                second the component labels and, when the null transformer creates
                a null column, the last column holds it.
            sigma (float or None):
                Optional standard deviation of the noise applied to the normalized
                values before reversing them. Defaults to ``None`` (no noise).

        Returns:
            pandas.Series
        """
        if not isinstance(data, np.ndarray):
            data = data.to_numpy()

        # Expand the component labels into the one-hot layout the helper expects.
        one_hot = np.zeros(shape=(data.shape[0], self._number_of_modes))
        discrete_column = data[:, 1].astype(int).tolist()
        one_hot[np.arange(data.shape[0]), discrete_column] = 1.0
        recovered_data = np.concatenate([data[:, :1], one_hot], axis=1)
        recovered_data = self._reverse_transform_helper(recovered_data, sigma)

        if self.null_transformer and self.null_transformer.creates_null_column():
            data = np.stack([recovered_data, data[:, -1]], axis=1)  # noqa: PD013
        else:
            data = recovered_data

        return pd.Series(super()._reverse_transform(data))
fealho marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
'pandas>=1.1.3,<2',
'scipy>=1.5.4,<2',
'psutil>=5.7,<6',
'scikit-learn>=0.24,<1',
]

copulas_requires = [
Expand All @@ -36,7 +37,6 @@
'jupyter>=1.0.0,<2',
'rundoc>=0.4.3,<0.5',
'pytest-subtests>=0.5,<1.0',
'scikit-learn>=0.24,<1',
]

addons_require = []
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def validate_transformer(transformer, steps=None, subtests=None):
data = pd.DataFrame({TEST_COL: dg.generate(DATA_SIZE)})

if subtests:
with subtests.test(msg='test_transformer_with_dataset', generator=dg):
with subtests.test(msg=f'test_transformer_with_dataset_{dg}', generator=dg):
_test_transformer_with_dataset(transformer, data, steps)
_test_transformer_with_hypertransformer(transformer, data, steps)
else:
Expand Down
46 changes: 45 additions & 1 deletion tests/integration/transformers/test_numerical.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import numpy as np
import pandas as pd

from rdt.transformers.numerical import GaussianCopulaTransformer, NumericalTransformer
from rdt.transformers.numerical import (
BayesGMMTransformer, GaussianCopulaTransformer, NumericalTransformer)


class TestNumericalTransformer:
Expand Down Expand Up @@ -135,3 +136,46 @@ def test_int_nan(self):

reverse = ct.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=2)


class TestBayesGMMTransformer:
    """Integration tests for the ``BayesGMMTransformer``."""

    def generate_data(self):
        """Build a single-column frame with two normal modes around 5 and -5."""
        positive_mode = np.random.normal(loc=5, scale=1, size=100)
        negative_mode = np.random.normal(loc=-5, scale=1, size=100)
        values = np.concatenate([positive_mode, negative_mode])

        return pd.DataFrame(values, columns=['col'])

    def test_dataframe(self):
        """Transform and reverse a frame without nulls."""
        data = self.generate_data()

        transformer = BayesGMMTransformer()
        transformer.fit(data, list(data.columns))
        transformed = transformer.transform(data)

        assert isinstance(transformed, pd.DataFrame)
        assert transformed.shape == (200, 2)
        for column in ('col.continuous', 'col.discrete'):
            assert all(isinstance(x, float) for x in transformed[column])

        reverse = transformer.reverse_transform(transformed)
        np.testing.assert_array_almost_equal(reverse, data, decimal=1)

    def test_nulls(self):
        """Transform and reverse a frame in which some values are null."""
        data = self.generate_data()
        null_mask = np.random.choice([1, 0], data.shape, p=[.1, .9]).astype(bool)
        data[null_mask] = np.nan

        transformer = BayesGMMTransformer()
        transformer.fit(data, list(data.columns))
        transformed = transformer.transform(data)

        assert isinstance(transformed, pd.DataFrame)
        assert transformed.shape == (200, 3)
        for column in ('col.continuous', 'col.discrete', 'col.is_null'):
            assert all(isinstance(x, float) for x in transformed[column])

        reverse = transformer.reverse_transform(transformed)
        np.testing.assert_array_almost_equal(reverse, data, decimal=1)
Loading