Add a bayesian gaussian mixture model transformer #329

Merged 33 commits on Dec 17, 2021 (changes from all commits shown)
194 changes: 194 additions & 0 deletions rdt/transformers/numerical.py
@@ -5,6 +5,7 @@
import numpy as np
import pandas as pd
import scipy
from sklearn.mixture import BayesianGaussianMixture

from rdt.transformers.base import BaseTransformer
from rdt.transformers.null import NullTransformer
@@ -487,3 +488,196 @@ def _reverse_transform(self, data):
data = self._univariate.ppf(scipy.stats.norm.cdf(data))

return super()._reverse_transform(data)


class BayesGMMTransformer(NumericalTransformer):
"""Transformer for numerical data using a Bayesian Gaussian Mixture Model.

This transformation takes a numerical value and transforms it using a Bayesian GMM.
It generates two outputs: a discrete value that indicates the selected 'component' of
the GMM, and a continuous value that represents the value normalized by the mean and
standard deviation of the selected component.

Args:
dtype (data type):
Data type of the data to transform. It will be used when reversing the
transformation. If not provided, the dtype of the fit data will be used.
Defaults to ``None``.
nan (int, str or None):
Indicate what to do with the null values. If an integer is given, replace them
with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
them with the corresponding aggregation. If ``None`` is given, do not replace them.
Defaults to ``'mean'``.
null_column (bool):
Whether to create a new column to indicate which values were null or not.
If ``None``, only create a new column when the data contains null values.
If ``True``, always create the new column whether there are null values or not.
If ``False``, do not create the new column.
Defaults to ``None``.
rounding (int, str or None):
Define rounding scheme for data. If set to an int, values will be rounded
to that number of decimal places. If ``None``, values will not be rounded.
If set to ``'auto'``, the transformer will round to the maximum number of
decimal places detected in the fitted data.
min_value (int, str or None):
Indicate whether or not to set a minimum value for the data. If an integer is given,
reverse transformed data will be greater than or equal to it. If the string ``'auto'``
is given, the minimum will be the minimum value seen in the fitted data. If ``None``
is given, there won't be a minimum.
max_value (int, str or None):
Indicate whether or not to set a maximum value for the data. If an integer is given,
reverse transformed data will be less than or equal to it. If the string ``'auto'``
is given, the maximum will be the maximum value seen in the fitted data. If ``None``
is given, there won't be a maximum.
max_clusters (int):
The maximum number of mixture components. Depending on the data, the model may select
fewer components (based on the ``weight_threshold``).
Defaults to 10.
weight_threshold (int, float):
The minimum value a component weight can take to be considered a valid component.
``weights_`` under this value will be ignored.
Defaults to 0.005.

Attributes:
_bgm_transformer:
An instance of sklearn's ``BayesianGaussianMixture`` class.
_valid_component_indicator:
An array indicating the valid components. If the weight of a component is greater
than the ``weight_threshold``, it's indicated with True, otherwise it's set to False.
"""

STD_MULTIPLIER = 4
DETERMINISTIC_TRANSFORM = False
DETERMINISTIC_REVERSE = True
COMPOSITION_IS_IDENTITY = False

_bgm_transformer = None
_valid_component_indicator = None

def __init__(self, dtype=None, nan='mean', null_column=None, rounding=None,
min_value=None, max_value=None, max_clusters=10, weight_threshold=0.005):
super().__init__(dtype=dtype, nan=nan, null_column=null_column, rounding=rounding,
min_value=min_value, max_value=max_value)
self._max_clusters = max_clusters
self._weight_threshold = weight_threshold

def get_output_types(self):
"""Return the output types supported by the transformer.

Returns:
dict:
Mapping from the transformed column names to supported data types.
"""
output_types = {
'normalized': 'float',
'component': 'categorical'
}
if self.null_transformer and self.null_transformer.creates_null_column():
output_types['is_null'] = 'float'

return self._add_prefix(output_types)

def _fit(self, data):
"""Fit the transformer to the data.

Args:
data (pandas.Series):
Data to fit to.
"""
self._bgm_transformer = BayesianGaussianMixture(
n_components=self._max_clusters,
weight_concentration_prior_type='dirichlet_process',
weight_concentration_prior=0.001,
n_init=1
)

super()._fit(data)
data = super()._transform(data)
if data.ndim > 1:
data = data[:, 0]

self._bgm_transformer.fit(data.reshape(-1, 1))
self._valid_component_indicator = self._bgm_transformer.weights_ > self._weight_threshold

def _transform(self, data):
"""Transform the numerical data.

Args:
data (pandas.Series):
Data to transform.

Returns:
numpy.ndarray.
"""
data = super()._transform(data)
if data.ndim > 1:
data, null_column = data[:, 0], data[:, 1]

data = data.reshape((len(data), 1))
means = self._bgm_transformer.means_.reshape((1, self._max_clusters))

stds = np.sqrt(self._bgm_transformer.covariances_).reshape((1, self._max_clusters))
normalized_values = (data - means) / (self.STD_MULTIPLIER * stds)
normalized_values = normalized_values[:, self._valid_component_indicator]
component_probs = self._bgm_transformer.predict_proba(data)
component_probs = component_probs[:, self._valid_component_indicator]

selected_component = np.zeros(len(data), dtype='int')
for i in range(len(data)):
component_prob_t = component_probs[i] + 1e-6
component_prob_t = component_prob_t / component_prob_t.sum()
selected_component[i] = np.random.choice(
np.arange(self._valid_component_indicator.sum()),
p=component_prob_t
)

aranged = np.arange(len(data))
normalized = normalized_values[aranged, selected_component].reshape([-1, 1])
normalized = np.clip(normalized, -.99, .99)
normalized = normalized[:, 0]
rows = [normalized, selected_component]
if self.null_transformer and self.null_transformer.creates_null_column():
rows.append(null_column)

return np.stack(rows, axis=1) # noqa: PD013

def _reverse_transform_helper(self, data, sigma):
normalized = data[:, 0]

if sigma is not None:
normalized = np.random.normal(normalized, sigma)

normalized = np.clip(normalized, -1, 1)

means = self._bgm_transformer.means_.reshape([-1])
stds = np.sqrt(self._bgm_transformer.covariances_).reshape([-1])
selected_component = data[:, 1].astype(int)

std_t = stds[self._valid_component_indicator][selected_component]
mean_t = means[self._valid_component_indicator][selected_component]
reversed_data = normalized * self.STD_MULTIPLIER * std_t + mean_t

return reversed_data

def _reverse_transform(self, data, sigma=None):
"""Convert data back into the original format.

Args:
data (pd.DataFrame or numpy.ndarray):
Data to transform.
sigma (float):
Standard deviation of the Gaussian noise added to the normalized values. If ``None``, no noise is added.

Returns:
pandas.Series.
"""
if not isinstance(data, np.ndarray):
data = data.to_numpy()

recovered_data = self._reverse_transform_helper(data, sigma)
if self.null_transformer and self.null_transformer.creates_null_column():
data = np.stack([recovered_data, data[:, -1]], axis=1) # noqa: PD013
else:
data = recovered_data

return super()._reverse_transform(data)
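
For reference (not part of this diff), a minimal standalone sketch of the mode-specific normalization round trip that BayesGMMTransformer implements. Names are illustrative, and the component is picked with argmax here instead of the probabilistic sampling used in _transform:

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

# Two well-separated clusters, similar to the integration tests further below.
data = np.concatenate([
    np.random.normal(loc=5, scale=1, size=100),
    np.random.normal(loc=-5, scale=1, size=100),
]).reshape(-1, 1)

# Same hyperparameters as _fit.
bgm = BayesianGaussianMixture(
    n_components=10,
    weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=0.001,
    n_init=1,
)
bgm.fit(data)

# Keep only components whose weight exceeds the threshold (weight_threshold=0.005).
valid = bgm.weights_ > 0.005

# Encode: normalize every value against each valid component (4 is STD_MULTIPLIER),
# then pick one component per row (argmax here; _transform samples from predict_proba).
means = bgm.means_.reshape(1, -1)
stds = np.sqrt(bgm.covariances_).reshape(1, -1)
probs = bgm.predict_proba(data)[:, valid]
component = probs.argmax(axis=1)
normalized = ((data - means) / (4 * stds))[:, valid]
normalized = np.clip(normalized[np.arange(len(data)), component], -0.99, 0.99)

# Decode: invert the normalization, as in _reverse_transform_helper.
recovered = normalized * 4 * stds[0, valid][component] + means[0, valid][component]
# recovered is approximately data.ravel(), up to the clipping above.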
2 changes: 1 addition & 1 deletion setup.py
@@ -20,6 +20,7 @@
'pandas>=1.1.3,<2',
'scipy>=1.5.4,<2',
'psutil>=5.7,<6',
'scikit-learn>=0.24,<1',
]

copulas_requires = [
@@ -36,7 +37,6 @@
'jupyter>=1.0.0,<2',
'rundoc>=0.4.3,<0.5',
'pytest-subtests>=0.5,<1.0',
'scikit-learn>=0.24,<1',
]

addons_require = []
2 changes: 1 addition & 1 deletion tests/integration/test_transformers.py
@@ -244,7 +244,7 @@ def validate_transformer(transformer, steps=None, subtests=None):
data = pd.DataFrame({TEST_COL: dg.generate(DATA_SIZE)})

if subtests:
with subtests.test(msg='test_transformer_with_dataset', generator=dg):
with subtests.test(msg=f'test_transformer_with_dataset_{dg}', generator=dg):
_test_transformer_with_dataset(transformer, data, steps)
_test_transformer_with_hypertransformer(transformer, data, steps)
else:
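Context for the one-line change above (not part of this diff): pytest-subtests labels each subtest with its msg, so embedding the dataset generator in the message makes per-generator failures distinguishable in the report. A minimal illustration, assuming the subtests fixture from the pytest-subtests plugin and placeholder generator names:

# Illustrative only: run with pytest and the pytest-subtests plugin installed.
def test_with_generators(subtests):
    for dg in ('generator_a', 'generator_b'):  # placeholder generator names
        with subtests.test(msg=f'test_transformer_with_dataset_{dg}', generator=dg):
            assert dg  # stands in for _test_transformer_with_dataset(...)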
85 changes: 84 additions & 1 deletion tests/integration/transformers/test_numerical.py
@@ -1,7 +1,8 @@
import numpy as np
import pandas as pd

from rdt.transformers.numerical import GaussianCopulaTransformer, NumericalTransformer
from rdt.transformers.numerical import (
BayesGMMTransformer, GaussianCopulaTransformer, NumericalTransformer)


class TestNumericalTransformer:
@@ -135,3 +136,85 @@ def test_int_nan(self):

reverse = ct.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=2)


class TestBayesGMMTransformer:

def generate_data(self):
data1 = np.random.normal(loc=5, scale=1, size=100)
data2 = np.random.normal(loc=-5, scale=1, size=100)
data = np.concatenate([data1, data2])

return pd.DataFrame(data, columns=['col'])

def test_dataframe(self):
data = self.generate_data()

bgmm_transformer = BayesGMMTransformer()
bgmm_transformer.fit(data, list(data.columns))
transformed = bgmm_transformer.transform(data)

assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (200, 2)
assert all(isinstance(x, float) for x in transformed['col.normalized'])
assert all(isinstance(x, float) for x in transformed['col.component'])

reverse = bgmm_transformer.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=1)

def test_nulls(self):
data = self.generate_data()
mask = np.random.choice([1, 0], data.shape, p=[.1, .9]).astype(bool)
data[mask] = np.nan

bgmm_transformer = BayesGMMTransformer()
bgmm_transformer.fit(data, list(data.columns))
transformed = bgmm_transformer.transform(data)

assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (200, 3)
assert all(isinstance(x, float) for x in transformed['col.normalized'])
assert all(isinstance(x, float) for x in transformed['col.component'])
assert all(isinstance(x, float) for x in transformed['col.is_null'])

reverse = bgmm_transformer.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=1)

def test_data_different_sizes(self):
data = np.concatenate([
np.random.normal(loc=5, scale=1, size=100),
np.random.normal(loc=100, scale=1, size=500),
])
data = pd.DataFrame(data, columns=['col'])

bgmm_transformer = BayesGMMTransformer()
bgmm_transformer.fit(data, list(data.columns))
transformed = bgmm_transformer.transform(data)

assert isinstance(transformed, pd.DataFrame)
assert all(isinstance(x, float) for x in transformed['col.normalized'])
assert all(isinstance(x, float) for x in transformed['col.component'])

reverse = bgmm_transformer.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=1)

def test_multiple_components(self):
data = np.concatenate([
np.random.normal(loc=5, scale=0.02, size=300),
np.random.normal(loc=-4, scale=0.1, size=1000),
np.random.normal(loc=-180, scale=3, size=1500),
np.random.normal(loc=100, scale=10, size=500),
])
data = pd.DataFrame(data, columns=['col'])
data = data.sample(frac=1).reset_index(drop=True)

bgmm_transformer = BayesGMMTransformer()
bgmm_transformer.fit(data, list(data.columns))
transformed = bgmm_transformer.transform(data)

assert isinstance(transformed, pd.DataFrame)
assert all(isinstance(x, float) for x in transformed['col.normalized'])
assert all(isinstance(x, float) for x in transformed['col.component'])

reverse = bgmm_transformer.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=1)
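
A minimal usage sketch distilled from the integration tests above (not part of this diff); the output column names follow the transformer's <column>.<output> prefixing, and the round trip is approximate:

import numpy as np
import pandas as pd

from rdt.transformers.numerical import BayesGMMTransformer

# Bimodal data, as in generate_data above.
data = pd.DataFrame({'col': np.concatenate([
    np.random.normal(loc=5, scale=1, size=100),
    np.random.normal(loc=-5, scale=1, size=100),
])})

transformer = BayesGMMTransformer()
transformer.fit(data, list(data.columns))

transformed = transformer.transform(data)   # columns: col.normalized, col.component
recovered = transformer.reverse_transform(transformed)

np.testing.assert_array_almost_equal(recovered, data, decimal=1)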
5 changes: 5 additions & 0 deletions tests/performance/test_performance.py
@@ -7,6 +7,7 @@
import pytest

from rdt.transformers import get_transformers_by_type
from rdt.transformers.numerical import BayesGMMTransformer
from tests.datasets import get_dataset_generators_by_type
from tests.performance.profiling import profile_transformer

@@ -68,6 +69,7 @@ def _get_dataset_sizes(data_type):
def _get_performance_test_cases():
"""Get all the (transformer, dataset_generator) combinations for testing."""
all_test_cases = []
sandbox = [BayesGMMTransformer]

dataset_generators = get_dataset_generators_by_type()
transformers = get_transformers_by_type()
@@ -76,6 +78,9 @@ def _get_performance_test_cases():
dataset_generators_for_type = dataset_generators.get(data_type, [])

for transformer in transformers_for_type:
if transformer in sandbox:
continue

for dataset_generator in dataset_generators_for_type:
all_test_cases.append((transformer, dataset_generator))
