Add a bayesian gaussian mixture model transformer #329

Merged 33 commits on Dec 17, 2021 (changes from all commits shown)
194 changes: 194 additions & 0 deletions rdt/transformers/numerical.py
@@ -5,6 +5,7 @@
import numpy as np
import pandas as pd
import scipy
from sklearn.mixture import BayesianGaussianMixture

from rdt.transformers.base import BaseTransformer
from rdt.transformers.null import NullTransformer
@@ -487,3 +488,196 @@ def _reverse_transform(self, data):
data = self._univariate.ppf(scipy.stats.norm.cdf(data))

return super()._reverse_transform(data)


class BayesGMMTransformer(NumericalTransformer):
"""Transformer for numerical data using a Bayesian Gaussian Mixture Model.

This transformation takes a numerical value and transforms it using a Bayesian GMM.
It generates two outputs: a discrete value that indicates the selected 'component' of
the GMM, and a continuous value that represents the value normalized by the mean and
standard deviation of the selected component.

Args:
dtype (data type):
Data type of the data to transform. It will be used when reversing the
transformation. If not provided, the dtype of the fit data will be used.
Defaults to ``None``.
nan (int, str or None):
Indicate what to do with the null values. If an integer is given, replace them
with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
them with the corresponding aggregation. If ``None`` is given, do not replace them.
Defaults to ``'mean'``.
null_column (bool):
Whether to create a new column to indicate which values were null or not.
If ``None``, only create a new column when the data contains null values.
If ``True``, always create the new column whether there are null values or not.
If ``False``, do not create the new column.
Defaults to ``None``.
rounding (int, str or None):
Define rounding scheme for data. If set to an int, values will be rounded
to that number of decimal places. If ``None``, values will not be rounded.
If set to ``'auto'``, the transformer will round to the maximum number of
decimal places detected in the fitted data.
min_value (int, str or None):
Indicate whether or not to set a minimum value for the data. If an integer is given,
reverse transformed data will be greater than or equal to it. If the string ``'auto'``
is given, the minimum will be the minimum value seen in the fitted data. If ``None``
is given, there won't be a minimum.
max_value (int, str or None):
Indicate whether or not to set a maximum value for the data. If an integer is given,
reverse transformed data will be less than or equal to it. If the string ``'auto'``
is given, the maximum will be the maximum value seen in the fitted data. If ``None``
is given, there won't be a maximum.
max_clusters (int):
The maximum number of mixture components. Depending on the data, the model may select
fewer components (based on the ``weight_threshold``).
Defaults to 10.
weight_threshold (int, float):
The minimum value a component weight can take to be considered a valid component.
``weights_`` under this value will be ignored.
Defaults to 0.005.

Attributes:
_bgm_transformer:
An instance of sklearn's ``BayesianGaussianMixture`` class.
_valid_component_indicator:
An array indicating the valid components. If the weight of a component is greater
than the ``weight_threshold``, it's indicated with True, otherwise it's set to False.
"""

STD_MULTIPLIER = 4
DETERMINISTIC_TRANSFORM = False
DETERMINISTIC_REVERSE = True
COMPOSITION_IS_IDENTITY = False

_bgm_transformer = None
_valid_component_indicator = None

def __init__(self, dtype=None, nan='mean', null_column=None, rounding=None,
min_value=None, max_value=None, max_clusters=10, weight_threshold=0.005):
super().__init__(dtype=dtype, nan=nan, null_column=null_column, rounding=rounding,
min_value=min_value, max_value=max_value)
self._max_clusters = max_clusters
self._weight_threshold = weight_threshold

def get_output_types(self):
"""Return the output types supported by the transformer.

Returns:
dict:
Mapping from the transformed column names to supported data types.
"""
output_types = {
'normalized': 'float',
'component': 'categorical'
}
if self.null_transformer and self.null_transformer.creates_null_column():
output_types['is_null'] = 'float'

return self._add_prefix(output_types)

def _fit(self, data):
"""Fit the transformer to the data.

Args:
data (pandas.Series):
Data to fit to.
"""
self._bgm_transformer = BayesianGaussianMixture(
n_components=self._max_clusters,
weight_concentration_prior_type='dirichlet_process',
weight_concentration_prior=0.001,
n_init=1
)

super()._fit(data)
data = super()._transform(data)
if data.ndim > 1:
data = data[:, 0]

self._bgm_transformer.fit(data.reshape(-1, 1))
self._valid_component_indicator = self._bgm_transformer.weights_ > self._weight_threshold

def _transform(self, data):
"""Transform the numerical data.

Args:
data (pandas.Series):
Data to transform.

Returns:
numpy.ndarray.
"""
data = super()._transform(data)
if data.ndim > 1:
data, null_column = data[:, 0], data[:, 1]

data = data.reshape((len(data), 1))
means = self._bgm_transformer.means_.reshape((1, self._max_clusters))

stds = np.sqrt(self._bgm_transformer.covariances_).reshape((1, self._max_clusters))
normalized_values = (data - means) / (self.STD_MULTIPLIER * stds)
normalized_values = normalized_values[:, self._valid_component_indicator]
component_probs = self._bgm_transformer.predict_proba(data)
component_probs = component_probs[:, self._valid_component_indicator]

selected_component = np.zeros(len(data), dtype='int')
for i in range(len(data)):
component_prob_t = component_probs[i] + 1e-6
component_prob_t = component_prob_t / component_prob_t.sum()
selected_component[i] = np.random.choice(
np.arange(self._valid_component_indicator.sum()),
p=component_prob_t
)

aranged = np.arange(len(data))
normalized = normalized_values[aranged, selected_component].reshape([-1, 1])
normalized = np.clip(normalized, -.99, .99)
normalized = normalized[:, 0]
rows = [normalized, selected_component]
if self.null_transformer and self.null_transformer.creates_null_column():
rows.append(null_column)

return np.stack(rows, axis=1) # noqa: PD013

def _reverse_transform_helper(self, data, sigma):
normalized = data[:, 0]

if sigma is not None:
normalized = np.random.normal(normalized, sigma)

normalized = np.clip(normalized, -1, 1)

means = self._bgm_transformer.means_.reshape([-1])
stds = np.sqrt(self._bgm_transformer.covariances_).reshape([-1])
selected_component = data[:, 1].astype(int)

std_t = stds[self._valid_component_indicator][selected_component]
mean_t = means[self._valid_component_indicator][selected_component]
reversed_data = normalized * self.STD_MULTIPLIER * std_t + mean_t

return reversed_data

def _reverse_transform(self, data, sigma=None):
"""Convert data back into the original format.

Args:
data (pd.DataFrame or numpy.ndarray):
Data to transform.
sigma (float):
Standard deviation of the Gaussian noise added to the normalized values. If ``None``, no noise is added.

Returns:
pandas.Series.
"""
if not isinstance(data, np.ndarray):
data = data.to_numpy()

recovered_data = self._reverse_transform_helper(data, sigma)
if self.null_transformer and self.null_transformer.creates_null_column():
data = np.stack([recovered_data, data[:, -1]], axis=1) # noqa: PD013
else:
data = recovered_data

return super()._reverse_transform(data)
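
For reference (not part of this diff), a minimal standalone sketch of the mode-specific normalization round trip that BayesGMMTransformer implements. Names are illustrative, and the component is picked with argmax here instead of the probabilistic sampling used in _transform:

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

# Two well-separated clusters, similar to the integration tests further below.
data = np.concatenate([
    np.random.normal(loc=5, scale=1, size=100),
    np.random.normal(loc=-5, scale=1, size=100),
]).reshape(-1, 1)

# Same hyperparameters as _fit.
bgm = BayesianGaussianMixture(
    n_components=10,
    weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=0.001,
    n_init=1,
)
bgm.fit(data)

# Keep only components whose weight exceeds the threshold (weight_threshold=0.005).
valid = bgm.weights_ > 0.005

# Encode: normalize every value against each valid component (4 is STD_MULTIPLIER),
# then pick one component per row (argmax here; _transform samples from predict_proba).
means = bgm.means_.reshape(1, -1)
stds = np.sqrt(bgm.covariances_).reshape(1, -1)
probs = bgm.predict_proba(data)[:, valid]
component = probs.argmax(axis=1)
normalized = ((data - means) / (4 * stds))[:, valid]
normalized = np.clip(normalized[np.arange(len(data)), component], -0.99, 0.99)

# Decode: invert the normalization, as in _reverse_transform_helper.
recovered = normalized * 4 * stds[0, valid][component] + means[0, valid][component]
# recovered is approximately data.ravel(), up to the clipping above.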
2 changes: 1 addition & 1 deletion setup.py
@@ -20,6 +20,7 @@
'pandas>=1.1.3,<2',
'scipy>=1.5.4,<2',
'psutil>=5.7,<6',
'scikit-learn>=0.24,<1',
]

copulas_requires = [
@@ -36,7 +37,6 @@
'jupyter>=1.0.0,<2',
'rundoc>=0.4.3,<0.5',
'pytest-subtests>=0.5,<1.0',
'scikit-learn>=0.24,<1',
]

addons_require = []
2 changes: 1 addition & 1 deletion tests/integration/test_transformers.py
@@ -244,7 +244,7 @@ def validate_transformer(transformer, steps=None, subtests=None):
data = pd.DataFrame({TEST_COL: dg.generate(DATA_SIZE)})

if subtests:
with subtests.test(msg='test_transformer_with_dataset', generator=dg):
with subtests.test(msg=f'test_transformer_with_dataset_{dg}', generator=dg):
_test_transformer_with_dataset(transformer, data, steps)
_test_transformer_with_hypertransformer(transformer, data, steps)
else:
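Context for the one-line change above (not part of this diff): pytest-subtests labels each subtest with its msg, so embedding the dataset generator in the message makes per-generator failures distinguishable in the report. A minimal illustration, assuming the subtests fixture from the pytest-subtests plugin and placeholder generator names:

# Illustrative only: run with pytest and the pytest-subtests plugin installed.
def test_with_generators(subtests):
    for dg in ('generator_a', 'generator_b'):  # placeholder generator names
        with subtests.test(msg=f'test_transformer_with_dataset_{dg}', generator=dg):
            assert dg  # stands in for _test_transformer_with_dataset(...)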
85 changes: 84 additions & 1 deletion tests/integration/transformers/test_numerical.py
@@ -1,7 +1,8 @@
import numpy as np
import pandas as pd

from rdt.transformers.numerical import GaussianCopulaTransformer, NumericalTransformer
from rdt.transformers.numerical import (
BayesGMMTransformer, GaussianCopulaTransformer, NumericalTransformer)


class TestNumericalTransformer:
@@ -135,3 +136,85 @@ def test_int_nan(self):

reverse = ct.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=2)


class TestBayesGMMTransformer:

def generate_data(self):
data1 = np.random.normal(loc=5, scale=1, size=100)
data2 = np.random.normal(loc=-5, scale=1, size=100)
data = np.concatenate([data1, data2])

return pd.DataFrame(data, columns=['col'])

def test_dataframe(self):
data = self.generate_data()

bgmm_transformer = BayesGMMTransformer()
bgmm_transformer.fit(data, list(data.columns))
transformed = bgmm_transformer.transform(data)

assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (200, 2)
assert all(isinstance(x, float) for x in transformed['col.normalized'])
assert all(isinstance(x, float) for x in transformed['col.component'])

reverse = bgmm_transformer.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=1)

def test_nulls(self):
data = self.generate_data()
mask = np.random.choice([1, 0], data.shape, p=[.1, .9]).astype(bool)
data[mask] = np.nan

bgmm_transformer = BayesGMMTransformer()
bgmm_transformer.fit(data, list(data.columns))
transformed = bgmm_transformer.transform(data)

assert isinstance(transformed, pd.DataFrame)
assert transformed.shape == (200, 3)
assert all(isinstance(x, float) for x in transformed['col.normalized'])
assert all(isinstance(x, float) for x in transformed['col.component'])
assert all(isinstance(x, float) for x in transformed['col.is_null'])

reverse = bgmm_transformer.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=1)

def test_data_different_sizes(self):
data = np.concatenate([
np.random.normal(loc=5, scale=1, size=100),
np.random.normal(loc=100, scale=1, size=500),
])
data = pd.DataFrame(data, columns=['col'])

bgmm_transformer = BayesGMMTransformer()
bgmm_transformer.fit(data, list(data.columns))
transformed = bgmm_transformer.transform(data)

assert isinstance(transformed, pd.DataFrame)
assert all(isinstance(x, float) for x in transformed['col.normalized'])
assert all(isinstance(x, float) for x in transformed['col.component'])

reverse = bgmm_transformer.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=1)

def test_multiple_components(self):
data = np.concatenate([
np.random.normal(loc=5, scale=0.02, size=300),
np.random.normal(loc=-4, scale=0.1, size=1000),
np.random.normal(loc=-180, scale=3, size=1500),
np.random.normal(loc=100, scale=10, size=500),
])
data = pd.DataFrame(data, columns=['col'])
data = data.sample(frac=1).reset_index(drop=True)

bgmm_transformer = BayesGMMTransformer()
bgmm_transformer.fit(data, list(data.columns))
transformed = bgmm_transformer.transform(data)

assert isinstance(transformed, pd.DataFrame)
assert all(isinstance(x, float) for x in transformed['col.normalized'])
assert all(isinstance(x, float) for x in transformed['col.component'])

reverse = bgmm_transformer.reverse_transform(transformed)
np.testing.assert_array_almost_equal(reverse, data, decimal=1)
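
A minimal usage sketch distilled from the integration tests above (not part of this diff); the output column names follow the transformer's <column>.<output> prefixing, and the round trip is approximate:

import numpy as np
import pandas as pd

from rdt.transformers.numerical import BayesGMMTransformer

# Bimodal data, as in generate_data above.
data = pd.DataFrame({'col': np.concatenate([
    np.random.normal(loc=5, scale=1, size=100),
    np.random.normal(loc=-5, scale=1, size=100),
])})

transformer = BayesGMMTransformer()
transformer.fit(data, list(data.columns))

transformed = transformer.transform(data)   # columns: col.normalized, col.component
recovered = transformer.reverse_transform(transformed)

np.testing.assert_array_almost_equal(recovered, data, decimal=1)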
5 changes: 5 additions & 0 deletions tests/performance/test_performance.py
@@ -7,6 +7,7 @@
import pytest

from rdt.transformers import get_transformers_by_type
from rdt.transformers.numerical import BayesGMMTransformer
from tests.datasets import get_dataset_generators_by_type
from tests.performance.profiling import profile_transformer

@@ -68,6 +69,7 @@ def _get_dataset_sizes(data_type):
def _get_performance_test_cases():
"""Get all the (transformer, dataset_generator) combinations for testing."""
all_test_cases = []
sandbox = [BayesGMMTransformer]

dataset_generators = get_dataset_generators_by_type()
transformers = get_transformers_by_type()
@@ -76,6 +78,9 @@ def _get_performance_test_cases():
dataset_generators_for_type = dataset_generators.get(data_type, [])

for transformer in transformers_for_type:
if transformer in sandbox:
continue

for dataset_generator in dataset_generators_for_type:
all_test_cases.append((transformer, dataset_generator))
