Skip to content

Commit

Permalink
Update FrequencyEncoder (#386)
Browse files Browse the repository at this point in the history
* Rename transformer

* Update CategoricalTransformer

* Change history

* Rename tests

* Bad implementation that works

* Complete overhaul of the code

* Vectorize _transform

* Limit warning messages to 5 elements per warning
  • Loading branch information
fealho authored and amontanez24 committed Mar 7, 2022
1 parent 8c5262a commit bc6aa44
Show file tree
Hide file tree
Showing 9 changed files with 247 additions and 203 deletions.
4 changes: 2 additions & 2 deletions CONTRIBUTING.rst
Original file line number Diff line number Diff line change
Expand Up @@ -463,8 +463,8 @@ on each dataset, how that score compares to average and whether or not it is acc
In [1]: from tests.contributing import validate_transformer_quality
In [2]: results = validate_transformer_quality('rdt.transformers.CategoricalTransformer') # Replace CategoricalTransformer with your transformer
Validating Quality Tests for transformer CategoricalTransformer
In [2]: results = validate_transformer_quality('rdt.transformers.FrequencyEncoder') # Replace FrequencyEncoder with your transformer
Validating Quality Tests for transformer FrequencyEncoder
SUCCESS: The quality tests were successful.
Expand Down
12 changes: 6 additions & 6 deletions DEVELOPMENT.rst
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,11 @@ Since the ``country_code`` may or may not be present, we can overwrite the
def get_next_transformers(self):
next_transformers = {
'country_code': 'CategoricalTransformer',
'area_code': 'CategoricalTransformer'
'country_code': 'FrequencyEncoder',
'area_code': 'FrequencyEncoder'
}
if self.has_country_code:
next_transformers['country_code'] = 'CategoricalTransformer'
next_transformers['country_code'] = 'FrequencyEncoder'
return self._add_prefix(next_transformers)
Expand Down Expand Up @@ -230,11 +230,11 @@ handles that for us. Let's view the complete class below.
def get_next_transformers(self):
next_transformers = {
'country_code': 'CategoricalTransformer',
'area_code': 'CategoricalTransformer'
'country_code': 'FrequencyEncoder',
'area_code': 'FrequencyEncoder'
}
if self.has_country_code:
next_transformers['country_code'] = 'CategoricalTransformer'
next_transformers['country_code'] = 'FrequencyEncoder'
return self._add_prefix(next_transformers)
Expand Down
4 changes: 2 additions & 2 deletions rdt/hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,10 +308,10 @@ def get_transformer_tree_yaml(self):
transformer: ExampleTransformer instance
outputs: [field1.out1, field1.out2]
field1.out1:
transformer: CategoricalTransformer instance
transformer: FrequencyEncoder instance
outputs: [field1.out1.value]
field1.out2:
transformer: CategoricalTransformer instance
transformer: FrequencyEncoder instance
outputs: [field1.out2.value]
"""
modified_tree = deepcopy(self._transformers_tree)
Expand Down
4 changes: 2 additions & 2 deletions rdt/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from rdt.transformers.base import BaseTransformer
from rdt.transformers.boolean import BooleanTransformer
from rdt.transformers.categorical import CategoricalTransformer
from rdt.transformers.categorical import FrequencyEncoder
from rdt.transformers.datetime import UnixTimestampEncoder
from rdt.transformers.null import NullTransformer
from rdt.transformers.numerical import NumericalTransformer
Expand Down Expand Up @@ -54,7 +54,7 @@ def _import_addons():
'numerical': NumericalTransformer,
'integer': NumericalTransformer(dtype=np.int64),
'float': NumericalTransformer(dtype=np.float64),
'categorical': CategoricalTransformer(fuzzy=True),
'categorical': FrequencyEncoder(add_noise=True),
'boolean': BooleanTransformer,
'datetime': UnixTimestampEncoder,
}
Expand Down
76 changes: 37 additions & 39 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from rdt.transformers.base import BaseTransformer


class CategoricalTransformer(BaseTransformer):
class FrequencyEncoder(BaseTransformer):
"""Transformer for categorical data.
This transformer computes a float representative for each one of the categories
Expand All @@ -27,12 +27,9 @@ class CategoricalTransformer(BaseTransformer):
Null values are considered just another category.
Args:
fuzzy (bool):
add_noise (bool):
Whether to generate gaussian noise around the class representative of each interval
or just use the mean for all the replaced values. Defaults to ``False``.
clip (bool):
If ``True``, clip the values to [0, 1]. Otherwise normalize them using modulo 1.
Defaults to ``False``.
"""

INPUT_TYPE = 'categorical'
Expand All @@ -57,9 +54,8 @@ def __setstate__(self, state):

self.__dict__ = state

def __init__(self, fuzzy=False, clip=False):
self.fuzzy = fuzzy
self.clip = clip
def __init__(self, add_noise=False):
self.add_noise = add_noise

def is_transform_deterministic(self):
"""Return whether the transform is deterministic.
Expand All @@ -68,7 +64,7 @@ def is_transform_deterministic(self):
bool:
Whether or not the transform is deterministic.
"""
return not self.fuzzy
return not self.add_noise

def is_composition_identity(self):
"""Return whether composition of transform and reverse transform produces the input data.
Expand All @@ -77,7 +73,7 @@ def is_composition_identity(self):
bool:
Whether or not transforming and then reverse transforming returns the input data.
"""
return self.COMPOSITION_IS_IDENTITY and not self.fuzzy
return self.COMPOSITION_IS_IDENTITY and not self.add_noise

@staticmethod
def _get_intervals(data):
Expand Down Expand Up @@ -122,18 +118,14 @@ def _get_intervals(data):
def _fit(self, data):
"""Fit the transformer to the data.
Create the mapping dict to save the label encoding.
Finally, compute the intervals for each categorical value.
Compute the intervals for each categorical value.
Args:
data (pandas.Series):
Data to fit the transformer to.
"""
self.mapping = {}
self.dtype = data.dtype

self.intervals, self.means, self.starts = self._get_intervals(data)
self._get_category_from_index = list(self.means.index).__getitem__

def _transform_by_category(self, data):
"""Transform the data by iterating over the different categories."""
Expand All @@ -147,7 +139,7 @@ def _transform_by_category(self, data):
else:
mask = (data.to_numpy() == category)

if self.fuzzy:
if self.add_noise:
result[mask] = norm.rvs(mean, std, size=mask.sum())
else:
result[mask] = mean
Expand All @@ -161,7 +153,7 @@ def _get_value(self, category):

mean, std = self.intervals[category][2:]

if self.fuzzy:
if self.add_noise:
return norm.rvs(mean, std)

return mean
Expand All @@ -171,9 +163,7 @@ def _transform_by_row(self, data):
return data.fillna(np.nan).apply(self._get_value).to_numpy()

def _transform(self, data):
"""Transform categorical values to float values.
Replace the categories with their float representative value.
"""Transform the categorical values to float representatives.
Args:
data (pandas.Series):
Expand All @@ -182,21 +172,25 @@ def _transform(self, data):
Returns:
numpy.ndarray
"""
fit_categories = pd.Series(self.intervals.keys())
has_nan = pd.isna(fit_categories).any()
unseen_indexes = ~(data.isin(fit_categories) | (pd.isna(data) & has_nan))
if unseen_indexes.any():
# Show only the distinct values among the first 5 unseen rows to avoid flooding the console.
unseen_categories = set(data[unseen_indexes][:5])
warnings.warn(
f'The data contains {unseen_indexes.sum()} new categories that were not '
f'seen in the original data (examples: {unseen_categories}). Assigning '
'them random values. If you want to model new categories, '
'please fit the transformer again with the new data.'
)

data[unseen_indexes] = np.random.choice(fit_categories, size=unseen_indexes.size)
if len(self.means) < len(data):
return self._transform_by_category(data)

return self._transform_by_row(data)

def _normalize(self, data):
"""Normalize data to the range [0, 1].
This is done by either clipping or computing the values modulo 1.
"""
if self.clip:
return data.clip(0, 1)

return data % 1

def _reverse_transform_by_matrix(self, data):
"""Reverse transform the data with matrix operations."""
num_rows = len(data)
Expand Down Expand Up @@ -240,8 +234,7 @@ def _reverse_transform(self, data):
Returns:
pandas.Series
"""
data = self._normalize(data)

data = data.clip(0, 1)
num_rows = len(data)
num_categories = len(self.means)

Expand Down Expand Up @@ -375,10 +368,13 @@ def _transform(self, data):
unique_data = {np.nan if pd.isna(x) else x for x in pd.unique(data)}
unseen_categories = unique_data - set(self.dummies)
if unseen_categories:
# Select only the first 5 unseen categories to avoid flooding the console.
examples_unseen_categories = set(list(unseen_categories)[:5])
warnings.warn(
f'Warning: The data contains new categories {unseen_categories} that were not '
'seen in the original data. Creating a vector of all 0s. If you want to model '
'new categories, please fit the transformer again with the new data.'
f'The data contains {len(unseen_categories)} new categories that were not '
f'seen in the original data (examples: {examples_unseen_categories}). Creating '
'a vector of all 0s. If you want to model new categories, '
'please fit the transformer again with the new data.'
)

return self._transform_helper(data)
Expand Down Expand Up @@ -462,11 +458,13 @@ def _transform(self, data):
mapped = data.fillna(np.nan).map(self.categories_to_values)
is_null = mapped.isna()
if is_null.any():
unseen_categories = set(data[is_null])
# Show only the distinct values among the first 5 unseen rows to avoid flooding the console.
unseen_categories = set(data[is_null][:5])
warnings.warn(
f'Warning: The data contains new categories {unseen_categories} that were not '
'seen in the original data. Assigning them random values. If you want to model '
'new categories, please fit the transformer again with the new data.'
f'The data contains {is_null.sum()} new categories that were not '
f'seen in the original data (examples: {unseen_categories}). Assigning '
'them random values. If you want to model new categories, '
'please fit the transformer again with the new data.'
)

mapped[is_null] = np.random.randint(
Expand Down
18 changes: 9 additions & 9 deletions tests/integration/test_hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from rdt import HyperTransformer
from rdt.transformers import (
DEFAULT_TRANSFORMERS, BaseTransformer, BooleanTransformer, CategoricalTransformer,
DEFAULT_TRANSFORMERS, BaseTransformer, BooleanTransformer, FrequencyEncoder,
NumericalTransformer, OneHotEncoder, UnixTimestampEncoder)


Expand Down Expand Up @@ -95,7 +95,7 @@ def get_transformed_data():


DETERMINISTIC_DEFAULT_TRANSFORMERS = deepcopy(DEFAULT_TRANSFORMERS)
DETERMINISTIC_DEFAULT_TRANSFORMERS['categorical'] = CategoricalTransformer
DETERMINISTIC_DEFAULT_TRANSFORMERS['categorical'] = FrequencyEncoder


@patch('rdt.transformers.DEFAULT_TRANSFORMERS', DETERMINISTIC_DEFAULT_TRANSFORMERS)
Expand All @@ -107,7 +107,7 @@ def test_hypertransformer_default_inputs():
transformers to use for each field.
Setup:
- Patch the DEFAULT_TRANSFORMERS to use the `CategoricalTransformer`
- Patch the DEFAULT_TRANSFORMERS to use the `FrequencyEncoder`
for categorical data types, so that the output is predictable.
Input:
Expand Down Expand Up @@ -137,13 +137,13 @@ def test_hypertransformer_default_inputs():
assert ht._transformers_tree['integer']['outputs'] == ['integer.value']
assert isinstance(ht._transformers_tree['float']['transformer'], NumericalTransformer)
assert ht._transformers_tree['float']['outputs'] == ['float.value']
assert isinstance(ht._transformers_tree['categorical']['transformer'], CategoricalTransformer)
assert isinstance(ht._transformers_tree['categorical']['transformer'], FrequencyEncoder)
assert ht._transformers_tree['categorical']['outputs'] == ['categorical.value']
assert isinstance(ht._transformers_tree['bool']['transformer'], BooleanTransformer)
assert ht._transformers_tree['bool']['outputs'] == ['bool.value']
assert isinstance(ht._transformers_tree['datetime']['transformer'], UnixTimestampEncoder)
assert ht._transformers_tree['datetime']['outputs'] == ['datetime.value']
assert isinstance(ht._transformers_tree['names']['transformer'], CategoricalTransformer)
assert isinstance(ht._transformers_tree['names']['transformer'], FrequencyEncoder)
assert ht._transformers_tree['names']['outputs'] == ['names.value']


Expand Down Expand Up @@ -171,11 +171,11 @@ def test_hypertransformer_field_transformers():
field_transformers = {
'integer': NumericalTransformer(dtype=np.int64),
'float': NumericalTransformer(dtype=float),
'categorical': CategoricalTransformer,
'categorical': FrequencyEncoder,
'bool': BooleanTransformer,
'datetime': DummyTransformerNotMLReady,
'datetime.value': CategoricalTransformer,
'names': CategoricalTransformer
'datetime.value': FrequencyEncoder,
'names': FrequencyEncoder
}
data = get_input_data()

Expand Down Expand Up @@ -254,7 +254,7 @@ def test_with_unfitted_columns():
"""HyperTransform should be able to transform even if there are unseen columns in data."""
# Setup
data = get_input_data()
ht = HyperTransformer(default_data_type_transformers={'categorical': CategoricalTransformer})
ht = HyperTransformer(default_data_type_transformers={'categorical': FrequencyEncoder})
ht.fit(data)

# Run
Expand Down
Loading

0 comments on commit bc6aa44

Please sign in to comment.