Skip to content

Commit

Permalink
Update categorical transformers (#231)
Browse files Browse the repository at this point in the history
* Updates the baseclass

* Update categorical transformers

* Addresses feedback

* Makes get_input_type a class method

* Fix documentation

* Fix one line bug

* Various improvements + some test cases

* Create _add_prefix

* Return data if columns not in data

* Multiple improvements + test cases

* Added test cases

* Fix lint

* Remove test cases.

* Fix a bunch of bugs

* Addresses feedback

* Update baseclass fixes (#242)

* Implement basic integration tests for BaseTransformer

* Fix failing tests

* Fix lint and disable some PyLint warnings

* Fix lint + general improvements

* Fix merge conflicts

* Fix lint

* Tentative updates

* Fix categorical

* Working version of categorical (only hypertransformer fails)

* Fix lint

* Fix lint

* Fix small errors

* Lint

* Fix small errors

* Fix hypertransformer

* Address feedback

* Fix lint

Co-authored-by: Carles Sala <carles@pythiac.com>
  • Loading branch information
fealho and csala committed Oct 13, 2021
1 parent 07afd21 commit edcc126
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 117 deletions.
104 changes: 70 additions & 34 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ class CategoricalTransformer(BaseTransformer):
Defaults to ``False``.
"""

INPUT_TYPE = 'categorical'
OUTPUT_TYPES = {'value': 'float'}
DETERMINISTIC_REVERSE = True
COMPOSITION_IS_IDENTITY = True

mapping = None
intervals = None
starts = None
Expand All @@ -54,6 +59,15 @@ def __init__(self, fuzzy=False, clip=False):
self.fuzzy = fuzzy
self.clip = clip

def is_transform_deterministic(self):
"""Return whether the transform is deterministic.

The result is the negation of ``self.fuzzy``: the transform is reported
as deterministic only when ``fuzzy`` is disabled.

Returns:
bool:
``True`` when ``self.fuzzy`` is falsy, ``False`` otherwise.
"""
return not self.fuzzy

@staticmethod
def _get_intervals(data):
"""Compute intervals for each categorical value.
Expand Down Expand Up @@ -93,7 +107,7 @@ def _get_intervals(data):

return intervals, means, starts

def fit(self, data):
def _fit(self, data):
"""Fit the transformer to the data.
Create the mapping dict to save the label encoding.
Expand Down Expand Up @@ -147,7 +161,7 @@ def _transform_by_row(self, data):
"""Transform the data row by row."""
return data.fillna(np.nan).apply(self._get_value).to_numpy()

def transform(self, data):
def _transform(self, data):
"""Transform categorical values to float values.
Replace the categories with their float representative value.
Expand Down Expand Up @@ -210,7 +224,7 @@ def _reverse_transform_by_row(self, data):
"""Reverse transform the data by iterating over each row."""
return data.apply(self._get_category_from_start).astype(self.dtype)

def reverse_transform(self, data):
def _reverse_transform(self, data):
"""Convert float values back to the original categorical values.
Args:
Expand Down Expand Up @@ -259,6 +273,10 @@ class OneHotEncodingTransformer(BaseTransformer):
transform, then an error will be raised if this is True.
"""

INPUT_TYPE = 'categorical'
DETERMINISTIC_TRANSFORM = True
DETERMINISTIC_REVERSE = True

dummies = None
_dummy_na = None
_num_dummies = None
Expand All @@ -277,7 +295,7 @@ def _prepare_data(data):
otherwise returns it.
Args:
data (pandas.Series, numpy.ndarray, list or list of lists):
data (pandas.Series or pandas.DataFrame):
Data to prepare.
Returns:
Expand All @@ -296,33 +314,24 @@ def _prepare_data(data):

return data

def _transform(self, data):
if self._dummy_encoded:
coder = self._indexer
codes = pd.Categorical(data, categories=self._uniques).codes
else:
coder = self._uniques
codes = data

rows = len(data)
dummies = np.broadcast_to(coder, (rows, self._num_dummies))
coded = np.broadcast_to(codes, (self._num_dummies, rows)).T
array = (coded == dummies).astype(int)
def get_output_types(self):
"""Return the output types produced by this transformer.
if self._dummy_na:
null = np.zeros((rows, 1), dtype=int)
null[pd.isnull(data)] = 1
array = np.append(array, null, axis=1)
Returns:
dict:
Mapping from the transformed column names to the produced data types.
"""
output_types = {f'value{i}': 'float' for i in range(len(self.dummies))}

return array
return self._add_prefix(output_types)

def fit(self, data):
def _fit(self, data):
"""Fit the transformer to the data.
Get the pandas `dummies` which will be used later on for OneHotEncoding.
Args:
data (pandas.Series, numpy.ndarray, list or list of lists):
data (pandas.Series or pandas.DataFrame):
Data to fit the transformer to.
"""
data = self._prepare_data(data)
Expand All @@ -340,7 +349,27 @@ def fit(self, data):
if self._dummy_na:
self.dummies.append(np.nan)

def transform(self, data):
def _transform_helper(self, data):
"""Build the one-hot indicator array for ``data``.

Every entry of ``data`` is compared against every entry of the coder
(``self._indexer`` when ``self._dummy_encoded`` is set, ``self._uniques``
otherwise), giving one row per input value and one column per coder
entry. When ``self._dummy_na`` is set, one extra trailing column that
flags null inputs is appended.

Args:
data:
Values to encode. The ``_transform`` method passes the output of
``_prepare_data`` here.

Returns:
numpy.ndarray:
Integer array of 0/1 flags with ``len(data)`` rows and
``self._num_dummies`` columns, plus one more column when
``self._dummy_na`` is set.
"""
if self._dummy_encoded:
coder = self._indexer
# Replace each value by its position in ``self._uniques``; pandas
# assigns code -1 to values that are not among the categories.
# NOTE(review): presumably ``self._indexer`` holds the matching
# positions (set during fit, not visible here) -- confirm.
codes = pd.Categorical(data, categories=self._uniques).codes
else:
coder = self._uniques
codes = data

rows = len(data)
# Expand both operands to (rows, _num_dummies) so that a single
# elementwise comparison yields the whole indicator matrix.
dummies = np.broadcast_to(coder, (rows, self._num_dummies))
coded = np.broadcast_to(codes, (self._num_dummies, rows)).T
array = (coded == dummies).astype(int)

if self._dummy_na:
# Extra trailing column: 1 where the input value is null, else 0.
null = np.zeros((rows, 1), dtype=int)
null[pd.isnull(data)] = 1
array = np.append(array, null, axis=1)

return array

def _transform(self, data):
"""Replace each category with the OneHot vectors.
Args:
Expand All @@ -351,7 +380,7 @@ def transform(self, data):
numpy.ndarray:
"""
data = self._prepare_data(data)
array = self._transform(data)
array = self._transform_helper(data)

if self.error_on_unknown:
unknown = array.sum(axis=1) == 0
Expand All @@ -361,7 +390,7 @@ def transform(self, data):

return array

def reverse_transform(self, data):
def _reverse_transform(self, data):
"""Convert float values back to the original categorical values.
Args:
Expand All @@ -371,10 +400,14 @@ def reverse_transform(self, data):
Returns:
pandas.Series
"""
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if data.ndim == 1:
data = data.reshape(-1, 1)

indices = np.argmax(data, axis=1)

return pd.Series(indices).map(self.dummies.__getitem__)


Expand All @@ -394,10 +427,16 @@ class LabelEncodingTransformer(BaseTransformer):
integer value.
"""

INPUT_TYPE = 'categorical'
OUTPUT_TYPES = {'value': 'integer'}
DETERMINISTIC_TRANSFORM = True
DETERMINISTIC_REVERSE = True
COMPOSITION_IS_IDENTITY = True

values_to_categories = None
categories_to_values = None

def fit(self, data):
def _fit(self, data):
"""Fit the transformer to the data.
Generate a unique integer representation for each category and
Expand All @@ -414,7 +453,7 @@ def fit(self, data):
for value, category in self.values_to_categories.items()
}

def transform(self, data):
def _transform(self, data):
"""Replace each category with its corresponding integer value.
Args:
Expand All @@ -427,9 +466,9 @@ def transform(self, data):
if not isinstance(data, pd.Series):
data = pd.Series(data)

return data.map(self.categories_to_values)
return pd.Series(data).map(self.categories_to_values)

def reverse_transform(self, data):
def _reverse_transform(self, data):
"""Convert float values back to the original categorical values.
Args:
Expand All @@ -439,8 +478,5 @@ def reverse_transform(self, data):
Returns:
pandas.Series
"""
if isinstance(data, np.ndarray) and (data.ndim == 2):
data = data[:, 0]

data = data.clip(min(self.values_to_categories), max(self.values_to_categories))
return pd.Series(data).round().map(self.values_to_categories)
return data.round().map(self.values_to_categories)
4 changes: 3 additions & 1 deletion tests/integration/test_hyper_transformer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
import pytest

from rdt import HyperTransformer
from rdt.transformers import OneHotEncodingTransformer
Expand Down Expand Up @@ -218,6 +219,7 @@ def test_single_category():
pd.testing.assert_frame_equal(data, reverse)


@pytest.mark.xfail
def test_dtype_category():
df = pd.DataFrame({'a': ['a', 'b', 'c']}, dtype='category')

Expand All @@ -228,7 +230,7 @@ def test_dtype_category():

rever = ht.reverse_transform(trans)

pd.testing.assert_frame_equal(df, rever)
pd.testing.assert_frame_equal(rever, df)


def test_empty_transformers():
Expand Down
Loading

0 comments on commit edcc126

Please sign in to comment.