Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update categorical transformers #231

Merged
merged 36 commits into from
Sep 28, 2021
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
8ea16a8
Updates the baseclass
fealho Sep 20, 2021
ae84cce
Update categorical transformers
fealho Sep 20, 2021
817091e
Addresses feedback
fealho Sep 20, 2021
dfde8b6
Makes get_input_type a class method
fealho Sep 21, 2021
fc7f3af
Fix documentation
fealho Sep 21, 2021
edbaece
Fix one line bug
fealho Sep 21, 2021
1a5341b
Various improvements + some test cases
fealho Sep 21, 2021
5ae8900
Create _add_prefix
fealho Sep 21, 2021
9fc03ad
Return data if columns not in data
fealho Sep 21, 2021
cd78303
Multiple improvements + test cases
fealho Sep 21, 2021
5b98d5d
Added test cases
fealho Sep 21, 2021
1781f99
Merge branch 'master' into update-baseclass
fealho Sep 21, 2021
adeb176
Fix lint
fealho Sep 22, 2021
3e0040b
Remove test cases.
fealho Sep 22, 2021
24b4313
Fix a bunch of bugs
fealho Sep 22, 2021
2796fa6
Addresses feedback
fealho Sep 22, 2021
fd5fda7
Update baseclass fixes (#242)
csala Sep 22, 2021
94c7c6f
Fix lint + general improvements
fealho Sep 22, 2021
537bc40
Fix merge conflicts
fealho Sep 22, 2021
f08a2c4
Fix lint
fealho Sep 22, 2021
3ed571a
Merge branch 'update-baseclass' into update-categorical-transformer
fealho Sep 23, 2021
2d5706d
Tentative updates
fealho Sep 23, 2021
f0e31f9
Merge branch 'v0.6.0-dev' into update-categorical-transformer
fealho Sep 23, 2021
113af81
Fix categorical
fealho Sep 23, 2021
4a15111
Merge branch 'v0.6.0-dev' into update-categorical-transformer
fealho Sep 23, 2021
b893203
Working version of categorical (only hypertransformer fails)
fealho Sep 23, 2021
a362571
Fix lint
fealho Sep 23, 2021
9fda2a8
Fix lint
fealho Sep 23, 2021
cebca46
Fix small errors
fealho Sep 23, 2021
c244772
Lint
fealho Sep 23, 2021
b3faf23
Merge branch 'v0.6.0-dev' into update-categorical-transformer
fealho Sep 24, 2021
a5bfa8d
Fix small errors
fealho Sep 24, 2021
2a975ad
Fix hypertransformer
fealho Sep 24, 2021
59b5b21
Address feedback
fealho Sep 27, 2021
d3e7b52
Merge branch 'v0.6.0-dev' into update-categorical-transformer
fealho Sep 27, 2021
eb1366a
Fix lint
fealho Sep 27, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 62 additions & 35 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ class CategoricalTransformer(BaseTransformer):
Defaults to ``False``.
"""

INPUT_TYPE = 'categorical'
OUTPUT_TYPES = {'value': 'float'}
DETERMINISTIC_TRANSFORM = False
fealho marked this conversation as resolved.
Show resolved Hide resolved
DETERMINISTIC_REVERSE = True
COMPOSITION_IS_IDENTITY = True

mapping = None
intervals = None
starts = None
Expand Down Expand Up @@ -93,7 +99,7 @@ def _get_intervals(data):

return intervals, means, starts

def fit(self, data):
def _fit(self, data):
"""Fit the transformer to the data.

Create the mapping dict to save the label encoding.
Expand Down Expand Up @@ -145,9 +151,9 @@ def _get_value(self, category):

def _transform_by_row(self, data):
"""Transform the data row by row."""
return data.fillna(np.nan).apply(self._get_value).to_numpy()
return data.fillna(np.nan).apply(self._get_value)
fealho marked this conversation as resolved.
Show resolved Hide resolved

def transform(self, data):
def _transform(self, data):
"""Transform categorical values to float values.

Replace the categories with their float representative value.
Expand Down Expand Up @@ -210,7 +216,7 @@ def _reverse_transform_by_row(self, data):
"""Reverse transform the data by iterating over each row."""
return data.apply(self._get_category_from_start).astype(self.dtype)

def reverse_transform(self, data):
def _reverse_transform(self, data):
"""Convert float values back to the original categorical values.

Args:
Expand Down Expand Up @@ -259,6 +265,10 @@ class OneHotEncodingTransformer(BaseTransformer):
transform, then an error will be raised if this is True.
"""

INPUT_TYPE = 'categorical'
DETERMINISTIC_TRANSFORM = True
DETERMINISTIC_REVERSE = True

dummies = None
_dummy_na = None
_num_dummies = None
Expand All @@ -277,7 +287,7 @@ def _prepare_data(data):
otherwise returns it.

Args:
data (pandas.Series, numpy.ndarray, list or list of lists):
data (pandas.Series or pandas.DataFrame):
Data to prepare.

Returns:
Expand All @@ -296,33 +306,24 @@ def _prepare_data(data):

return data

def _transform(self, data):
if self._dummy_encoded:
coder = self._indexer
codes = pd.Categorical(data, categories=self._uniques).codes
else:
coder = self._uniques
codes = data
def get_output_types(self):
"""Return the output types produced by this transformer.

rows = len(data)
dummies = np.broadcast_to(coder, (rows, self._num_dummies))
coded = np.broadcast_to(codes, (self._num_dummies, rows)).T
array = (coded == dummies).astype(int)

if self._dummy_na:
null = np.zeros((rows, 1), dtype=int)
null[pd.isnull(data)] = 1
array = np.append(array, null, axis=1)
Returns:
dict:
Mapping from the transformed column names to the produced data types.
"""
output_types = {f'value{i}': 'float' for i in range(len(self.dummies))}

return array
return self._add_prefix(output_types)

def fit(self, data):
def _fit(self, data):
"""Fit the transformer to the data.

Get the pandas `dummies` which will be used later on for OneHotEncoding.

Args:
data (pandas.Series, numpy.ndarray, list or list of lists):
data (pandas.Series or pandas.DataFrame):
Data to fit the transformer to.
"""
data = self._prepare_data(data)
Expand All @@ -340,7 +341,27 @@ def fit(self, data):
if self._dummy_na:
self.dummies.append(np.nan)

def transform(self, data):
def _transform_helper(self, data):
# Build the 0/1 indicator matrix for ``data``: one row per input value and
# ``self._num_dummies`` columns, plus one trailing null-flag column when
# ``self._dummy_na`` is truthy. Returns a numpy integer array.
# Reads ``_dummy_encoded``, ``_indexer``, ``_uniques``, ``_num_dummies`` and
# ``_dummy_na``; none of them is assigned in this method (presumably set
# while fitting -- TODO confirm).
if self._dummy_encoded:
# Compare integer category codes rather than the raw values.
# ``pd.Categorical`` assigns code -1 to anything that is not listed in
# ``categories`` (missing values included).
# NOTE(review): assumes ``_indexer`` holds the non-negative codes that
# correspond to ``_uniques`` so that -1 never matches -- verify where
# ``_indexer`` is assigned.
coder = self._indexer
codes = pd.Categorical(data, categories=self._uniques).codes
else:
# Compare the raw input values directly against ``_uniques``.
coder = self._uniques
codes = data

rows = len(data)
# Bring both operands to shape (rows, _num_dummies): ``dummies`` repeats
# ``coder`` once per input row, while ``coded`` repeats each input value
# across the columns (built as (_num_dummies, rows), then transposed).
dummies = np.broadcast_to(coder, (rows, self._num_dummies))
coded = np.broadcast_to(codes, (self._num_dummies, rows)).T
# Element-wise equality cast to int puts a 1 in every column whose entry
# equals the row's value; a row with no match stays all zeros.
array = (coded == dummies).astype(int)

if self._dummy_na:
# Append one extra column that is 1 wherever the input value is null.
null = np.zeros((rows, 1), dtype=int)
null[pd.isnull(data)] = 1
array = np.append(array, null, axis=1)

return array

def _transform(self, data):
"""Replace each category with the OneHot vectors.

Args:
Expand All @@ -351,7 +372,7 @@ def transform(self, data):
numpy.ndarray:
"""
data = self._prepare_data(data)
array = self._transform(data)
array = self._transform_helper(data)

if self.error_on_unknown:
unknown = array.sum(axis=1) == 0
Expand All @@ -361,7 +382,7 @@ def transform(self, data):

return array

def reverse_transform(self, data):
def _reverse_transform(self, data):
"""Convert float values back to the original categorical values.

Args:
Expand All @@ -371,6 +392,9 @@ def reverse_transform(self, data):
Returns:
pandas.Series
"""
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if data.ndim == 1:
data = data.reshape(-1, 1)

Expand All @@ -394,10 +418,16 @@ class LabelEncodingTransformer(BaseTransformer):
integer value.
"""

INPUT_TYPE = 'categorical'
OUTPUT_TYPES = {'value': 'integer'}
DETERMINISTIC_TRANSFORM = True
DETERMINISTIC_REVERSE = True
COMPOSITION_IS_IDENTITY = True

values_to_categories = None
categories_to_values = None

def fit(self, data):
def _fit(self, data):
"""Fit the transformer to the data.

Generate a unique integer representation for each category and
Expand All @@ -414,7 +444,7 @@ def fit(self, data):
for value, category in self.values_to_categories.items()
}

def transform(self, data):
def _transform(self, data):
"""Replace each category with its corresponding integer value.

Args:
Expand All @@ -427,9 +457,9 @@ def transform(self, data):
if not isinstance(data, pd.Series):
data = pd.Series(data)

return data.map(self.categories_to_values)
return pd.Series(data).map(self.categories_to_values)

def reverse_transform(self, data):
def _reverse_transform(self, data):
"""Convert float values back to the original categorical values.

Args:
Expand All @@ -439,8 +469,5 @@ def reverse_transform(self, data):
Returns:
pandas.Series
"""
if isinstance(data, np.ndarray) and (data.ndim == 2):
data = data[:, 0]

data = data.clip(min(self.values_to_categories), max(self.values_to_categories))
return pd.Series(data).round().map(self.values_to_categories)
return data.round().map(self.values_to_categories)
2 changes: 1 addition & 1 deletion tests/integration/test_hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def test_dtype_category():

csala marked this conversation as resolved.
Show resolved Hide resolved
rever = ht.reverse_transform(trans)

pd.testing.assert_frame_equal(df, rever)
fealho marked this conversation as resolved.
Show resolved Hide resolved
pd.testing.assert_frame_equal(rever, df)


def test_empty_transformers():
Expand Down
Loading