Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create IDGenerator transformer #680

Merged
merged 7 commits into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion rdt/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from rdt.transformers.null import NullTransformer
from rdt.transformers.numerical import ClusterBasedNormalizer, FloatFormatter, GaussianNormalizer
from rdt.transformers.pii.anonymizer import AnonymizedFaker, PseudoAnonymizedFaker
from rdt.transformers.text import RegexGenerator
from rdt.transformers.text import IDGenerator, RegexGenerator

__all__ = [
'BaseTransformer',
Expand All @@ -36,6 +36,7 @@
'RegexGenerator',
'AnonymizedFaker',
'PseudoAnonymizedFaker',
'IDGenerator',
'get_transformer_name',
'get_transformer_class',
'get_transformers_by_type',
Expand Down
4 changes: 3 additions & 1 deletion rdt/transformers/pii/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,9 @@ def get_supported_sdtypes(cls):
list:
Accepted input sdtypes of the transformer.
"""
unsupported_sdtypes = {'numerical', 'datetime', 'categorical', 'boolean', 'text', None}
unsupported_sdtypes = {
'numerical', 'datetime', 'categorical', 'boolean', 'text', None
}
all_sdtypes = {cls.INPUT_SDTYPE}
for transformer in BaseTransformer.get_subclasses():
if not issubclass(transformer, cls):
Expand Down
60 changes: 60 additions & 0 deletions rdt/transformers/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,72 @@
import warnings

import numpy as np
import pandas as pd

from rdt.errors import TransformerProcessingError
from rdt.transformers.base import BaseTransformer
from rdt.transformers.utils import strings_from_regex


class IDGenerator(BaseTransformer):
    """Transformer that produces a sequential ID column.

    Each generated ID is the concatenation of an optional prefix, an increasing
    number and an optional suffix. The numbering continues across successive
    ``_reverse_transform`` calls until ``reset_randomization`` is called.

    Args:
        prefix (str):
            Text placed in front of every generated number.
            Defaults to ``None``, meaning no prefix is added.
        starting_value (int):
            Number used for the first generated ID.
            Defaults to ``0``.
        suffix (str):
            Text placed after every generated number.
            Defaults to ``None``, meaning no suffix is added.
    """

    IS_GENERATOR = True
    INPUT_SDTYPE = 'text'

    def __init__(self, prefix=None, starting_value=0, suffix=None):
        super().__init__()
        self.prefix = prefix
        self.starting_value = starting_value
        self.suffix = suffix
        # Number of IDs handed out since creation or since the last reset.
        self._counter = 0
        self.output_properties = {None: {'next_transformer': None}}

    def reset_randomization(self):
        """Set the internal counter back to zero.

        After this call the next generated ID uses ``starting_value`` again.
        """
        self._counter = 0

    def _fit(self, data):
        """Do nothing; no information is learned from ``data``."""

    def _transform(self, _data):
        """Return ``None`` so that the input column is dropped."""
        return None

    def _reverse_transform(self, data):
        """Build one new ID per row of ``data``.

        Only the length of ``data`` is used; its values are ignored.

        Args:
            data (pd.Series or numpy.ndarray):
                Data whose length determines how many IDs are generated.

        Returns:
            pd.Series:
                The generated IDs, as strings.
        """
        n_rows = len(data)
        first_id = self.starting_value + self._counter
        head = '' if self.prefix is None else self.prefix
        tail = '' if self.suffix is None else self.suffix

        ids = []
        for offset in range(n_rows):
            ids.append(f'{head}{first_id + offset}{tail}')

        # Remember how many IDs were produced so the next call continues the sequence.
        self._counter += n_rows

        return pd.Series(ids)


class RegexGenerator(BaseTransformer):
"""RegexGenerator transformer.

Expand Down
41 changes: 40 additions & 1 deletion tests/integration/transformers/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,46 @@
import numpy as np
import pandas as pd

from rdt.transformers.text import RegexGenerator
from rdt.transformers.text import IDGenerator, RegexGenerator


class TestIDGenerator():
    """Integration tests for the ``IDGenerator`` transformer."""

    def test_end_to_end(self):
        """Run ``IDGenerator`` through fit, transform, reverse transform and reset.

        The ``id`` column must be dropped on transform. Two consecutive reverse
        transforms must continue the numbering, and a reverse transform after
        ``reset_randomization`` must reproduce the first result.
        """
        # Setup
        usernames = ['a', 'b', 'c', 'd', 'e']
        data = pd.DataFrame({
            'id': [1, 2, 3, 4, 5],
            'username': usernames
        })
        generator = IDGenerator(prefix='id_', starting_value=100, suffix='_X')

        # Run
        transformed = generator.fit_transform(data, 'id')
        first_pass = generator.reverse_transform(transformed)
        second_pass = generator.reverse_transform(transformed)
        generator.reset_randomization()
        after_reset = generator.reverse_transform(transformed)

        # Assert
        expected_transformed = pd.DataFrame({'username': usernames})

        expected_first_pass = pd.DataFrame({
            'username': usernames,
            'id': [f'id_{number}_X' for number in range(100, 105)]
        })

        expected_second_pass = pd.DataFrame({
            'username': usernames,
            'id': [f'id_{number}_X' for number in range(105, 110)]
        })

        pd.testing.assert_frame_equal(transformed, expected_transformed)
        pd.testing.assert_frame_equal(first_pass, expected_first_pass)
        pd.testing.assert_frame_equal(second_pass, expected_second_pass)
        pd.testing.assert_frame_equal(after_reset, expected_first_pass)


def test_regexgenerator():
Expand Down
111 changes: 110 additions & 1 deletion tests/unit/transformers/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pytest

from rdt.errors import TransformerProcessingError
from rdt.transformers.text import RegexGenerator
from rdt.transformers.text import IDGenerator, RegexGenerator


class AsciiGenerator:
Expand All @@ -32,6 +32,115 @@ def __next__(self):
return char


class TestIDGenerator:
    """Unit tests for the ``IDGenerator`` transformer."""

    def test___init__default(self):
        """Check the attributes set by ``__init__`` when no arguments are given."""
        # Run
        instance = IDGenerator()

        # Assert
        assert instance.prefix is None
        assert instance.starting_value == 0
        assert instance.suffix is None
        assert instance._counter == 0
        assert instance.output_properties == {None: {'next_transformer': None}}

    def test___init__with_parameters(self):
        """Check the attributes set by ``__init__`` for each parameter combination."""
        # Setup
        expected_properties = {None: {'next_transformer': None}}

        # Run
        only_prefix = IDGenerator(prefix='prefix_')
        only_suffix = IDGenerator(suffix='_suffix')
        only_start = IDGenerator(starting_value=10)
        everything = IDGenerator(prefix='prefix_', starting_value=10, suffix='_suffix')

        # Assert
        assert only_prefix.prefix == 'prefix_'
        assert only_prefix.starting_value == 0
        assert only_prefix.suffix is None
        assert only_prefix._counter == 0
        assert only_prefix.output_properties == expected_properties

        assert only_suffix.prefix is None
        assert only_suffix.starting_value == 0
        assert only_suffix.suffix == '_suffix'
        assert only_suffix._counter == 0
        assert only_suffix.output_properties == expected_properties

        assert only_start.prefix is None
        assert only_start.starting_value == 10
        assert only_start.suffix is None
        assert only_start._counter == 0
        assert only_start.output_properties == expected_properties

        assert everything.prefix == 'prefix_'
        assert everything.starting_value == 10
        assert everything.suffix == '_suffix'
        assert everything._counter == 0
        assert everything.output_properties == expected_properties

    def test_reset_randomization(self):
        """Check that ``reset_randomization`` puts the counter back to zero."""
        # Setup
        instance = IDGenerator()
        instance._counter = 10

        # Run
        instance.reset_randomization()

        # Assert
        assert instance._counter == 0

    def test__fit(self):
        """Check that ``_fit`` can be called without raising."""
        # Setup
        instance = IDGenerator()

        # Run
        instance._fit(None)

        # Assert
        # Reaching this line means ``_fit`` did not raise.
        assert True

    def test__transform(self):
        """Check that ``_transform`` returns ``None``."""
        # Setup
        instance = IDGenerator()

        # Run
        output = instance._transform(None)

        # Assert
        assert output is None

    def test__reverse_transform(self):
        """Check ``_reverse_transform`` with default arguments and a non-zero counter."""
        # Setup
        instance = IDGenerator()
        instance._counter = 10
        rows = np.array([1, 2, 3])

        # Run
        output = instance._reverse_transform(rows)

        # Assert
        expected_ids = ['10', '11', '12']
        assert isinstance(output, pd.Series)
        assert output.tolist() == expected_ids
        assert instance._counter == 13

    def test__reverse_transform_with_everything(self):
        """Check ``_reverse_transform`` when prefix, starting value and suffix are all set."""
        # Setup
        instance = IDGenerator(prefix='prefix_', starting_value=100, suffix='_suffix')
        rows = np.array([1, 2, 3])

        # Run
        output = instance._reverse_transform(rows)

        # Assert
        expected_ids = ['prefix_100_suffix', 'prefix_101_suffix', 'prefix_102_suffix']
        assert isinstance(output, pd.Series)
        assert output.tolist() == expected_ids
        assert instance._counter == 3


class TestRegexGenerator:
"""Test class for ``RegexGenerator``."""

Expand Down
Loading