Skip to content

Commit

Permalink
Add enforce_uniqueness to AnonymizedFaker and RegexGenerator (#548)
Browse files Browse the repository at this point in the history

* Finalize and add tests

* Add missing docstring

* Address comments

* Address comment
  • Loading branch information
pvk-developer authored Sep 13, 2022
1 parent efc941e commit c3fcb99
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 16 deletions.
33 changes: 23 additions & 10 deletions rdt/transformers/pii/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ class AnonymizedFaker(BaseTransformer):
will be created only if there are null values. If ``True``, create the new column if
there are null values. If ``False``, do not create the new column even if there
are null values. Defaults to ``False``.
enforce_uniqueness (bool):
Whether or not to ensure that the new anonymized data is all unique. If it isn't
possible to create the requested number of rows, then an ``Error`` will be raised.
Defaults to ``False``.
"""

DETERMINISTIC_TRANSFORM = False
Expand Down Expand Up @@ -94,8 +98,10 @@ def _check_locales(self):
)

def __init__(self, provider_name=None, function_name=None, function_kwargs=None,
locales=None, missing_value_replacement=None, model_missing_values=False):
locales=None, missing_value_replacement=None,
model_missing_values=False, enforce_uniqueness=False):
self.data_length = None
self.enforce_uniqueness = enforce_uniqueness
self.provider_name = provider_name if provider_name else 'BaseProvider'
if self.provider_name != 'BaseProvider' and function_name is None:
raise Error(
Expand All @@ -117,6 +123,9 @@ def __init__(self, provider_name=None, function_name=None, function_kwargs=None,

def _function(self):
"""Return a callable ``faker`` function."""
if self.enforce_uniqueness:
return getattr(self.faker.unique, self.function_name)(**self.function_kwargs)

return getattr(self.faker, self.function_name)(**self.function_kwargs)

def get_output_sdtypes(self):
Expand Down Expand Up @@ -179,10 +188,17 @@ def _reverse_transform(self, data):
else:
sample_size = self.data_length

reverse_transformed = np.array([
self._function()
for _ in range(sample_size)
], dtype=object)
try:
reverse_transformed = np.array([
self._function()
for _ in range(sample_size)
], dtype=object)
except faker.exceptions.UniquenessException as exception:
raise Error(
f'The Faker function you specified is not able to generate {sample_size} unique '
'values. Please use a different Faker function for column '
f"('{self.get_input_column()}')."
) from exception

if self.null_transformer.models_missing_values():
reverse_transformed = np.column_stack((reverse_transformed, data))
Expand Down Expand Up @@ -234,7 +250,7 @@ class PseudoAnonymizedFaker(AnonymizedFaker):
"""

OUTPUT_SDTYPES = {'value': 'categorical'}
NEXT_TRANSFORMER = {
NEXT_TRANSFORMERS = {
'value': LabelEncoder(add_noise=True)
}

Expand All @@ -254,14 +270,11 @@ def __init__(self, provider_name=None, function_name=None, function_kwargs=None,
function_name=function_name,
function_kwargs=function_kwargs,
locales=locales,
enforce_uniqueness=True
)
self._mapping_dict = {}
self._reverse_mapping_dict = {}

def _function(self):
"""Return a callable ``faker`` function."""
return getattr(self.faker.unique, self.function_name)(**self.function_kwargs)

def get_mapping(self):
"""Return the mapping dictionary."""
return deepcopy(self._mapping_dict)
Expand Down
15 changes: 14 additions & 1 deletion rdt/transformers/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np

from rdt.errors import Error
from rdt.transformers.base import BaseTransformer
from rdt.transformers.null import NullTransformer
from rdt.transformers.utils import strings_from_regex
Expand All @@ -28,6 +29,10 @@ class RegexGenerator(BaseTransformer):
will be created only if there are null values. If ``True``, create the new column if
there are null values. If ``False``, do not create the new column even if there
are null values. Defaults to ``False``.
enforce_uniqueness (bool):
Whether or not to ensure that the new generated data is all unique. If it isn't
possible to create the requested number of rows, then an ``Error`` will be raised.
Defaults to ``False``.
"""

DETERMINISTIC_TRANSFORM = False
Expand All @@ -36,9 +41,10 @@ class RegexGenerator(BaseTransformer):
null_transformer = None

def __init__(self, regex_format='[A-Za-z]{5}', missing_value_replacement=None,
model_missing_values=False):
model_missing_values=False, enforce_uniqueness=False):
self.missing_value_replacement = missing_value_replacement
self.model_missing_values = model_missing_values
self.enforce_uniqueness = enforce_uniqueness
self.regex_format = regex_format
self.data_length = None

Expand Down Expand Up @@ -103,7 +109,14 @@ def _reverse_transform(self, data):
sample_size = self.data_length

generator, size = strings_from_regex(self.regex_format)

if sample_size > size:
if self.enforce_uniqueness:
raise Error(
f'The regex is not able to generate {sample_size} unique values. '
f"Please use a different regex for column ('{self.get_input_column()}')."
)

warnings.warn(
f"The data has {sample_size} rows but the regex for '{self.get_input_column()}' "
f'can only create {size} unique values. Some values in '
Expand Down
85 changes: 81 additions & 4 deletions tests/unit/transformers/pii/test_anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,15 @@ def test_check_provider_function_raise_attribute_error(self):
with pytest.raises(Error, match=expected_message):
AnonymizedFaker.check_provider_function('TestProvider', 'TestFunction')

def test__function(self):
"""Test that `_function`.
def test__function_enforce_uniqueness_false(self):
"""Test that ``_function`` does not use ``faker.unique``.
The method `_function` should return a call from the `instance.faker.provider.<function>`.
The method ``_function`` should return a call from the
``instance.faker.provider.<function>``.
Mock:
- Instance of 'AnonymizedFaker'.
- Instance ``enforce_uniqueness`` set to `False`
- Faker instance.
- A function for the faker instance.
Expand All @@ -92,8 +94,12 @@ def test__function(self):
"""
# setup
instance = Mock()
instance.enforce_uniqueness = False
function = Mock()
unique_function = Mock()
function.return_value = 1

instance.faker.unique.number = unique_function
instance.faker.number = function
instance.function_name = 'number'
instance.function_kwargs = {'type': 'int'}
Expand All @@ -102,9 +108,49 @@ def test__function(self):
result = AnonymizedFaker._function(instance)

# Assert
unique_function.assert_not_called()
function.assert_called_once_with(type='int')
assert result == 1

def test__function_enforce_uniqueness_true(self):
    """Test that ``_function`` goes through ``faker.unique`` when uniqueness is enforced.

    With ``enforce_uniqueness`` set to ``True``, ``_function`` should return the result of
    calling ``instance.faker.unique.<function_name>``.

    Mock:
        - Instance of ``AnonymizedFaker`` with ``enforce_uniqueness`` set to ``True``.
        - Faker instance.
        - A plain function and a unique function for the faker instance.

    Output:
        - Return value of the mocked unique function.

    Side Effects:
        - ``faker.unique.<function_name>`` is called once with the provided kwargs.
        - The plain ``faker.<function_name>`` is never called.
    """
    # Setup
    plain_function = Mock()
    unique_function = Mock(return_value=1)

    instance = Mock()
    instance.enforce_uniqueness = True
    instance.function_name = 'number'
    instance.function_kwargs = {'type': 'int'}
    instance.faker.number = plain_function
    instance.faker.unique.number = unique_function

    # Run
    result = AnonymizedFaker._function(instance)

    # Assert
    assert result == 1
    unique_function.assert_called_once_with(type='int')
    plain_function.assert_not_called()

@patch('rdt.transformers.pii.anonymizer.importlib')
@patch('rdt.transformers.pii.anonymizer.warnings')
def test__check_locales(self, mock_warnings, mock_importlib):
Expand Down Expand Up @@ -172,6 +218,7 @@ def test___init__default(self, mock_check_provider_function, mock_faker):
assert not instance.model_missing_values
assert instance.locales is None
assert mock_faker.Faker.called_once_with(None)
assert instance.enforce_uniqueness is False

@patch('rdt.transformers.pii.anonymizer.faker')
@patch('rdt.transformers.pii.anonymizer.AnonymizedFaker.check_provider_function')
Expand Down Expand Up @@ -206,7 +253,8 @@ def test___init__custom(self, mock_check_provider_function, mock_faker):
function_kwargs={
'type': 'visa'
},
locales=['en_US', 'fr_FR']
locales=['en_US', 'fr_FR'],
enforce_uniqueness=True
)

# Assert
Expand All @@ -218,6 +266,7 @@ def test___init__custom(self, mock_check_provider_function, mock_faker):
assert not instance.model_missing_values
assert instance.locales == ['en_US', 'fr_FR']
assert mock_faker.Faker.called_once_with(['en_US', 'fr_FR'])
assert instance.enforce_uniqueness

def test___init__no_function_name(self):
"""Test the instantiation of the transformer with custom parameters.
Expand Down Expand Up @@ -476,6 +525,34 @@ def test__reverse_transform_models_missing_values(self):
expected_function_calls = [call(), call(), call()]
assert function.call_args_list == expected_function_calls

def test__reverse_transform_not_enough_unique_values(self):
    """Test ``_reverse_transform`` when not enough unique values can be generated.

    When ``enforce_uniqueness`` is ``True`` and ``instance._function`` cannot produce as
    many unique values as there are rows, ``_reverse_transform`` must raise an ``Error``.

    Setup:
        - Instance of ``AnonymizedFaker`` with ``enforce_uniqueness`` set to ``True``.

    Input:
        - ``pandas.Series`` representing a column.

    Side Effect:
        - Raises an ``Error``.
    """
    # Setup
    column = pd.Series(['a', 'b', 'c', 'd'])
    instance = AnonymizedFaker('misc', 'boolean', enforce_uniqueness=True)
    instance.columns = ['a']
    expected_message = re.escape(
        'The Faker function you specified is not able to generate 4 unique '
        "values. Please use a different Faker function for column ('a')."
    )

    # Run / Assert
    with pytest.raises(Error, match=expected_message):
        instance._reverse_transform(column)

def test___repr__default(self):
"""Test the ``__repr__`` method.
Expand Down
43 changes: 42 additions & 1 deletion tests/unit/transformers/test_text.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
"""Test Text Transformers."""

import re
from string import ascii_uppercase
from unittest.mock import Mock, patch

import numpy as np
import pandas as pd
import pytest

from rdt.errors import Error
from rdt.transformers.null import NullTransformer
from rdt.transformers.text import RegexGenerator

Expand Down Expand Up @@ -51,6 +54,7 @@ def test___init__default(self):
assert instance.missing_value_replacement is None
assert instance.model_missing_values is False
assert instance.regex_format == '[A-Za-z]{5}'
assert instance.enforce_uniqueness is False

def test___init__custom(self):
"""Test the default instantiation of the transformer.
Expand All @@ -68,14 +72,16 @@ def test___init__custom(self):
instance = RegexGenerator(
regex_format='[0-9]',
missing_value_replacement='AAAA',
model_missing_values=True
model_missing_values=True,
enforce_uniqueness=True
)

# Assert
assert instance.data_length is None
assert instance.missing_value_replacement == 'AAAA'
assert instance.model_missing_values
assert instance.regex_format == '[0-9]'
assert instance.enforce_uniqueness

def test_get_output_sdtypes(self):
"""Test the ``get_output_sdtypes``.
Expand Down Expand Up @@ -356,3 +362,38 @@ def test__reverse_transform_models_missing_values(self,
np.testing.assert_array_equal(result, expected_result)

mock_warnings.warn.assert_called_once_with(expected_warning_message)

@patch('rdt.transformers.text.strings_from_regex')
def test__reverse_transform_not_enough_unique_values(self, mock_strings_from_regex):
    """Test ``_reverse_transform`` when the regex cannot produce enough unique values.

    Validate that, with ``enforce_uniqueness`` set to ``True``, the ``_reverse_transform``
    method raises an ``Error`` when ``strings_from_regex`` reports fewer possible values
    than ``instance.data_length``.

    Setup:
        - Initialize a ``RegexGenerator`` instance with ``enforce_uniqueness`` to ``True``.
        - Set ``data_length`` to 6.
        - Initialize a generator.

    Mock:
        - Mock the ``strings_from_regex`` function to return a generator and a size of 2.

    Input:
        - An empty ``pandas.Series``.

    Side Effects:
        - An ``Error`` is being raised as not enough unique values can be generated.
    """
    # Setup
    instance = RegexGenerator('[A-Z]', enforce_uniqueness=True)
    instance.data_length = 6
    instance.columns = ['a']
    generator = AsciiGenerator(5)
    mock_strings_from_regex.return_value = (generator, 2)
    # Explicit dtype: a bare ``pd.Series()`` warns about its default dtype on pandas 1.x.
    columns_data = pd.Series(dtype=object)

    # Run / Assert
    error_msg = re.escape(
        'The regex is not able to generate 6 unique values. Please use a different regex '
        "for column ('a')."
    )
    with pytest.raises(Error, match=error_msg):
        instance._reverse_transform(columns_data)

0 comments on commit c3fcb99

Please sign in to comment.