Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for numpy 2.0.0 #2269

Merged
14 commits merged on Oct 30, 2024
15 changes: 9 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,16 @@ dependencies = [
'botocore>=1.31,<2.0.0',
'cloudpickle>=2.1.0',
'graphviz>=0.13.2',
"numpy>=1.21.0,<2.0.0;python_version<'3.10'",
"numpy>=1.23.3,<2.0.0;python_version>='3.10' and python_version<'3.12'",
"numpy>=1.26.0,<2.0.0;python_version>='3.12'",
"numpy>=1.21.0;python_version<'3.10'",
"numpy>=1.23.3;python_version>='3.10' and python_version<'3.12'",
"numpy>=1.26.0;python_version>='3.12'",
"pandas>=1.4.0;python_version<'3.11'",
"pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'",
"pandas>=2.1.1;python_version>='3.12'",
'tqdm>=4.29',
'copulas>=0.11.0',
'ctgan>=0.10.0',
'deepecho>=0.6.0',
'ctgan>=0.10.2',
'deepecho>=0.6.1',
'rdt>=1.12.3',
'sdmetrics>=0.16.0',
'platformdirs>=4.0',
Expand Down Expand Up @@ -207,7 +207,10 @@ select = [
# print statements
"T201",
# pandas-vet
"PD"
"PD",
# numpy 2.0
"NPY201"

]
ignore = [
# pydocstyle
Expand Down
23 changes: 17 additions & 6 deletions sdv/data_processing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@
)
from sdv.data_processing.datetime_formatter import DatetimeFormatter
from sdv.data_processing.errors import InvalidConstraintsError, NotFittedError
from sdv.data_processing.numerical_formatter import NumericalFormatter
from sdv.data_processing.numerical_formatter import INTEGER_BOUNDS, NumericalFormatter
from sdv.data_processing.utils import load_module_from_path
from sdv.errors import SynthesizerInputError, log_exc_stacktrace
from sdv.metadata.single_table import SingleTableMetadata

LOGGER = logging.getLogger(__name__)
INTEGER_BOUNDS = {str(key).lower(): value for key, value in INTEGER_BOUNDS.items()}


class DataProcessor:
Expand Down Expand Up @@ -561,26 +562,36 @@ def _create_config(self, data, columns_created_by_constraints):
)

if sdtype == 'id':
is_numeric = pd.api.types.is_numeric_dtype(data[column].dtype)
function_name = 'bothify'
column_dtype = data[column].dtype
is_numeric = pd.api.types.is_numeric_dtype(column_dtype)
if column_metadata.get('regex_format', False):
transformers[column] = self.create_regex_generator(
column, sdtype, column_metadata, is_numeric
)
sdtypes[column] = 'text'

else:
bothify_format = 'sdv-id-??????'
if is_numeric:
bothify_format = '#########'
function_name = 'random_int'
column_dtype = str(column_dtype).lower()
function_kwargs = {'min': 0, 'max': 9999999}
for key in INTEGER_BOUNDS:
if key in column_dtype:
_, max_value = INTEGER_BOUNDS[key]
function_kwargs = {'min': 0, 'max': max_value}

else:
function_kwargs = {'text': 'sdv-id-??????'}

cardinality_rule = None
if column in self._keys:
cardinality_rule = 'unique'

transformers[column] = AnonymizedFaker(
provider_name=None,
function_name='bothify',
function_kwargs={'text': bothify_format},
function_name=function_name,
function_kwargs=function_kwargs,
cardinality_rule=cardinality_rule,
)

Expand Down
8 changes: 4 additions & 4 deletions tests/benchmark/numpy_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,14 @@
}),
'np.string': pd.DataFrame({
'np.string': pd.Series([
np.string_('string1'),
np.string_('string2'),
np.string_('string3'),
np.bytes_('string1'),
np.bytes_('string2'),
np.bytes_('string3'),
[Review thread: pvk-developer marked this conversation as resolved.]
])
}),
'np.unicode': pd.DataFrame({
'np.unicode': pd.Series(
[np.unicode_('unicode1'), np.unicode_('unicode2'), np.unicode_('unicode3')],
[np.str_('unicode1'), np.str_('unicode2'), np.str_('unicode3')],
dtype='string',
)
}),
Expand Down
50 changes: 25 additions & 25 deletions tests/integration/single_table/test_copulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,31 +347,31 @@ def test_numerical_columns_gets_pii():

# Assert
expected_sampled = pd.DataFrame({
'id': {
0: 807994768,
1: 746439230,
2: 201363792,
3: 364823003,
4: 726973888,
5: 693331380,
6: 795819284,
7: 607278621,
8: 783746695,
9: 162118876,
},
'city': {
0: 'Danielfort',
1: 'Glendaside',
2: 'Port Jenniferchester',
3: 'Port Susan',
4: 'West Michellemouth',
5: 'West Jason',
6: 'Ryanfort',
7: 'West Stephenland',
8: 'Davidland',
9: 'Port Christopher',
},
'numerical': {0: 22, 1: 24, 2: 22, 3: 23, 4: 22, 5: 24, 6: 23, 7: 24, 8: 24, 9: 24},
'id': [
1089619006166876142,
8373046707753416652,
9070705361670139280,
7227045982112645011,
3461931576753619633,
1005734164466301683,
3312031189447929384,
82456842876428117,
1819741328868365520,
8019169766233150107,
],
'city': [
'Danielfort',
'Glendaside',
'Port Jenniferchester',
'Port Susan',
'West Michellemouth',
'West Jason',
'Ryanfort',
'West Stephenland',
'Davidland',
'Port Christopher',
],
'numerical': [22, 24, 22, 23, 22, 24, 23, 24, 24, 24],
})
pd.testing.assert_frame_equal(expected_sampled, sampled)

Expand Down
39 changes: 29 additions & 10 deletions tests/unit/data_processing/test_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1137,7 +1137,9 @@ def test__create_config(self):
'first_name': ['John', 'Doe', 'Johanna'],
'id': ['ID_001', 'ID_002', 'ID_003'],
'id_no_regex': ['ID_001', 'ID_002', 'ID_003'],
'id_numeric': [0, 1, 2],
'id_numeric_int8': pd.Series([1, 2, 3], dtype='Int8'),
'id_numeric_int16': pd.Series([1, 2, 3], dtype='Int16'),
'id_numeric_int32': pd.Series([1, 2, 3], dtype='Int32'),
'id_column': ['ID_999', 'ID_999', 'ID_007'],
'date': ['2021-02-01', '2022-03-05', '2023-01-31'],
'unknown': ['a', 'b', 'c'],
Expand All @@ -1151,9 +1153,9 @@ def test__create_config(self):
dp.create_anonymized_transformer.return_value = 'AnonymizedFaker'
dp.create_regex_generator.return_value = 'RegexGenerator'
dp.metadata.primary_key = 'id'
dp.metadata.alternate_keys = ['id_no_regex', 'id_numeric']
dp.metadata.alternate_keys = ['id_no_regex', 'id_numeric_int8']
dp._primary_key = 'id'
dp._keys = ['id', 'id_no_regex', 'id_numeric']
dp._keys = ['id', 'id_no_regex', 'id_numeric_int8']
dp.metadata.columns = {
'int': {'sdtype': 'numerical'},
'float': {'sdtype': 'numerical'},
Expand All @@ -1163,7 +1165,9 @@ def test__create_config(self):
'first_name': {'sdtype': 'first_name'},
'id': {'sdtype': 'id', 'regex_format': 'ID_\\d{3}[0-9]'},
'id_no_regex': {'sdtype': 'id'},
'id_numeric': {'sdtype': 'id'},
'id_numeric_int8': {'sdtype': 'id'},
'id_numeric_int16': {'sdtype': 'id'},
'id_numeric_int32': {'sdtype': 'id'},
'id_column': {'sdtype': 'id'},
'date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
'unknown': {'sdtype': 'unknown'},
Expand All @@ -1188,7 +1192,9 @@ def test__create_config(self):
'first_name': 'pii',
'id': 'text',
'id_no_regex': 'text',
'id_numeric': 'text',
'id_numeric_int8': 'text',
'id_numeric_int16': 'text',
'id_numeric_int32': 'text',
'id_column': 'text',
'date': 'datetime',
'unknown': 'pii',
Expand Down Expand Up @@ -1236,11 +1242,24 @@ def test__create_config(self):
assert id_no_regex_transformer.function_kwargs == {'text': 'sdv-id-??????'}
assert id_no_regex_transformer.cardinality_rule == 'unique'

id_numeric_transformer = config['transformers']['id_numeric']
assert isinstance(id_numeric_transformer, AnonymizedFaker)
assert id_numeric_transformer.function_name == 'bothify'
assert id_numeric_transformer.function_kwargs == {'text': '#########'}
assert id_numeric_transformer.cardinality_rule == 'unique'
id_numeric_int_8_transformer = config['transformers']['id_numeric_int8']
assert isinstance(id_numeric_int_8_transformer, AnonymizedFaker)
assert id_numeric_int_8_transformer.function_name == 'random_int'
assert id_numeric_int_8_transformer.function_kwargs == {'min': 0, 'max': 127}
assert id_numeric_int_8_transformer.cardinality_rule == 'unique'

id_numeric_int_16_transformer = config['transformers']['id_numeric_int16']
assert isinstance(id_numeric_int_16_transformer, AnonymizedFaker)
assert id_numeric_int_16_transformer.function_name == 'random_int'
assert id_numeric_int_16_transformer.function_kwargs == {'min': 0, 'max': 32767}

id_numeric_int_32_transformer = config['transformers']['id_numeric_int32']
assert isinstance(id_numeric_int_32_transformer, AnonymizedFaker)
assert id_numeric_int_32_transformer.function_name == 'random_int'
assert id_numeric_int_32_transformer.function_kwargs == {
'min': 0,
'max': 2147483647,
}

id_column_transformer = config['transformers']['id_column']
assert isinstance(id_column_transformer, AnonymizedFaker)
Expand Down
Loading