Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update NullTransformer to make it user friendly #384

Merged
merged 21 commits into from
Feb 10, 2022
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion rdt/performance/performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,42 @@

DATASET_SIZES = [1000, 10000, 100000]

# Additional arguments for transformers
TRANSFORMER_ARGS = {
    # Keyed by transformer class name (``transformer.__name__``). Each value
    # is the keyword-argument dict that ``evaluate_transformer_performance``
    # unpacks into that transformer's constructor. Transformers that are not
    # listed here are instantiated with no arguments.
    #
    # Every entry is a separate dict object on purpose: sharing one dict
    # between keys would let a mutation for one transformer leak into others.
    'BooleanTransformer': {
        'missing_value_replacement': -1,
        'model_missing_values': True,
    },
    'DatetimeTransformer': {
        'missing_value_replacement': 'mean',
        'model_missing_values': True,
    },
    'DatetimeRoundedTransformer': {
        'missing_value_replacement': 'mean',
        'model_missing_values': True,
    },
    'NumericalTransformer': {
        'missing_value_replacement': 'mean',
        'model_missing_values': True,
    },
    'NumericalRoundedBoundedTransformer': {
        'missing_value_replacement': 'mean',
        'model_missing_values': True,
    },
    'NumericalBoundedTransformer': {
        'missing_value_replacement': 'mean',
        'model_missing_values': True,
    },
    'GaussianCopulaTransformer': {
        'missing_value_replacement': 'mean',
        'model_missing_values': True,
    },
    'BayesGMMTransformer': {
        'missing_value_replacement': 'mean',
        'model_missing_values': True,
    },
}


def _get_dataset_sizes(data_type):
"""Get a list of (fit_size, transform_size) for each dataset generator.
Expand Down Expand Up @@ -50,7 +86,8 @@ def evaluate_transformer_performance(transformer, dataset_generator, verbose=Fal
pandas.DataFrame:
The performance test results.
"""
transformer_instance = transformer()
transformer_args = TRANSFORMER_ARGS.get(transformer.__name__, {})
transformer_instance = transformer(**transformer_args)

sizes = _get_dataset_sizes(dataset_generator.DATA_TYPE)

Expand Down
1 change: 1 addition & 0 deletions rdt/performance/profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def profile_transformer(transformer, dataset_generator, transform_size, fit_size
fit_dataset = pd.Series(dataset_generator.generate(fit_size))
replace = transform_size > fit_size
transform_dataset = fit_dataset.sample(transform_size, replace=replace)
transformer.columns = ['column']

try:
fit_time = _profile_time(transformer, 'fit', fit_dataset, copy=True)
Expand Down
10 changes: 5 additions & 5 deletions rdt/transformers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,14 @@ def get_next_transformers(self):
"""
return self._add_prefix(self.NEXT_TRANSFORMERS)

def get_input_columns(self):
"""Return list of input column names for transformer.
def get_input_column(self):
"""Return input column name for transformer.

Returns:
list:
Input column names.
str:
Input column name.
"""
return self.columns
return self.columns[0]

def get_output_columns(self):
"""Return list of column names created in ``transform``.
Expand Down
21 changes: 12 additions & 9 deletions rdt/transformers/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ class BooleanTransformer(BaseTransformer):
Null values are replaced using a ``NullTransformer``.

Args:
nan (int or None):
missing_value_replacement (int or None):
Replace null values with the given value. If ``None``, do not replace them.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd update this to match what's in the NullTransformer as well (ie. explain mode and so on)

Defaults to ``None``.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't true anymore right?

null_column (bool):
model_missing_values (bool):
Whether to create a new column to indicate which values were null or not.
If ``None``, only create a new column when the fit data contains null values.
If ``True``, always create the new column whether there are null values or not.
Expand All @@ -33,9 +33,9 @@ class BooleanTransformer(BaseTransformer):

null_transformer = None

def __init__(self, nan=-1, null_column=None):
self.nan = nan
self.null_column = null_column
def __init__(self, missing_value_replacement=None, model_missing_values=False):
self.missing_value_replacement = missing_value_replacement
self.model_missing_values = model_missing_values

def get_output_types(self):
"""Return the output types returned by this transformer.
Expand All @@ -47,7 +47,7 @@ def get_output_types(self):
output_types = {
'value': 'float',
}
if self.null_transformer and self.null_transformer.creates_null_column():
if self.null_transformer and self.null_transformer.creates_model_missing_values():
output_types['is_null'] = 'float'

return self._add_prefix(output_types)
Expand All @@ -59,8 +59,11 @@ def _fit(self, data):
data (pandas.Series):
Data to fit to.
"""
self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
self.null_transformer.fit(data)
self.null_transformer = NullTransformer(
self.missing_value_replacement,
self.model_missing_values
)
self.null_transformer.fit(data, self.get_input_column())

def _transform(self, data):
"""Transform boolean to float.
Expand Down Expand Up @@ -92,7 +95,7 @@ def _reverse_transform(self, data):
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if self.nan is not None:
if self.missing_value_replacement is not None:
data = self.null_transformer.reverse_transform(data)

if isinstance(data, np.ndarray):
Expand Down
33 changes: 19 additions & 14 deletions rdt/transformers/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ class DatetimeTransformer(BaseTransformer):
Null values are replaced using a ``NullTransformer``.

Args:
nan (int, str or None):
missing_value_replacement (int, str or None):
Indicate what to do with the null values. If an integer is given, replace them
with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
them with the corresponding aggregation. If ``None`` is given, do not replace them.
Defaults to ``None``.
null_column (bool):
model_missing_values (bool):
Whether to create a new column to indicate which values were null or not.
If ``None``, only create a new column when the data contains null values.
If ``True``, always create the new column whether there are null values or not.
Expand All @@ -45,9 +45,10 @@ class DatetimeTransformer(BaseTransformer):
null_transformer = None
divider = None

def __init__(self, nan='mean', null_column=None, strip_constant=False, datetime_format=None):
self.nan = nan
self.null_column = null_column
def __init__(self, missing_value_replacement=None, model_missing_values=False,
strip_constant=False, datetime_format=None):
self.missing_value_replacement = missing_value_replacement
self.model_missing_values = model_missing_values
self.strip_constant = strip_constant
self.datetime_format = datetime_format

Expand All @@ -58,7 +59,7 @@ def is_composition_identity(self):
bool:
Whether or not transforming and then reverse transforming returns the input data.
"""
if self.null_transformer and not self.null_transformer.creates_null_column():
if self.null_transformer and not self.null_transformer.creates_model_missing_values():
return False

return self.COMPOSITION_IS_IDENTITY
Expand All @@ -73,7 +74,7 @@ def get_output_types(self):
output_types = {
'value': 'float',
}
if self.null_transformer and self.null_transformer.creates_null_column():
if self.null_transformer and self.null_transformer.creates_model_missing_values():
output_types['is_null'] = 'float'

return self._add_prefix(output_types)
Expand Down Expand Up @@ -124,8 +125,11 @@ def _fit(self, data):
Data to fit the transformer to.
"""
transformed = self._transform_helper(data)
self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
self.null_transformer.fit(transformed)
self.null_transformer = NullTransformer(
self.missing_value_replacement,
self.model_missing_values
)
self.null_transformer.fit(transformed, self.get_input_column())

def _transform(self, data):
"""Transform datetime values to float values.
Expand Down Expand Up @@ -153,7 +157,7 @@ def _reverse_transform(self, data):
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if self.nan is not None:
if self.missing_value_replacement is not None:
data = self.null_transformer.reverse_transform(data)

if isinstance(data, np.ndarray) and (data.ndim == 2):
Expand All @@ -180,18 +184,19 @@ class DatetimeRoundedTransformer(DatetimeTransformer):
This class behaves exactly as the ``DatetimeTransformer`` with ``strip_constant=True``.

Args:
nan (int, str or None):
missing_value_replacement (int, str or None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should put object everywhere.

Indicate what to do with the null values. If an integer is given, replace them
with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
them with the corresponding aggregation. If ``None`` is given, do not replace them.
Defaults to ``None``.
null_column (bool):
model_missing_values (bool):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Delete the None docstring.

Whether to create a new column to indicate which values were null or not.
If ``None``, only create a new column when the data contains null values.
If ``True``, always create the new column whether there are null values or not.
If ``False``, do not create the new column.
Defaults to ``False``.
"""

def __init__(self, nan='mean', null_column=None):
super().__init__(nan=nan, null_column=null_column, strip_constant=True)
def __init__(self, missing_value_replacement=None, model_missing_values=False):
super().__init__(missing_value_replacement=missing_value_replacement,
model_missing_values=model_missing_values, strip_constant=True)
Loading