Skip to content

Commit

Permalink
Update NullTransformer to make it user friendly (#384)
Browse files Browse the repository at this point in the history
  • Loading branch information
pvk-developer authored and amontanez24 committed Mar 7, 2022
1 parent 381ded1 commit c1a3b3a
Show file tree
Hide file tree
Showing 17 changed files with 698 additions and 620 deletions.
39 changes: 38 additions & 1 deletion rdt/performance/performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,42 @@

DATASET_SIZES = [1000, 10000, 100000]

# Names of the transformers that are configured with
# ``missing_value_replacement='mean'`` and ``model_missing_values=True``.
# The order here is the insertion order of the resulting dictionary.
_MEAN_REPLACEMENT_TRANSFORMERS = (
    'DatetimeTransformer',
    'DatetimeRoundedTransformer',
    'NumericalTransformer',
    'NumericalRoundedBoundedTransformer',
    'NumericalBoundedTransformer',
    'GaussianCopulaTransformer',
    'BayesGMMTransformer',
)

# Additional keyword arguments for transformers, keyed by transformer name.
# ``BooleanTransformer`` is the only entry that uses a literal replacement
# value (``-1``); every other entry gets its own, independent dict of the
# shared ``'mean'`` configuration.
TRANSFORMER_ARGS = {
    'BooleanTransformer': {
        'missing_value_replacement': -1,
        'model_missing_values': True,
    },
    **{
        transformer_name: {
            'missing_value_replacement': 'mean',
            'model_missing_values': True,
        }
        for transformer_name in _MEAN_REPLACEMENT_TRANSFORMERS
    },
}


def _get_dataset_sizes(data_type):
"""Get a list of (fit_size, transform_size) for each dataset generator.
Expand Down Expand Up @@ -48,7 +84,8 @@ def evaluate_transformer_performance(transformer, dataset_generator, verbose=Fal
pandas.DataFrame:
The performance test results.
"""
transformer_instance = transformer()
transformer_args = TRANSFORMER_ARGS.get(transformer.__name__, {})
transformer_instance = transformer(**transformer_args)

sizes = _get_dataset_sizes(dataset_generator.DATA_TYPE)

Expand Down
10 changes: 5 additions & 5 deletions rdt/transformers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,14 @@ def get_next_transformers(self):
"""
return self._add_prefix(self.NEXT_TRANSFORMERS)

def get_input_columns(self):
"""Return list of input column names for transformer.
def get_input_column(self):
"""Return input column name for transformer.
Returns:
list:
Input column names.
str:
Input column name.
"""
return self.columns
return self.columns[0]

def get_output_columns(self):
"""Return list of column names created in ``transform``.
Expand Down
32 changes: 18 additions & 14 deletions rdt/transformers/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,16 @@ class BooleanTransformer(BaseTransformer):
Null values are replaced using a ``NullTransformer``.
Args:
nan (int or None):
Replace null values with the given value. If ``None``, do not replace them.
Defaults to ``-1``.
null_column (bool):
Whether to create a new column to indicate which values were null or not.
If ``None``, only create a new column when the fit data contains null values.
If ``True``, always create the new column whether there are null values or not.
If ``False``, do not create the new column.
missing_value_replacement (object or None):
Indicate what to do with the null values. If an object is given, replace them
with the given value. If the string ``'mode'`` is given, replace them with the
most common value. If ``None`` is given, do not replace them.
Defaults to ``None``.
model_missing_values (bool):
Whether to create a new column to indicate which values were null or not. The column
will be created only if there are null values. If ``True``, create the new column if
there are null values. If ``False``, do not create the new column even if there
are null values. Defaults to ``False``.
"""

INPUT_TYPE = 'boolean'
Expand All @@ -33,9 +34,9 @@ class BooleanTransformer(BaseTransformer):

null_transformer = None

def __init__(self, nan=-1, null_column=None):
self.nan = nan
self.null_column = null_column
def __init__(self, missing_value_replacement=None, model_missing_values=False):
self.missing_value_replacement = missing_value_replacement
self.model_missing_values = model_missing_values

def get_output_types(self):
"""Return the output types returned by this transformer.
Expand All @@ -47,7 +48,7 @@ def get_output_types(self):
output_types = {
'value': 'float',
}
if self.null_transformer and self.null_transformer.creates_null_column():
if self.null_transformer and self.null_transformer.models_missing_values():
output_types['is_null'] = 'float'

return self._add_prefix(output_types)
Expand All @@ -59,7 +60,10 @@ def _fit(self, data):
data (pandas.Series):
Data to fit to.
"""
self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
self.null_transformer = NullTransformer(
self.missing_value_replacement,
self.model_missing_values
)
self.null_transformer.fit(data)

def _transform(self, data):
Expand Down Expand Up @@ -92,7 +96,7 @@ def _reverse_transform(self, data):
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if self.nan is not None:
if self.missing_value_replacement is not None:
data = self.null_transformer.reverse_transform(data)

if isinstance(data, np.ndarray):
Expand Down
53 changes: 28 additions & 25 deletions rdt/transformers/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,16 @@ class DatetimeTransformer(BaseTransformer):
Null values are replaced using a ``NullTransformer``.
Args:
nan (int, str or None):
Indicate what to do with the null values. If an integer is given, replace them
missing_value_replacement (object or None):
Indicate what to do with the null values. If an object is given, replace them
with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
them with the corresponding aggregation. If ``None`` is given, do not replace them.
Defaults to ``None``.
null_column (bool):
Whether to create a new column to indicate which values were null or not.
If ``None``, only create a new column when the data contains null values.
If ``True``, always create the new column whether there are null values or not.
If ``False``, do not create the new column.
Defaults to ``None``.
model_missing_values (bool):
Whether to create a new column to indicate which values were null or not. The column
will be created only if there are null values. If ``True``, create the new column if
there are null values. If ``False``, do not create the new column even if there
are null values. Defaults to ``False``.
strip_constant (bool):
Whether to optimize the output values by finding the smallest time unit that
is not zero on the training datetimes and dividing the generated numerical
Expand All @@ -45,9 +44,10 @@ class DatetimeTransformer(BaseTransformer):
null_transformer = None
divider = None

def __init__(self, nan='mean', null_column=None, strip_constant=False, datetime_format=None):
self.nan = nan
self.null_column = null_column
def __init__(self, missing_value_replacement=None, model_missing_values=False,
strip_constant=False, datetime_format=None):
self.missing_value_replacement = missing_value_replacement
self.model_missing_values = model_missing_values
self.strip_constant = strip_constant
self.datetime_format = datetime_format

Expand All @@ -58,7 +58,7 @@ def is_composition_identity(self):
bool:
Whether or not transforming and then reverse transforming returns the input data.
"""
if self.null_transformer and not self.null_transformer.creates_null_column():
if self.null_transformer and not self.null_transformer.models_missing_values():
return False

return self.COMPOSITION_IS_IDENTITY
Expand All @@ -73,7 +73,7 @@ def get_output_types(self):
output_types = {
'value': 'float',
}
if self.null_transformer and self.null_transformer.creates_null_column():
if self.null_transformer and self.null_transformer.models_missing_values():
output_types['is_null'] = 'float'

return self._add_prefix(output_types)
Expand Down Expand Up @@ -124,7 +124,10 @@ def _fit(self, data):
Data to fit the transformer to.
"""
transformed = self._transform_helper(data)
self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
self.null_transformer = NullTransformer(
self.missing_value_replacement,
self.model_missing_values
)
self.null_transformer.fit(transformed)

def _transform(self, data):
Expand Down Expand Up @@ -153,7 +156,7 @@ def _reverse_transform(self, data):
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if self.nan is not None:
if self.missing_value_replacement is not None:
data = self.null_transformer.reverse_transform(data)

if isinstance(data, np.ndarray) and (data.ndim == 2):
Expand All @@ -180,18 +183,18 @@ class DatetimeRoundedTransformer(DatetimeTransformer):
This class behaves exactly as the ``DatetimeTransformer`` with ``strip_constant=True``.
Args:
nan (int, str or None):
Indicate what to do with the null values. If an integer is given, replace them
missing_value_replacement (object or None):
Indicate what to do with the null values. If an object is given, replace them
with the given value. If the strings ``'mean'`` or ``'mode'`` are given, replace
them with the corresponding aggregation. If ``None`` is given, do not replace them.
Defaults to ``None``.
null_column (bool):
Whether to create a new column to indicate which values were null or not.
If ``None``, only create a new column when the data contains null values.
If ``True``, always create the new column whether there are null values or not.
If ``False``, do not create the new column.
Defaults to ``None``.
model_missing_values (bool):
Whether to create a new column to indicate which values were null or not. The column
will be created only if there are null values. If ``True``, create the new column if
there are null values. If ``False``, do not create the new column even if there
are null values. Defaults to ``False``.
"""

def __init__(self, nan='mean', null_column=None):
super().__init__(nan=nan, null_column=null_column, strip_constant=True)
def __init__(self, missing_value_replacement=None, model_missing_values=False):
super().__init__(missing_value_replacement=missing_value_replacement,
model_missing_values=model_missing_values, strip_constant=True)
Loading

0 comments on commit c1a3b3a

Please sign in to comment.