-
Notifications
You must be signed in to change notification settings - Fork 24
/
null.py
170 lines (128 loc) · 5.83 KB
/
null.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
"""Transformer for data that contains Null values."""
import logging
import numpy as np
import pandas as pd
from rdt.errors import TransformerInputError
LOGGER = logging.getLogger(__name__)
class NullTransformer:
    """Transformer for data that contains Null values.

    Args:
        missing_value_replacement (object or None):
            Indicate what to do with the null values. If an integer, float or string is given,
            replace them with the given value. If the strings ``'mean'`` or ``'mode'`` are given,
            replace them with the corresponding aggregation (``'mean'`` only works for numerical
            values). If ``None`` is given, do not replace them. Defaults to ``None``.
        missing_value_generation (str or None):
            The way missing values are being handled. There are three strategies:

                * ``random``: Randomly generates missing values based on the percentage of
                  missing values.
                * ``from_column``: Creates a binary column that describes whether the original
                  value was missing. Then use it to recreate missing values.
                * ``None``: Do nothing with the missing values on the reverse transform. Simply
                  pass whatever data we get through.

            Defaults to ``'random'``.

    Raises:
        TransformerInputError:
            If ``missing_value_generation`` is not ``None``, ``'from_column'`` or ``'random'``.
    """

    # Whether the fitted data contained any null value (set by ``fit``).
    nulls = None
    _missing_value_generation = None
    _missing_value_replacement = None
    # Fraction of null values seen during ``fit``; only set for the 'random' strategy.
    _null_percentage = None

    def __init__(self, missing_value_replacement=None, missing_value_generation='random'):
        self._missing_value_replacement = missing_value_replacement
        if missing_value_generation not in (None, 'from_column', 'random'):
            raise TransformerInputError(
                "'missing_value_generation' must be one of the following values: "
                "None, 'from_column' or 'random'."
            )

        self._missing_value_generation = missing_value_generation

    def models_missing_values(self):
        """Indicate whether this transformer creates a null column on transform.

        Returns:
            bool:
                Whether a null column is created on transform.
        """
        return self._missing_value_generation == 'from_column'

    def _get_missing_value_replacement(self, data):
        """Get the fill value to use for the given data.

        When the configured replacement is ``'mean'`` or ``'mode'`` but the data only
        contains NaNs, no aggregation can be computed: a message is logged and ``0``
        is used as the fill value instead.

        Args:
            data (pd.Series):
                The data that is being transformed.

        Returns:
            object:
                The fill value that needs to be used, or ``None`` if no replacement
                is configured.
        """
        replacement = self._missing_value_replacement
        if replacement is None:
            return None

        if replacement in {'mean', 'mode'} and pd.isna(data).all():
            msg = (
                f"'missing_value_replacement' cannot be set to '{self._missing_value_replacement}'"
                ' when the provided data only contains NaNs. Using 0 instead.'
            )
            LOGGER.info(msg)
            return 0

        if replacement == 'mean':
            return data.mean()

        if replacement == 'mode':
            return data.mode(dropna=True)[0]

        return replacement

    def fit(self, data):
        """Fit the transformer to the data.

        Evaluate if the transformer has to create the null column or not.

        Note that fitting overwrites the configured ``missing_value_replacement`` with
        the resolved fill value, and resets ``missing_value_generation`` to ``None`` when
        ``'from_column'`` was requested but the data has no missing values.

        Args:
            data (pandas.Series):
                Data to transform.
        """
        null_values = None
        if self._missing_value_generation is not None:
            null_values = data.isna().to_numpy()
            self.nulls = null_values.any()

        self._missing_value_replacement = self._get_missing_value_replacement(data)
        if not self.nulls and self.models_missing_values():
            # Nothing to model: skip the indicator column altogether.
            self._missing_value_generation = None
            guidance_message = (
                f'Guidance: There are no missing values in column {data.name}. '
                'Extra column not created.'
            )
            LOGGER.info(guidance_message)

        if self._missing_value_generation == 'random':
            total = len(data)
            # Guard against 0 / 0 (NaN plus a RuntimeWarning) when fitting empty data.
            self._null_percentage = null_values.sum() / total if total else 0.0

    def transform(self, data):
        """Replace null values with the indicated ``missing_value_replacement``.

        If required, create the null indicator column.

        Args:
            data (pandas.Series or numpy.ndarray):
                Data to transform.

        Returns:
            numpy.ndarray
        """
        if not isinstance(data, pd.Series):
            # The pandas API is used below, so wrap array-like input first.
            data = pd.Series(data)

        if self._missing_value_generation is None:
            return data.to_numpy()

        isna = data.isna()
        if isna.any() and self._missing_value_replacement is not None:
            data = data.fillna(self._missing_value_replacement)

        if self._missing_value_generation == 'from_column':
            # Second column flags (as 1.0 / 0.0) which values were originally missing.
            return pd.concat([data, isna.astype(np.float64)], axis=1).to_numpy()

        return data.to_numpy()

    def reverse_transform(self, data):
        """Restore null values to the data.

        If a null indicator column was created during fit, use it as a reference.
        Otherwise, randomly replace values with ``np.nan``. The percentage of values
        that will be replaced is the percentage of null values seen in the fitted data.

        Args:
            data (numpy.ndarray):
                Data to transform.

        Returns:
            pandas.Series
        """
        data = data.copy()
        isna = None
        if self._missing_value_generation == 'from_column':
            if self.nulls:
                isna = data[:, 1] > 0.5

            data = data[:, 0]

        elif self.nulls:
            isna = np.random.random((len(data), )) < self._null_percentage

        data = pd.Series(data)

        if self.nulls and isna is not None and isna.any():
            dtype = data.dtype
            if isinstance(dtype, np.dtype):
                # Integer and boolean numpy dtypes cannot hold NaN. Upcast explicitly
                # instead of relying on the deprecated implicit upcast on assignment.
                if dtype.kind in 'iu':
                    data = data.astype(np.float64)
                elif dtype.kind == 'b':
                    data = data.astype(object)

            data.loc[isna] = np.nan

        return data