Skip to content

Commit

Permalink
Update datetime transformer (#230)
Browse files Browse the repository at this point in the history
* Update datetime transformer

* Update datetime transformer to match the new baseclass transformer

* Fix bugs

* Disable duplicate code lint error + fix readme.md error

* Fix lint
  • Loading branch information
fealho authored Sep 24, 2021
1 parent 3be22ab commit 3a83496
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 19 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ Before being able to transform the data, we need the transformer to learn from i
We will do this by calling its `fit` method passing the column that we want to transform.

```python3
transformer.fit(data['3_datetime'])
transformer.fit(data, columns=['3_datetime'])
```

### 4. Transform the data
Expand All @@ -113,7 +113,7 @@ Once the transformer is fitted, we can pass the data again to its `transform` me
to get the transformed version of the data.

```python3
transformed = transformer.transform(data['3_datetime'])
transformed = transformer.transform(data)
```

The output will be a `numpy.ndarray` with two columns, one with the datetimes transformed
Expand Down
47 changes: 41 additions & 6 deletions rdt/transformers/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ class DatetimeTransformer(BaseTransformer):
are zero on the lower time units.
"""

INPUT_TYPE = 'datetime'
DETERMINISTIC_TRANSFORM = True
DETERMINISTIC_REVERSE = True
COMPOSITION_IS_IDENTITY = True

null_transformer = None
divider = None

Expand All @@ -42,6 +47,33 @@ def __init__(self, nan='mean', null_column=None, strip_constant=False):
self.null_column = null_column
self.strip_constant = strip_constant

def is_composition_identity(self):
    """Return whether transform followed by reverse transform reproduces the input.

    If a null transformer has been set and it does not create a null
    column, ``False`` is returned; otherwise the class-level
    ``COMPOSITION_IS_IDENTITY`` flag is returned unchanged.

    Returns:
        bool:
            Whether or not transforming and then reverse transforming returns the input data.
    """
    # NOTE(review): presumably null positions cannot be recovered on reverse
    # when no null column was produced -- confirm against NullTransformer.
    if self.null_transformer and not self.null_transformer.creates_null_column():
        return False

    return self.COMPOSITION_IS_IDENTITY

def get_output_types(self):
    """Return the output types supported by the transformer.

    The mapping always contains ``'value'``; ``'is_null'`` is added only
    when a null transformer has been set and it creates a null column.
    The mapping is passed through ``self._add_prefix`` before being returned.

    Returns:
        dict:
            Mapping from the transformed column names to supported data types.
    """
    output_types = {
        'value': 'float',
    }
    if self.null_transformer and self.null_transformer.creates_null_column():
        output_types['is_null'] = 'float'

    return self._add_prefix(output_types)

def _find_divider(self, transformed):
self.divider = 1
multipliers = [10] * 9 + [60, 60, 24]
Expand All @@ -52,7 +84,7 @@ def _find_divider(self, transformed):

self.divider = candidate

def _transform(self, datetimes):
def _transform_helper(self, datetimes):
"""Transform datetime values to integer."""
nulls = datetimes.isnull()
integers = pd.to_numeric(datetimes, errors='coerce').values.astype(np.float64)
Expand All @@ -65,7 +97,7 @@ def _transform(self, datetimes):

return transformed

def fit(self, data):
def _fit(self, data):
"""Fit the transformer to the data.
Args:
Expand All @@ -75,11 +107,11 @@ def fit(self, data):
if isinstance(data, np.ndarray):
data = pd.Series(data)

transformed = self._transform(data)
transformed = self._transform_helper(data)
self.null_transformer = NullTransformer(self.nan, self.null_column, copy=True)
self.null_transformer.fit(transformed)

def transform(self, data):
def _transform(self, data):
"""Transform datetime values to float values.
Args:
Expand All @@ -92,11 +124,11 @@ def transform(self, data):
if isinstance(data, np.ndarray):
data = pd.Series(data)

data = self._transform(data)
data = self._transform_helper(data)

return self.null_transformer.transform(data)

def reverse_transform(self, data):
def _reverse_transform(self, data):
"""Convert float values back to datetimes.
Args:
Expand All @@ -106,6 +138,9 @@ def reverse_transform(self, data):
Returns:
pandas.Series
"""
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if self.nan is not None:
data = self.null_transformer.reverse_transform(data)

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,4 @@ max-attributes = 11
ignore-comments = yes
ignore-docstrings = yes
ignore-imports = yes
disable = R0903, R0913, R0914, C0209, W0223, W0221, W0237
disable = R0801, R0903, R0913, R0914, C0209, W0223, W0221, W0237
20 changes: 10 additions & 10 deletions tests/unit/transformers/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ def test_no_strip(self):
data = pd.to_datetime(pd.Series([None, '1996-10-17', '1965-05-23']))

# Run
dtt.fit(data.copy().to_numpy())
transformed = dtt.transform(data.copy().to_numpy())
reverted = dtt.reverse_transform(transformed)
dtt._fit(data.copy().to_numpy())
transformed = dtt._transform(data.copy().to_numpy())
reverted = dtt._reverse_transform(transformed)

# Asserts
expect_trans = np.array([
Expand All @@ -29,9 +29,9 @@ def test_strip(self):
data = pd.to_datetime(pd.Series([None, '1996-10-17', '1965-05-23']))

# Run
dtt.fit(data.copy().to_numpy())
transformed = dtt.transform(data.copy().to_numpy())
reverted = dtt.reverse_transform(transformed)
dtt._fit(data.copy().to_numpy())
transformed = dtt._transform(data.copy().to_numpy())
reverted = dtt._reverse_transform(transformed)

# Asserts
expect_trans = np.array([
Expand All @@ -45,20 +45,20 @@ def test_strip(self):
def test_reverse_transform_all_none(self):
dt = pd.to_datetime(['2020-01-01'])
dtt = DatetimeTransformer(strip_constant=True)
dtt.fit(dt)
dtt._fit(dt)

output = dtt.reverse_transform(np.array([None]))
output = dtt._reverse_transform(np.array([None]))

expected = pd.to_datetime(['NaT'])
pd.testing.assert_series_equal(output.to_series(), expected.to_series())

def test_reverse_transform_2d_ndarray(self):
dt = pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01'])
dtt = DatetimeTransformer(strip_constant=True)
dtt.fit(dt)
dtt._fit(dt)

transformed = np.array([[18262.], [18293.], [18322.]])
output = dtt.reverse_transform(transformed)
output = dtt._reverse_transform(transformed)

expected = pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01'])
pd.testing.assert_series_equal(output.to_series(), expected.to_series())

0 comments on commit 3a83496

Please sign in to comment.