Skip to content

refactor code to work with pandas 2.0 #660

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
2 changes: 1 addition & 1 deletion feature_engine/datetime/datetime_subtraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def _sub(self, dt_df: pd.DataFrame):
new_df[new_varnames] = (
dt_df[self.variables_]
.sub(dt_df[reference], axis=0)
.apply(lambda s: s / np.timedelta64(1, self.output_unit))
.div(np.timedelta64(1, self.output_unit).astype("timedelta64[ns]"))
)

if self.new_variables_names is not None:
Expand Down
2 changes: 1 addition & 1 deletion feature_engine/imputation/drop_missing_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def return_na_data(self, X: pd.DataFrame) -> pd.DataFrame:
idx = pd.isnull(X[self.variables_]).mean(axis=1) >= self.threshold
idx = idx[idx]
else:
idx = pd.isnull(X[self.variables_]).any(1)
idx = pd.isnull(X[self.variables_]).any(axis=1)
idx = idx[idx]

return X.loc[idx.index, :]
Expand Down
6 changes: 3 additions & 3 deletions feature_engine/transformation/reciprocal.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ class ReciprocalTransformer(BaseNumericalTransformer):
def __init__(
self, variables: Union[None, int, str, List[Union[str, int]]] = None
) -> None:

self.variables = _check_init_parameter_variables(variables)

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
Expand Down Expand Up @@ -152,8 +151,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:

# transform
# np.reciprocal uses integer division on integer dtypes (giving 0 for |x| > 1), so cast to float first
X.loc[:, self.variables_] = X.loc[:, self.variables_].astype("float")
X.loc[:, self.variables_] = np.reciprocal(X.loc[:, self.variables_])
X.loc[:, self.variables_] = np.reciprocal(
X.loc[:, self.variables_].astype("float")
)

return X

Expand Down
8 changes: 5 additions & 3 deletions feature_engine/variable_handling/_variable_type_checks.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings

import pandas as pd
from pandas.core.dtypes.common import is_categorical_dtype as is_categorical
from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime
Expand All @@ -6,7 +8,6 @@


def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool:

# check for datetime only if the object cannot be cast as numeric because,
# if it could, pd.to_datetime would convert it to datetime regardless
if is_object(column):
Expand All @@ -25,15 +26,16 @@ def _is_categories_num(column: pd.Series) -> bool:


def _is_convertible_to_dt(column: pd.Series) -> bool:
return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
with warnings.catch_warnings():
warnings.simplefilter("ignore")
return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))


def _is_convertible_to_num(column: pd.Series) -> bool:
return is_numeric(pd.to_numeric(column, errors="ignore"))


def _is_categorical_and_is_datetime(column: pd.Series) -> bool:

# check for datetime only if the object cannot be cast as numeric because,
# if it could, pd.to_datetime would convert it to datetime regardless
if is_object(column):
Expand Down
26 changes: 20 additions & 6 deletions tests/test_datetime/test_datetime_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ def test_extract_datetime_features_with_default_options(
df_datetime_transformed[
vars_non_dt + [var + feat for var in vars_dt for feat in feat_names_default]
],
check_dtype=False,
)


Expand All @@ -198,6 +199,7 @@ def test_extract_datetime_features_from_specified_variables(
+ ["datetime_range", "date_obj2", "time_obj"]
+ ["date_obj1" + feat for feat in feat_names_default]
],
check_dtype=False,
)

# multiple datetime variables
Expand All @@ -215,6 +217,7 @@ def test_extract_datetime_features_from_specified_variables(
for feat in feat_names_default
]
],
check_dtype=False,
)

# multiple datetime variables in different order than they appear in the df
Expand All @@ -232,6 +235,7 @@ def test_extract_datetime_features_from_specified_variables(
for feat in feat_names_default
]
],
check_dtype=False,
)

# datetime variable is index
Expand All @@ -251,12 +255,15 @@ def test_extract_datetime_features_from_specified_variables(
],
axis=1,
),
check_dtype=False,
)


def test_extract_all_datetime_features(df_datetime, df_datetime_transformed):
X = DatetimeFeatures(features_to_extract="all").fit_transform(df_datetime)
pd.testing.assert_frame_equal(X, df_datetime_transformed.drop(vars_dt, axis=1))
pd.testing.assert_frame_equal(
X, df_datetime_transformed.drop(vars_dt, axis=1), check_dtype=False
)


def test_extract_specified_datetime_features(df_datetime, df_datetime_transformed):
Expand All @@ -269,6 +276,7 @@ def test_extract_specified_datetime_features(df_datetime, df_datetime_transforme
vars_non_dt
+ [var + "_" + feat for var in vars_dt for feat in ["semester", "week"]]
],
check_dtype=False,
)

# different order than they appear in the glossary
Expand All @@ -281,6 +289,7 @@ def test_extract_specified_datetime_features(df_datetime, df_datetime_transforme
vars_non_dt
+ [var + "_" + feat for var in vars_dt for feat in ["hour", "day_of_week"]]
],
check_dtype=False,
)


Expand All @@ -290,7 +299,9 @@ def test_extract_features_from_categorical_variable(
cat_date = pd.DataFrame({"date_obj1": df_datetime["date_obj1"].astype("category")})
X = DatetimeFeatures(variables="date_obj1").fit_transform(cat_date)
pd.testing.assert_frame_equal(
X, df_datetime_transformed[["date_obj1" + feat for feat in feat_names_default]]
X,
df_datetime_transformed[["date_obj1" + feat for feat in feat_names_default]],
check_dtype=False,
)


Expand All @@ -311,6 +322,7 @@ def test_extract_features_from_different_timezones(
df_datetime_transformed[["time_obj_hour"]].apply(
lambda x: x.subtract(time_zones)
),
check_dtype=False,
)
exp_err_msg = (
"ValueError: variable(s) time_obj "
Expand Down Expand Up @@ -356,7 +368,7 @@ def test_extract_features_from_localized_tz_variables():
# transform
X = transformer.transform(tz_df)
df_expected = pd.DataFrame({"date_var_hour": [1, 2, 2, 2, 2, 3, 3]})
pd.testing.assert_frame_equal(X, df_expected)
pd.testing.assert_frame_equal(X, df_expected, check_dtype=False)

# when utc is True
transformer = DatetimeFeatures(features_to_extract=["hour"], utc=True).fit(tz_df)
Expand All @@ -372,7 +384,7 @@ def test_extract_features_from_localized_tz_variables():
# transform
X = transformer.transform(tz_df)
df_expected = pd.DataFrame({"date_var_hour": [5, 6, 6, 6, 6, 7, 7]})
pd.testing.assert_frame_equal(X, df_expected)
pd.testing.assert_frame_equal(X, df_expected, check_dtype=False)


def test_extract_features_without_dropping_original_variables(
Expand All @@ -399,6 +411,7 @@ def test_extract_features_without_dropping_original_variables(
],
axis=1,
),
check_dtype=False,
)


Expand Down Expand Up @@ -435,6 +448,7 @@ def test_extract_features_with_different_datetime_parsing_options(df_datetime):
pd.testing.assert_frame_equal(
X,
pd.DataFrame({"date_obj2_day_of_month": [10, 31, 30, 17]}),
check_dtype=False,
)

X = DatetimeFeatures(features_to_extract=["year"], yearfirst=True).fit_transform(
Expand All @@ -443,6 +457,7 @@ def test_extract_features_with_different_datetime_parsing_options(df_datetime):
pd.testing.assert_frame_equal(
X,
pd.DataFrame({"date_obj2_year": [2010, 2009, 1995, 2004]}),
check_dtype=False,
)


Expand All @@ -457,8 +472,7 @@ def test_get_feature_names_out(df_datetime, df_datetime_transformed):
transformer.get_feature_names_out(input_features=vars_dt)

with pytest.raises(ValueError):
transformer.get_feature_names_out(input_features=["date_obj1"])\

transformer.get_feature_names_out(input_features=["date_obj1"])
# default features from 1 variable
transformer = DatetimeFeatures(variables="date_obj1")
X = transformer.fit_transform(df_datetime)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -380,8 +380,10 @@ def test_multiple_windows(df_time):
X = df_time.copy()
num_vars = ["ambient_temp", "module_temp", "irradiation"]
tmp = X[num_vars].rolling(2).agg(["sum", "mean"]).shift(periods=15, freq="min")
tmp.columns = tmp.columns.droplevel()
X_tr = X.merge(tmp, left_index=True, right_index=True, how="left")
tmp = X[num_vars].rolling(3).agg(["sum", "mean"]).shift(periods=15, freq="min")
tmp.columns = tmp.columns.droplevel()
X_tr = X_tr.merge(tmp, left_index=True, right_index=True, how="left")
X_tr.columns = transformer.get_feature_names_out()

Expand All @@ -404,13 +406,15 @@ def test_multiple_windows(df_time):
.agg(["sum", "mean"])
.shift(freq="30min")
)
tmp.columns = tmp.columns.droplevel()
X_tr = X.merge(tmp, left_index=True, right_index=True, how="left")
tmp = (
X[["ambient_temp", "irradiation"]]
.rolling(3)
.agg(["sum", "mean"])
.shift(freq="30min")
)
tmp.columns = tmp.columns.droplevel()
X_tr = X_tr.merge(tmp, left_index=True, right_index=True, how="left")
X_tr.columns = transformer.get_feature_names_out()

Expand Down