import numpy as np
import pandas as pd
from pathlib import Path
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

Regression Analysis: Seasonal Effects with Sklearn Linear Regression

In this notebook, you will build a scikit-learn linear regression model that predicts Yen futures returns (computed from the "Settle" price) from the previous day's (lagged) Yen futures returns.

# Futures contract on the Yen-dollar exchange rate:
# This is the continuous chain of the futures contracts that are 1 month to expiration
#
# Load the CSV with the "Date" column as the index and parse it into datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where format inference is already the default behavior of
# `parse_dates=True`.
yen_futures = pd.read_csv(
    Path("yen.csv"), index_col="Date", parse_dates=True
)
yen_futures.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Open	High	Low	Last	Change	Settle	Volume	Previous Day Open Interest
Date
1976-08-02	3398.0	3401.0	3398.0	3401.0	NaN	3401.0	2.0	1.0
1976-08-03	3401.0	3401.0	3401.0	3401.0	NaN	3401.0	0.0	1.0
1976-08-04	3401.0	3401.0	3401.0	3401.0	NaN	3401.0	0.0	1.0
1976-08-05	3401.0	3401.0	3401.0	3401.0	NaN	3401.0	0.0	1.0
1976-08-06	3401.0	3401.0	3401.0	3401.0	NaN	3401.0	0.0	1.0

# Trim the dataset to begin on January 1st, 1990.
# Take an explicit copy: later cells add columns to `yen_futures`, and writing
# into a slice of the original frame is a chained-assignment hazard
# (SettingWithCopyWarning) that the global warnings filter would hide.
yen_futures = yen_futures.loc["1990-01-01":, :].copy()
yen_futures.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Open	High	Low	Last	Change	Settle	Volume	Previous Day Open Interest
Date
1990-01-02	6954.0	6954.0	6835.0	6847.0	NaN	6847.0	48336.0	51473.0
1990-01-03	6877.0	6910.0	6865.0	6887.0	NaN	6887.0	38206.0	53860.0
1990-01-04	6937.0	7030.0	6924.0	7008.0	NaN	7008.0	49649.0	55699.0
1990-01-05	6952.0	6985.0	6942.0	6950.0	NaN	6950.0	29944.0	53111.0
1990-01-08	6936.0	6972.0	6936.0	6959.0	NaN	6959.0	19763.0	52072.0

Data Preparation

Returns

# Create a series of "Settle" price percentage returns, drop any NaNs, and check the results:
# (pct_change() is multiplied by 100 to express returns in percent.)
returns = yen_futures["Settle"].pct_change() * 100

# A zero previous price produces +inf or -inf; treat BOTH as missing values
# before dropping NaNs (the original only replaced -inf).
returns = returns.replace([np.inf, -np.inf], np.nan).dropna()

# Column assignment aligns on the Date index, so dates missing from `returns`
# (such as the first row, which has no previous price) get NaN in "Returns".
yen_futures["Returns"] = returns

yen_futures.tail()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Open	High	Low	Last	Change	Settle	Volume	Previous Day Open Interest	Returns
Date
2019-10-09	9381.0	9391.5	9330.5	9343.5	38.5	9338.0	99153.0	145470.0	-0.410601
2019-10-10	9343.5	9380.5	9293.5	9301.0	34.5	9303.5	159397.0	144474.0	-0.369458
2019-10-11	9308.5	9309.0	9240.0	9267.0	52.5	9251.0	158810.0	147471.0	-0.564304
2019-10-14	9259.0	9292.0	9250.5	9261.0	14.0	9265.0	69457.0	153902.0	0.151335
2019-10-15	9264.5	9280.0	9216.5	9220.0	43.5	9221.5	108342.0	151564.0	-0.469509

Lagged Returns

# Build the predictor: the previous row's return, obtained by shifting the
# "Returns" column down by one period.
yen_futures["Lagged_Return"] = yen_futures["Returns"].shift(periods=1)

# Drop every row that has a NaN in ANY column.
# NOTE(review): this is not limited to "Returns"/"Lagged_Return". The earlier
# head() output shows "Change" as NaN for the 1990 rows, and X_train.head()
# below starts at 2014-02-18, so this call appears to discard the pre-2014
# history as well. Confirm this is intended; otherwise restrict the drop with
# subset=["Returns", "Lagged_Return"].
yen_futures.dropna(axis=0, how="any", inplace=True)
yen_futures.tail()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Open	High	Low	Last	Change	Settle	Volume	Previous Day Open Interest	Returns	Lagged_Return
Date
2019-10-09	9381.0	9391.5	9330.5	9343.5	38.5	9338.0	99153.0	145470.0	-0.410601	0.170931
2019-10-10	9343.5	9380.5	9293.5	9301.0	34.5	9303.5	159397.0	144474.0	-0.369458	-0.410601
2019-10-11	9308.5	9309.0	9240.0	9267.0	52.5	9251.0	158810.0	147471.0	-0.564304	-0.369458
2019-10-14	9259.0	9292.0	9250.5	9261.0	14.0	9265.0	69457.0	153902.0	0.151335	-0.564304
2019-10-15	9264.5	9280.0	9216.5	9220.0	43.5	9221.5	108342.0	151564.0	-0.469509	0.151335

Train Test Split

# Chronological train/test split: rows dated 2018 and later form the test set,
# and every row up to the end of 2017 forms the training set.
# Partial-string slicing on the datetime index is inclusive of the named year.
test = yen_futures.loc["2018":]
train = yen_futures.loc[:"2017"]

# Split each partition into features and target:
#   X_train / X_test - one-column DataFrames holding the independent variable
#                      ("Lagged_Return"); scikit-learn expects 2-D feature input.
#   y_train / y_test - Series holding the dependent variable ("Returns").
feature_columns = ["Lagged_Return"]
target_column = "Returns"

# Selecting with a list of columns yields a DataFrame rather than a Series.
X_train = train[feature_columns]
y_train = train[target_column]

X_test = test[feature_columns]
y_test = test[target_column]

X_train.head()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Lagged_Return
Date
2014-02-18	0.409123
2014-02-19	-0.427829
2014-02-20	-0.020460
2014-02-21	0.020465
2014-02-24	-0.204604

Linear Regression Model

# Instantiate an ordinary least-squares regressor with default settings.
model = LinearRegression()

# Estimate the coefficients from the training partition only, so the test
# partition stays unseen. fit() returns the estimator, which the notebook
# displays as the cell output.
model.fit(X=X_train, y=y_train)

LinearRegression()

Make predictions using the Testing Data

Note: We want to evaluate the model using data that it has never seen before, in this case: X_test.

# Predict returns for the held-out test features.
predictions = model.predict(X_test)

# Put the observed test returns ("Returns") and the model's predictions
# ("Predicted Returns") side by side in one DataFrame indexed by date.
Results = y_test.to_frame().assign(**{"Predicted Returns": predictions})

# Plot the first 20 rows, one subplot per column, to compare actual vs predicted.
Results.iloc[:20].plot(subplots=True)

array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f97c2942a10>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f97c2968a10>],
      dtype=object)

Out-of-Sample Performance

Evaluate the model using "out-of-sample" data (X_test and y_test)

# Mean squared error between the observed and predicted test-set returns.
mse = mean_squared_error(
    y_true=Results["Returns"],
    y_pred=Results["Predicted Returns"],
)

# RMSE is the square root of the MSE, which puts the error back in the same
# units as the returns (percent).
rmse = np.sqrt(mse)
print(f"Out-of-Sample Performance Root Mean Squared Error (RMSE): {rmse}")

Out-of-Sample Performance Root Mean Squared Error (RMSE): 0.41545437184712763

In-Sample Performance

Evaluate the model using in-sample data (X_train and y_train)

# Pair the observed training returns with the model's predictions for the same
# rows, i.e. the data the model was fitted on.
in_sample_results = y_train.to_frame().assign(
    **{"In-sample Predictions": model.predict(X_train)}
)

# In-sample mean squared error, for comparison with the out-of-sample figure.
in_sample_mse = mean_squared_error(
    y_true=in_sample_results["Returns"],
    y_pred=in_sample_results["In-sample Predictions"],
)

# In-sample root mean squared error, in the same units as the returns.
in_sample_rmse = np.sqrt(in_sample_mse)
# NOTE(review): the "In-of-Sample" label below looks like a typo for
# "In-Sample"; kept unchanged here so the printed output stays the same.
print(f"In-of-Sample Performance Root Mean Squared Error (RMSE): {in_sample_rmse}")

In-of-Sample Performance Root Mean Squared Error (RMSE): 0.5962037920929946

Conclusions

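The out-of-sample RMSE (about 0.4155) is lower than the in-sample RMSE (about 0.5962). The model's prediction error on the unseen 2018–2019 test data is therefore no larger than its error on the training data, so there is no sign of overfitting. A lower RMSE only indicates smaller errors; on its own it does not show that lagged returns are a statistically significant predictor of returns.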

# Derive the conclusion from the two RMSE values instead of hard-coding
# "is lower than", so the message stays correct if the data or split changes.
# RMSE measures prediction error: a lower value means a closer fit, not
# greater statistical significance.
if rmse < in_sample_rmse:
    comparison = "lower than"
    takeaway = (
        "the model fits the unseen test data at least as well as the "
        "training data (no sign of overfitting)"
    )
elif rmse > in_sample_rmse:
    comparison = "higher than"
    takeaway = "the model fits the training data better than the unseen test data"
else:
    comparison = "equal to"
    takeaway = "the model fits the training and test data equally well"

print(
    f"Out-of-Sample Performance Root Mean Squared Error (RMSE): {rmse} is {comparison} "
    f"In-Sample Performance Root Mean Squared Error (RMSE): {in_sample_rmse}, so {takeaway}"
)

Out-of-Sample Performance Root Mean Squared Error (RMSE): 0.41545437184712763 is lower than In-of-Sample Performance Root Mean Squared Error (RMSE): 0.5962037920929946 so Out-of-Sample data are more significant

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

regression_analysis.md

regression_analysis.md

Regression Analysis: Seasonal Effects with Sklearn Linear Regression

Data Preparation

Returns

Lagged Returns

Train Test Split

Linear Regression Model

Make predictions using the Testing Data

Out-of-Sample Performance

In-Sample Performance

Conclusions

Files

regression_analysis.md

Latest commit

History

regression_analysis.md

File metadata and controls

Regression Analysis: Seasonal Effects with Sklearn Linear Regression

Data Preparation

Returns

Lagged Returns

Train Test Split

Linear Regression Model

Make predictions using the Testing Data

Out-of-Sample Performance

In-Sample Performance

Conclusions