import numpy as np
import pandas as pd
from pathlib import Path
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')
In this notebook, you will build a scikit-learn linear regression model that predicts Yen futures ("Settle") returns from lagged Yen futures returns.
# Futures contract on the Yen-dollar exchange rate:
# This is the continuous chain of the futures contracts that are 1 month to expiration.
# The "Date" column becomes the index and is parsed into datetimes by
# parse_dates=True; the deprecated infer_datetime_format flag is not needed.
yen_futures = pd.read_csv(
    Path("yen.csv"),
    index_col="Date",
    parse_dates=True,
)
yen_futures.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Open | High | Low | Last | Change | Settle | Volume | Previous Day Open Interest | |
---|---|---|---|---|---|---|---|---|
Date | ||||||||
1976-08-02 | 3398.0 | 3401.0 | 3398.0 | 3401.0 | NaN | 3401.0 | 2.0 | 1.0 |
1976-08-03 | 3401.0 | 3401.0 | 3401.0 | 3401.0 | NaN | 3401.0 | 0.0 | 1.0 |
1976-08-04 | 3401.0 | 3401.0 | 3401.0 | 3401.0 | NaN | 3401.0 | 0.0 | 1.0 |
1976-08-05 | 3401.0 | 3401.0 | 3401.0 | 3401.0 | NaN | 3401.0 | 0.0 | 1.0 |
1976-08-06 | 3401.0 | 3401.0 | 3401.0 | 3401.0 | NaN | 3401.0 | 0.0 | 1.0 |
# Trim the dataset to begin on January 1st, 1990.
# Take an explicit copy: a "Returns" column is assigned onto this frame later,
# and assigning into a slice of the full-history frame is the pattern that
# triggers pandas' SettingWithCopyWarning.
yen_futures = yen_futures.loc["1990-01-01":, :].copy()
yen_futures.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Open | High | Low | Last | Change | Settle | Volume | Previous Day Open Interest | |
---|---|---|---|---|---|---|---|---|
Date | ||||||||
1990-01-02 | 6954.0 | 6954.0 | 6835.0 | 6847.0 | NaN | 6847.0 | 48336.0 | 51473.0 |
1990-01-03 | 6877.0 | 6910.0 | 6865.0 | 6887.0 | NaN | 6887.0 | 38206.0 | 53860.0 |
1990-01-04 | 6937.0 | 7030.0 | 6924.0 | 7008.0 | NaN | 7008.0 | 49649.0 | 55699.0 |
1990-01-05 | 6952.0 | 6985.0 | 6942.0 | 6950.0 | NaN | 6950.0 | 29944.0 | 53111.0 |
1990-01-08 | 6936.0 | 6972.0 | 6936.0 | 6959.0 | NaN | 6959.0 | 19763.0 | 52072.0 |
# Create a series of "Settle" price percentage returns, drop any NaNs, and check the results.
# pct_change() gives fractional changes, so multiply by 100 to express them in percent.
returns = yen_futures["Settle"].pct_change() * 100
# Treat both +inf and -inf as missing values, then drop them together with the
# leading NaN that pct_change() produces for the first row.
returns = returns.replace([np.inf, -np.inf], np.nan).dropna()
# Assignment aligns on the Date index: dates missing from `returns` get NaN
# in the new column.
yen_futures["Returns"] = returns
yen_futures.tail()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Open | High | Low | Last | Change | Settle | Volume | Previous Day Open Interest | Returns | |
---|---|---|---|---|---|---|---|---|---|
Date | |||||||||
2019-10-09 | 9381.0 | 9391.5 | 9330.5 | 9343.5 | 38.5 | 9338.0 | 99153.0 | 145470.0 | -0.410601 |
2019-10-10 | 9343.5 | 9380.5 | 9293.5 | 9301.0 | 34.5 | 9303.5 | 159397.0 | 144474.0 | -0.369458 |
2019-10-11 | 9308.5 | 9309.0 | 9240.0 | 9267.0 | 52.5 | 9251.0 | 158810.0 | 147471.0 | -0.564304 |
2019-10-14 | 9259.0 | 9292.0 | 9250.5 | 9261.0 | 14.0 | 9265.0 | 69457.0 | 153902.0 | 0.151335 |
2019-10-15 | 9264.5 | 9280.0 | 9216.5 | 9220.0 | 43.5 | 9221.5 | 108342.0 | 151564.0 | -0.469509 |
# Build the predictor column: each row's "Lagged_Return" is the previous row's "Returns"
yen_futures["Lagged_Return"] = yen_futures["Returns"].shift(periods=1)
# NOTE(review): this removes every row that has a NaN in *any* column, not just
# in the two return columns -- confirm that discarding rows with, e.g., a
# missing "Change" value is intended.
yen_futures.dropna(axis=0, how="any", inplace=True)
yen_futures.tail()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Open | High | Low | Last | Change | Settle | Volume | Previous Day Open Interest | Returns | Lagged_Return | |
---|---|---|---|---|---|---|---|---|---|---|
Date | ||||||||||
2019-10-09 | 9381.0 | 9391.5 | 9330.5 | 9343.5 | 38.5 | 9338.0 | 99153.0 | 145470.0 | -0.410601 | 0.170931 |
2019-10-10 | 9343.5 | 9380.5 | 9293.5 | 9301.0 | 34.5 | 9303.5 | 159397.0 | 144474.0 | -0.369458 | -0.410601 |
2019-10-11 | 9308.5 | 9309.0 | 9240.0 | 9267.0 | 52.5 | 9251.0 | 158810.0 | 147471.0 | -0.564304 | -0.369458 |
2019-10-14 | 9259.0 | 9292.0 | 9250.5 | 9261.0 | 14.0 | 9265.0 | 69457.0 | 153902.0 | 0.151335 | -0.564304 |
2019-10-15 | 9264.5 | 9280.0 | 9216.5 | 9220.0 | 43.5 | 9221.5 | 108342.0 | 151564.0 | -0.469509 | 0.151335 |
# Split chronologically: rows through the end of 2017 are used for training,
# rows from 2018 onward are held out for testing
train = yen_futures.loc[:"2017"]
test = yen_futures.loc["2018":]
# Separate features from target for both splits:
#   X_train / X_test -> one-column dataframes holding the independent variable ("Lagged_Return")
#   y_train / y_test -> series holding the dependent variable ("Returns")
X_train = train[["Lagged_Return"]]
X_test = test[["Lagged_Return"]]
y_train = train["Returns"]
y_test = test["Returns"]
X_train.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Lagged_Return | |
---|---|
Date | |
2014-02-18 | 0.409123 |
2014-02-19 | -0.427829 |
2014-02-20 | -0.020460 |
2014-02-21 | 0.020465 |
2014-02-24 | -0.204604 |
# Instantiate a scikit-learn LinearRegression estimator with its default settings
model = LinearRegression()
# Train it on the training split only: X_train as features, y_train as target
model.fit(X=X_train, y=y_train)
LinearRegression()
Note: We want to evaluate the model using data that it has never seen before, in this case: X_test.
# Predict returns for the held-out test features
predictions = model.predict(X_test)
# Put the actual test returns and the model's predictions side by side in one dataframe
Results = y_test.to_frame().assign(**{"Predicted Returns": predictions})
# Plot the first 20 rows, drawing each column in its own subplot
Results.head(20).plot(subplots=True)
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f97c2942a10>,
<matplotlib.axes._subplots.AxesSubplot object at 0x7f97c2968a10>],
dtype=object)
Evaluate the model using "out-of-sample" data (X_test and y_test)
# Mean squared error (MSE) between the actual and the predicted test returns
mse = mean_squared_error(
    y_true=Results["Returns"],
    y_pred=Results["Predicted Returns"],
)
# The root-mean-squared error (RMSE) is the square root of the MSE
rmse = np.sqrt(mse)
print(f"Out-of-Sample Performance Root Mean Squared Error (RMSE): {rmse}")
Out-of-Sample Performance Root Mean Squared Error (RMSE): 0.41545437184712763
Evaluate the model using in-sample data (X_train and y_train)
# Construct a dataframe using just the "y" training data:
in_sample_results = y_train.to_frame()
# Add a column of "in-sample" predictions (model output on the data it was fit on):
in_sample_results["In-sample Predictions"] = model.predict(X_train)
# Calculate the in-sample mean squared error (for comparison to out-of-sample)
in_sample_mse = mean_squared_error(
    in_sample_results["Returns"],
    in_sample_results["In-sample Predictions"],
)
# Calculate the in-sample root-mean-squared error (for comparison to out-of-sample)
in_sample_rmse = np.sqrt(in_sample_mse)
# Label corrected from "In-of-Sample" to "In-Sample"
print(f"In-Sample Performance Root Mean Squared Error (RMSE): {in_sample_rmse}")
In-of-Sample Performance Root Mean Squared Error (RMSE): 0.5962037920929946
YOUR CONCLUSIONS HERE!
# Derive the conclusion from the two error values instead of hard-coding which
# one is lower, so the printed statement stays true if the data or model changes.
if rmse < in_sample_rmse:
    relation = "lower than"
    verdict = "the model predicts unseen data at least as well as the training data (no sign of overfitting)"
elif rmse > in_sample_rmse:
    relation = "higher than"
    verdict = "the model fits the training data better than unseen data (possible overfitting)"
else:
    relation = "equal to"
    verdict = "the model performs the same on training and unseen data"
print(
    f"Out-of-Sample Performance Root Mean Squared Error (RMSE): {rmse} is {relation} "
    f"In-Sample Performance Root Mean Squared Error (RMSE): {in_sample_rmse}, so {verdict}"
)
Out-of-Sample Performance Root Mean Squared Error (RMSE): 0.41545437184712763 is lower than In-of-Sample Performance Root Mean Squared Error (RMSE): 0.5962037920929946 so Out-of-Sample data are more significant