Skip to content

Commit

Permalink
Creating the model directly with LookupFactor and Term works and is m…
Browse files Browse the repository at this point in the history
…ore elegant than the Q('') escaping.
  • Loading branch information
saroele committed Feb 8, 2018
1 parent a72c7e4 commit baa2cc6
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 48 deletions.
95 changes: 50 additions & 45 deletions opengrid/library/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import numpy as np
import statsmodels.formula.api as fm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from patsy import ModelDesc, Term, LookupFactor
from copy import deepcopy
import re

Expand All @@ -22,9 +23,9 @@ class MultiVarLinReg(Analysis):
"""
Multi-variable linear regression based on statsmodels and Ordinary Least Squares (ols)
Pass a dataframe with the variable to be modelled (endogenous variable) and the possible independent (exogenous)
variables. Specify as string the name of the endogenous variable, and optionally pass a list with names of
exogenous variables to try (by default all other columns will be tried as exogenous variables).
Pass a dataframe with the variable to be modelled y (dependent variable) and the possible independent variables x.
Specify as string the name of the dependent variable, and optionally pass a list with names of
independent variables to try (by default all other columns will be tried as independent variables).
The analysis is based on a forward-selection approach: starting from a simple model, the model is iteratively
refined and verified until no statistical relevant improvements can be obtained. Each model in the iteration loop
Expand All @@ -38,25 +39,25 @@ class MultiVarLinReg(Analysis):
--------
>> mvlr = MultiVarLinReg(df, 'gas', p_max=0.04)
>> mvlr = MultiVarLinReg(df, 'gas', list_of_exog=['heatingDegreeDays14', 'GlobalHorizontalIrradiance', 'WindSpeed'])
>> mvlr = MultiVarLinReg(df, 'gas', list_of_x=['heatingDegreeDays14', 'GlobalHorizontalIrradiance', 'WindSpeed'])
"""

def __init__(self, df, endog, **kwargs):
def __init__(self, df, y, **kwargs):
"""
Parameters
----------
df : pd.DataFrame
Datetimeindex and both endogenous and exogenous variables as columns
endog : str
y : str
Name of the endogeneous variable to model
p_max : float (default=0.05)
Acceptable p-value of the t-statistic for estimated parameters
list_of_exog : list of str (default=None)
list_of_x : list of str (default=None)
If None (default), try to build a model with all columns in the dataframe
If a list with column names is given, only try these columns as exogenous variables
If a list with column names is given, only try these columns as independent variables
confint : float, default=0.95
Two-sided confidence interval for predictions.
cross_validation : bool, default=False
Expand All @@ -68,20 +69,20 @@ def __init__(self, df, endog, **kwargs):
For gas consumption or PV production, this is not physical so allow_negative_predictions should be False
"""
self.df = df.copy() # type: pd.DataFrame
assert endog in self.df.columns, "The endogenous variable {} is not a column in the dataframe".format(endog)
self.endog = endog
assert y in self.df.columns, "The dependent variable {} is not a column in the dataframe".format(y)
self.y = y

self.p_max = kwargs.get('p_max', 0.05)
self.list_of_exog = kwargs.get('list_of_exog', self.df.columns.tolist())
self.list_of_x = kwargs.get('list_of_x', self.df.columns.tolist())
self.confint = kwargs.get('confint', 0.95)
self.cross_validation = kwargs.get('cross_validation', False)
self.allow_negative_predictions = kwargs.get('allow_negative_predictions', False)
try:
self.list_of_exog.remove(self.endog)
self.list_of_x.remove(self.y)
except ValueError:
pass

self.do_analysis()
#self.do_analysis()

def do_analysis(self):
"""
Expand All @@ -96,34 +97,44 @@ def do_analysis(self):
def _do_analysis_no_cross_validation(self):
"""
Find the best model (fit) and create self.list_of_fits and self.fit
"""

self.list_of_fits = []
# first model is just the mean
self.list_of_fits.append(fm.ols(formula="Q('{}') ~ 1".format(self.endog), data=self.df).fit())
response_term = [Term([LookupFactor(self.y)])]
model_terms = [Term([])] # empty term is the intercept
all_model_terms_dict = {x:Term([LookupFactor(x)]) for x in self.list_of_x}
# ...then add another term for each candidate
#model_terms += [Term([LookupFactor(c)]) for c in candidates]
model_desc = ModelDesc(response_term, model_terms)
self.list_of_fits.append(fm.ols(model_desc, data=self.df).fit())
# try to improve the model until no improvements can be found
all_exog = self.list_of_exog[:]
while all_exog:
# try each x in all_exog and overwrite the best_fit if we find a better one

while all_model_terms_dict:
# try each x and overwrite the best_fit if we find a better one
# the first best_fit is the one from the previous round
best_fit = deepcopy(self.list_of_fits[-1])
for x in all_exog:
best_fit = self.list_of_fits[-1]
best_bic = best_fit.bic
for x, term in all_model_terms_dict.items():
# make new_fit, compare with best found so far
formula = self.list_of_fits[-1].model.formula + "+Q('{}')".format(x)
fit = fm.ols(formula=formula, data=self.df).fit()
best_fit = self.find_best_bic([best_fit, fit])
model_desc = ModelDesc(response_term, best_fit.model.formula.rhs_termlist + [term])
fit = fm.ols(model_desc, data=self.df).fit()
if fit.bic < best_bic:
best_bic = fit.bic
best_fit = fit
best_x = x
#best_fit = self.find_best_bic([best_fit, fit])

# Sometimes, the obtained fit may be better, but contains unsignificant parameters.
# Correct the fit by removing the unsignificant parameters and estimate again
best_fit = self._prune(best_fit, p_max=self.p_max)
#best_fit = self._prune(best_fit, p_max=self.p_max)

# if best_fit does not contain more variables than last fit in self.list_of_fits, exit
if best_fit.model.formula in self.list_of_fits[-1].model.formula:
if len(best_fit.model.formula.rhs_termlist) == len(self.list_of_fits[-1].model.formula.rhs_termlist):
break
else:
self.list_of_fits.append(best_fit)
all_exog.remove(x)
all_model_terms_dict.pop(best_x)
self.fit = self.list_of_fits[-1]

def _do_analysis_cross_validation(self):
Expand All @@ -135,19 +146,19 @@ def _do_analysis_cross_validation(self):

# initialization: first model is the mean, but compute cv correctly.
errors = []
formula = "Q('{}') ~ 1".format(self.endog)
formula = "Q('{}') ~ 1".format(self.y)
for i in self.df.index:
# make new_fit, compute cross-validation and store error
df_ = self.df.drop(i, axis=0)
fit = fm.ols(formula=formula, data=df_).fit()
cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :])
errors.append(cross_prediction['predicted'] - cross_prediction[self.endog])
errors.append(cross_prediction['predicted'] - cross_prediction[self.y])

self.list_of_fits = [fm.ols(formula=formula, data=self.df).fit()]
self.list_of_cverrors = [np.mean(np.abs(np.array(errors)))]

# try to improve the model until no improvements can be found
all_exog = self.list_of_exog[:]
all_exog = self.list_of_x[:]
while all_exog:
# import pdb;pdb.set_trace()
# try each x in all_exog and overwrite if we find a better one
Expand All @@ -164,7 +175,7 @@ def _do_analysis_cross_validation(self):
df_ = self.df.drop(i, axis=0)
fit = fm.ols(formula=formula, data=df_).fit()
cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :])
errors.append(cross_prediction['predicted'] - cross_prediction[self.endog])
errors.append(cross_prediction['predicted'] - cross_prediction[self.y])
cverror = np.mean(np.abs(np.array(errors)))
# compare the model with the current fit
if cverror < best['cverror']:
Expand Down Expand Up @@ -300,14 +311,8 @@ def _predict(self, fit, df):
if not self.allow_negative_predictions:
df_res.loc[df_res['predicted'] < 0, 'predicted'] = 0

def rename(x):
if x == 'Intercept':
return x
else:
return self.quote(x)

prstd, interval_l, interval_u = wls_prediction_std(fit,
df_res.rename(columns=rename)[fit.model.exog_names],
df_res[fit.model.exog_names],
alpha=1 - self.confint)
df_res['interval_l'] = interval_l
df_res['interval_u'] = interval_u
Expand Down Expand Up @@ -384,7 +389,7 @@ def plot(self, model=True, bar_chart=True, **kwargs):
try:
exog1 = fit.model.formula.split('+')[1].strip()
except IndexError:
exog1 = self.list_of_exog[0]
exog1 = self.list_of_x[0]
exog1 = self._unquote(exog1)

# plot model as an adjusted trendline
Expand All @@ -397,10 +402,10 @@ def plot(self, model=True, bar_chart=True, **kwargs):
plt.plot(dfmodel.index, dfmodel['interval_u'], ':', color='royalblue')
# plot dots for the measurements
if len(df_auto) > 0:
plt.plot(df_auto[exog1], df_auto[self.endog], 'o', mfc='orangered', mec='orangered', ms=8,
plt.plot(df_auto[exog1], df_auto[self.y], 'o', mfc='orangered', mec='orangered', ms=8,
label='Data used for model fitting')
if len(df_prog) > 0:
plt.plot(df_prog[exog1], df_prog[self.endog], 'o', mfc='seagreen', mec='seagreen', ms=8,
plt.plot(df_prog[exog1], df_prog[self.y], 'o', mfc='seagreen', mec='seagreen', ms=8,
label='Data not used for model fitting')
plt.title('{} - rsquared={} - BIC={}'.format(fit.model.formula, fit.rsquared, fit.bic))
figures.append(plt.gcf())
Expand All @@ -413,17 +418,17 @@ def plot(self, model=True, bar_chart=True, **kwargs):
title = 'Measured' # will be appended based on the available data
if len(df_auto) > 0:
model = ax.bar(ind[:len(df_auto)], df_auto['predicted'], width * 2, color='#FDD787', ecolor='#FDD787',
yerr=df_auto['interval_u'] - df_auto['predicted'], label=self.endog + ' modelled')
yerr=df_auto['interval_u'] - df_auto['predicted'], label=self.y + ' modelled')
title = title + ', modelled'
if len(df_prog) > 0:
prog = ax.bar(ind[len(df_auto):], df_prog['predicted'], width * 2, color='#6CD5A1', ecolor='#6CD5A1',
yerr=df_prog['interval_u'] - df_prog['predicted'], label=self.endog + ' expected')
yerr=df_prog['interval_u'] - df_prog['predicted'], label=self.y + ' expected')
title = title + ' and predicted'

meas = ax.bar(ind + width / 2., df[self.endog], width, label=self.endog + ' measured', color='#D5756C')
meas = ax.bar(ind + width / 2., df[self.y], width, label=self.y + ' measured', color='#D5756C')
# add some text for labels, title and axes ticks
ax.set_ylabel(self.endog)
ax.set_title('{} {}'.format(title, self.endog))
ax.set_ylabel(self.y)
ax.set_title('{} {}'.format(title, self.y))
ax.set_xticks(ind + width)
ax.set_xticklabels([x.strftime('%d-%m-%Y') for x in df.index], rotation='vertical')
ax.yaxis.grid(True)
Expand Down
14 changes: 11 additions & 3 deletions opengrid/tests/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,23 @@ def test_init(self):
df = datasets.get('gas_2016_hour')
df_month = df.resample('MS').sum()
mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04)
mvlr.do_analysis()
self.assertTrue(hasattr(mvlr, 'list_of_fits'))

def test_strange_names(self):
df = datasets.get('gas_2016_hour')
df_month = df.resample('MS').sum()
df_month.rename(columns={'d5a7': '3*tempête !'}, inplace=True)
mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04)
mvlr.do_analysis()
self.assertTrue(hasattr(mvlr, 'list_of_fits'))

def test_predict(self):
df = datasets.get('gas_2016_hour')
df_month = df.resample('MS').sum()
df_month.rename(columns={'d5a7': '3*tempête !'}, inplace=True)
mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04)
mvlr.do_analysis()
mvlr.add_prediction()

self.assertListEqual(mvlr.df.columns.tolist(),
Expand All @@ -48,6 +51,7 @@ def test_cross_validation(self):
df = datasets.get('gas_2016_hour')
df_month = df.resample('MS').sum()
mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04, cross_validation=True)
mvlr.do_analysis()
self.assertTrue(hasattr(mvlr, 'list_of_fits'))

def test_prediction(self):
Expand All @@ -56,6 +60,7 @@ def test_prediction(self):
df_training = df_month.iloc[:-1, :]
df_pred = df_month.iloc[[-1], :]
mvlr = og.MultiVarLinReg(df_training, '313b', p_max=0.04)
mvlr.do_analysis()
df_pred_95 = mvlr._predict(mvlr.fit, df=df_pred)
mvlr.confint = 0.98
df_pred_98 = mvlr._predict(mvlr.fit, df=df_pred)
Expand All @@ -73,6 +78,7 @@ def test_plot(self):
df = datasets.get('gas_2016_hour')
df_month = df.resample('MS').sum()
mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04)
mvlr.do_analysis()

with mock.patch.object(plt_mocked, 'subplots', return_value=(fig_mock, ax_mock)):
mvlr.plot()
Expand All @@ -81,6 +87,7 @@ def test_alternative_metrics(self):
df = datasets.get('gas_2016_hour')
df_month = df.resample('MS').sum()
mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04)
mvlr.do_analysis()
best_rsquared = mvlr.find_best_rsquared(mvlr.list_of_fits)
best_akaike = mvlr.find_best_akaike(mvlr.list_of_fits)
best_bic = mvlr.find_best_bic(mvlr.list_of_fits)
Expand All @@ -92,11 +99,12 @@ def test_prune(self):
df = datasets.get('gas_2016_hour')
df_month = df.resample('MS').sum()
mvlr = og.MultiVarLinReg(df_month, '313b')
self.assertTrue("Q('d5a7')" in mvlr.fit.model.exog_names)
mvlr.do_analysis()
self.assertTrue("d5a7" in mvlr.fit.model.exog_names)
pruned = mvlr._prune(mvlr.fit, 0.05)
self.assertTrue("Q('d5a7')" in pruned.model.exog_names)
self.assertTrue("d5a7" in pruned.model.exog_names)
pruned = mvlr._prune(mvlr.fit, 0.0001)
self.assertFalse("Q('d5a7')" in pruned.model.exog_names)
self.assertFalse("d5a7" in pruned.model.exog_names)


if __name__ == '__main__':
Expand Down

0 comments on commit baa2cc6

Please sign in to comment.