diff --git a/opengrid/library/regression.py b/opengrid/library/regression.py index 5243b68..e1499de 100644 --- a/opengrid/library/regression.py +++ b/opengrid/library/regression.py @@ -12,6 +12,7 @@ import numpy as np import statsmodels.formula.api as fm from statsmodels.sandbox.regression.predstd import wls_prediction_std +from patsy import ModelDesc, Term, LookupFactor from copy import deepcopy import re @@ -22,9 +23,9 @@ class MultiVarLinReg(Analysis): """ Multi-variable linear regression based on statsmodels and Ordinary Least Squares (ols) - Pass a dataframe with the variable to be modelled (endogenous variable) and the possible independent (exogenous) - variables. Specify as string the name of the endogenous variable, and optionally pass a list with names of - exogenous variables to try (by default all other columns will be tried as exogenous variables). + Pass a dataframe with the variable to be modelled y (dependent variable) and the possible independent variables x. + Specify as string the name of the dependent variable, and optionally pass a list with names of + independent variables to try (by default all other columns will be tried as independent variables). The analysis is based on a forward-selection approach: starting from a simple model, the model is iteratively refined and verified until no statistical relevant improvements can be obtained. Each model in the iteration loop @@ -38,25 +39,25 @@ class MultiVarLinReg(Analysis): -------- >> mvlr = MultiVarLinReg(df, 'gas', p_max=0.04) - >> mvlr = MultiVarLinReg(df, 'gas', list_of_exog=['heatingDegreeDays14', 'GlobalHorizontalIrradiance', 'WindSpeed']) + >> mvlr = MultiVarLinReg(df, 'gas', list_of_x=['heatingDegreeDays14', 'GlobalHorizontalIrradiance', 'WindSpeed']) """ - def __init__(self, df, endog, **kwargs): + def __init__(self, df, y, **kwargs): """ Parameters ---------- df : pd.DataFrame Datetimeindex and both endogenous and exogenous variables as columns - endog : str + y : str Name of the endogeneous variable to model p_max : float (default=0.05) Acceptable p-value of the t-statistic for estimated parameters - list_of_exog : list of str (default=None) + list_of_x : list of str (default=None) If None (default), try to build a model with all columns in the dataframe - If a list with column names is given, only try these columns as exogenous variables + If a list with column names is given, only try these columns as independent variables confint : float, default=0.95 Two-sided confidence interval for predictions. cross_validation : bool, default=False @@ -68,20 +69,20 @@ def __init__(self, df, endog, **kwargs): For gas consumption or PV production, this is not physical so allow_negative_predictions should be False """ self.df = df.copy() # type: pd.DataFrame - assert endog in self.df.columns, "The endogenous variable {} is not a column in the dataframe".format(endog) - self.endog = endog + assert y in self.df.columns, "The dependent variable {} is not a column in the dataframe".format(y) + self.y = y self.p_max = kwargs.get('p_max', 0.05) - self.list_of_exog = kwargs.get('list_of_exog', self.df.columns.tolist()) + self.list_of_x = kwargs.get('list_of_x', self.df.columns.tolist()) self.confint = kwargs.get('confint', 0.95) self.cross_validation = kwargs.get('cross_validation', False) self.allow_negative_predictions = kwargs.get('allow_negative_predictions', False) try: - self.list_of_exog.remove(self.endog) + self.list_of_x.remove(self.y) except ValueError: pass - self.do_analysis() + #self.do_analysis() def do_analysis(self): """ @@ -96,34 +97,44 @@ def do_analysis(self): def _do_analysis_no_cross_validation(self): """ Find the best model (fit) and create self.list_of_fits and self.fit - """ self.list_of_fits = [] # first model is just the mean - self.list_of_fits.append(fm.ols(formula="Q('{}') ~ 1".format(self.endog), data=self.df).fit()) + response_term = [Term([LookupFactor(self.y)])] + model_terms = [Term([])] # empty term is the intercept + all_model_terms_dict = {x:Term([LookupFactor(x)]) for x in self.list_of_x} + # ...then add another term for each candidate + #model_terms += [Term([LookupFactor(c)]) for c in candidates] + model_desc = ModelDesc(response_term, model_terms) + self.list_of_fits.append(fm.ols(model_desc, data=self.df).fit()) # try to improve the model until no improvements can be found - all_exog = self.list_of_exog[:] - while all_exog: - # try each x in all_exog and overwrite the best_fit if we find a better one + + while all_model_terms_dict: + # try each x and overwrite the best_fit if we find a better one # the first best_fit is the one from the previous round - best_fit = deepcopy(self.list_of_fits[-1]) - for x in all_exog: + best_fit = self.list_of_fits[-1] + best_bic = best_fit.bic + for x, term in all_model_terms_dict.items(): # make new_fit, compare with best found so far - formula = self.list_of_fits[-1].model.formula + "+Q('{}')".format(x) - fit = fm.ols(formula=formula, data=self.df).fit() - best_fit = self.find_best_bic([best_fit, fit]) + model_desc = ModelDesc(response_term, best_fit.model.formula.rhs_termlist + [term]) + fit = fm.ols(model_desc, data=self.df).fit() + if fit.bic < best_bic: + best_bic = fit.bic + best_fit = fit + best_x = x + #best_fit = self.find_best_bic([best_fit, fit]) # Sometimes, the obtained fit may be better, but contains unsignificant parameters. # Correct the fit by removing the unsignificant parameters and estimate again - best_fit = self._prune(best_fit, p_max=self.p_max) + #best_fit = self._prune(best_fit, p_max=self.p_max) # if best_fit does not contain more variables than last fit in self.list_of_fits, exit - if best_fit.model.formula in self.list_of_fits[-1].model.formula: + if len(best_fit.model.formula.rhs_termlist) == len(self.list_of_fits[-1].model.formula.rhs_termlist): break else: self.list_of_fits.append(best_fit) - all_exog.remove(x) + all_model_terms_dict.pop(best_x) self.fit = self.list_of_fits[-1] def _do_analysis_cross_validation(self): @@ -135,19 +146,19 @@ def _do_analysis_cross_validation(self): # initialization: first model is the mean, but compute cv correctly. errors = [] - formula = "Q('{}') ~ 1".format(self.endog) + formula = "Q('{}') ~ 1".format(self.y) for i in self.df.index: # make new_fit, compute cross-validation and store error df_ = self.df.drop(i, axis=0) fit = fm.ols(formula=formula, data=df_).fit() cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :]) - errors.append(cross_prediction['predicted'] - cross_prediction[self.endog]) + errors.append(cross_prediction['predicted'] - cross_prediction[self.y]) self.list_of_fits = [fm.ols(formula=formula, data=self.df).fit()] self.list_of_cverrors = [np.mean(np.abs(np.array(errors)))] # try to improve the model until no improvements can be found - all_exog = self.list_of_exog[:] + all_exog = self.list_of_x[:] while all_exog: # import pdb;pdb.set_trace() # try each x in all_exog and overwrite if we find a better one @@ -164,7 +175,7 @@ def _do_analysis_cross_validation(self): df_ = self.df.drop(i, axis=0) fit = fm.ols(formula=formula, data=df_).fit() cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :]) - errors.append(cross_prediction['predicted'] - cross_prediction[self.endog]) + errors.append(cross_prediction['predicted'] - cross_prediction[self.y]) cverror = np.mean(np.abs(np.array(errors))) # compare the model with the current fit if cverror < best['cverror']: @@ -300,14 +311,8 @@ def _predict(self, fit, df): if not self.allow_negative_predictions: df_res.loc[df_res['predicted'] < 0, 'predicted'] = 0 - def rename(x): - if x == 'Intercept': - return x - else: - return self.quote(x) - prstd, interval_l, interval_u = wls_prediction_std(fit, - df_res.rename(columns=rename)[fit.model.exog_names], + df_res[fit.model.exog_names], alpha=1 - self.confint) df_res['interval_l'] = interval_l df_res['interval_u'] = interval_u @@ -384,7 +389,7 @@ def plot(self, model=True, bar_chart=True, **kwargs): try: exog1 = fit.model.formula.split('+')[1].strip() except IndexError: - exog1 = self.list_of_exog[0] + exog1 = self.list_of_x[0] exog1 = self._unquote(exog1) # plot model as an adjusted trendline @@ -397,10 +402,10 @@ def plot(self, model=True, bar_chart=True, **kwargs): plt.plot(dfmodel.index, dfmodel['interval_u'], ':', color='royalblue') # plot dots for the measurements if len(df_auto) > 0: - plt.plot(df_auto[exog1], df_auto[self.endog], 'o', mfc='orangered', mec='orangered', ms=8, + plt.plot(df_auto[exog1], df_auto[self.y], 'o', mfc='orangered', mec='orangered', ms=8, label='Data used for model fitting') if len(df_prog) > 0: - plt.plot(df_prog[exog1], df_prog[self.endog], 'o', mfc='seagreen', mec='seagreen', ms=8, + plt.plot(df_prog[exog1], df_prog[self.y], 'o', mfc='seagreen', mec='seagreen', ms=8, label='Data not used for model fitting') plt.title('{} - rsquared={} - BIC={}'.format(fit.model.formula, fit.rsquared, fit.bic)) figures.append(plt.gcf()) @@ -413,17 +418,17 @@ def plot(self, model=True, bar_chart=True, **kwargs): title = 'Measured' # will be appended based on the available data if len(df_auto) > 0: model = ax.bar(ind[:len(df_auto)], df_auto['predicted'], width * 2, color='#FDD787', ecolor='#FDD787', - yerr=df_auto['interval_u'] - df_auto['predicted'], label=self.endog + ' modelled') + yerr=df_auto['interval_u'] - df_auto['predicted'], label=self.y + ' modelled') title = title + ', modelled' if len(df_prog) > 0: prog = ax.bar(ind[len(df_auto):], df_prog['predicted'], width * 2, color='#6CD5A1', ecolor='#6CD5A1', - yerr=df_prog['interval_u'] - df_prog['predicted'], label=self.endog + ' expected') + yerr=df_prog['interval_u'] - df_prog['predicted'], label=self.y + ' expected') title = title + ' and predicted' - meas = ax.bar(ind + width / 2., df[self.endog], width, label=self.endog + ' measured', color='#D5756C') + meas = ax.bar(ind + width / 2., df[self.y], width, label=self.y + ' measured', color='#D5756C') # add some text for labels, title and axes ticks - ax.set_ylabel(self.endog) - ax.set_title('{} {}'.format(title, self.endog)) + ax.set_ylabel(self.y) + ax.set_title('{} {}'.format(title, self.y)) ax.set_xticks(ind + width) ax.set_xticklabels([x.strftime('%d-%m-%Y') for x in df.index], rotation='vertical') ax.yaxis.grid(True) diff --git a/opengrid/tests/test_regression.py b/opengrid/tests/test_regression.py index eefce9d..2aaa4b0 100644 --- a/opengrid/tests/test_regression.py +++ b/opengrid/tests/test_regression.py @@ -25,6 +25,7 @@ def test_init(self): df = datasets.get('gas_2016_hour') df_month = df.resample('MS').sum() mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04) + mvlr.do_analysis() self.assertTrue(hasattr(mvlr, 'list_of_fits')) def test_strange_names(self): @@ -32,6 +33,7 @@ def test_strange_names(self): df_month = df.resample('MS').sum() df_month.rename(columns={'d5a7': '3*tempĂȘte !'}, inplace=True) mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04) + mvlr.do_analysis() self.assertTrue(hasattr(mvlr, 'list_of_fits')) def test_predict(self): @@ -39,6 +41,7 @@ def test_predict(self): df_month = df.resample('MS').sum() df_month.rename(columns={'d5a7': '3*tempĂȘte !'}, inplace=True) mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04) + mvlr.do_analysis() mvlr.add_prediction() self.assertListEqual(mvlr.df.columns.tolist(), @@ -48,6 +51,7 @@ def test_cross_validation(self): df = datasets.get('gas_2016_hour') df_month = df.resample('MS').sum() mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04, cross_validation=True) + mvlr.do_analysis() self.assertTrue(hasattr(mvlr, 'list_of_fits')) def test_prediction(self): @@ -56,6 +60,7 @@ def test_prediction(self): df_training = df_month.iloc[:-1, :] df_pred = df_month.iloc[[-1], :] mvlr = og.MultiVarLinReg(df_training, '313b', p_max=0.04) + mvlr.do_analysis() df_pred_95 = mvlr._predict(mvlr.fit, df=df_pred) mvlr.confint = 0.98 df_pred_98 = mvlr._predict(mvlr.fit, df=df_pred) @@ -73,6 +78,7 @@ def test_plot(self): df = datasets.get('gas_2016_hour') df_month = df.resample('MS').sum() mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04) + mvlr.do_analysis() with mock.patch.object(plt_mocked, 'subplots', return_value=(fig_mock, ax_mock)): mvlr.plot() @@ -81,6 +87,7 @@ def test_alternative_metrics(self): df = datasets.get('gas_2016_hour') df_month = df.resample('MS').sum() mvlr = og.MultiVarLinReg(df_month, '313b', p_max=0.04) + mvlr.do_analysis() best_rsquared = mvlr.find_best_rsquared(mvlr.list_of_fits) best_akaike = mvlr.find_best_akaike(mvlr.list_of_fits) best_bic = mvlr.find_best_bic(mvlr.list_of_fits) @@ -92,11 +99,12 @@ def test_prune(self): df = datasets.get('gas_2016_hour') df_month = df.resample('MS').sum() mvlr = og.MultiVarLinReg(df_month, '313b') - self.assertTrue("Q('d5a7')" in mvlr.fit.model.exog_names) + mvlr.do_analysis() + self.assertTrue("d5a7" in mvlr.fit.model.exog_names) pruned = mvlr._prune(mvlr.fit, 0.05) - self.assertTrue("Q('d5a7')" in pruned.model.exog_names) + self.assertTrue("d5a7" in pruned.model.exog_names) pruned = mvlr._prune(mvlr.fit, 0.0001) - self.assertFalse("Q('d5a7')" in pruned.model.exog_names) + self.assertFalse("d5a7" in pruned.model.exog_names) if __name__ == '__main__':