diff --git a/README.md b/README.md index 9d1de1d..4095641 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ## Description -``mssm`` is a toolbox to estimate Generalized Additive Mixed Models (GAMMs) and more general smooth models such as semi Markov-switching GAMMs (sMs-GAMMs; experimental) and sMs Impulse Response GAMMs (sMs-IR-GAMMs; experimental). The ``main`` branch is updated frequently to reflect new developments. The ``stable`` branch should reflect the latest releases. If you don't need the newest functionality, you should install from the ``stable`` branch (see below for instructions). +``mssm`` is a toolbox to estimate Generalized Additive Mixed Models (GAMMs), Generalized Additive Mixed Models of Location Scale and Shape (GAMMLSS), and more general smooth models such as semi Markov-switching GAMMs (sMs-GAMMs; experimental) and sMs Impulse Response GAMMs (sMs-IR-GAMMs; experimental). The ``main`` branch is updated frequently to reflect new developments. The ``stable`` branch should reflect the latest releases. If you don't need the newest functionality, you should install from the ``stable`` branch (see below for instructions). ## Installation diff --git a/src/mssm/models.py b/src/mssm/models.py index 40759f8..64ba301 100644 --- a/src/mssm/models.py +++ b/src/mssm/models.py @@ -2,10 +2,10 @@ import scipy as scp import copy from collections.abc import Callable -from .src.python.formula import Formula,PFormula,PTerm,build_sparse_matrix_from_formula,VarType,lhs,ConstType,Constraint,pd -from .src.python.exp_fam import Link,Logit,Family,Binomial,Gaussian +from .src.python.formula import Formula,PFormula,PTerm,build_sparse_matrix_from_formula,VarType,lhs,ConstType,Constraint,pd,embed_shared_penalties +from .src.python.exp_fam import Link,Logit,Identity,LOG,Family,Binomial,Gaussian,GAMLSSFamily,GAUMLSS from .src.python.sem import anneal_temps_zero,const_temps,compute_log_probs,pre_ll_sms_gamm,se_step_sms_gamm,decode_local,se_step_sms_dc_gamm,pre_ll_sms_IR_gamm,init_states_IR,compute_hsmm_probabilities -from .src.python.gamm_solvers import solve_gamm_sparse,mp,repeat,tqdm,cpp_cholP,apply_eigen_perm,compute_Linv,solve_gamm_sparse2 +from .src.python.gamm_solvers import solve_gamm_sparse,mp,repeat,tqdm,cpp_cholP,apply_eigen_perm,compute_Linv,solve_gamm_sparse2,solve_gammlss_sparse from .src.python.terms import TermType,GammTerm,i,f,fs,irf,l,li,ri,rs from .src.python.penalties import PenType from .src.python.utils import sample_MVN,REML @@ -39,8 +39,8 @@ def __init__(self, self.cpus=cpus # Current estimates - self.__coef = None - self.__scale = None + self.coef = None + self.scale = None self.__TR = None self.__pi = None @@ -109,7 +109,7 @@ def __init__(self, def get_pars(self): """ Returns a tuple. The first entry is a np.array with all estimated coefficients. The second entry is the estimated scale parameter. Will contain Nones before fitting.""" - return self.__coef,self.__scale + return self.coef,self.scale def get_llk(self,penalized:bool=True,ext_scale:float or None=None): """Get the (penalized) log-likelihood of the estimated model given the trainings data. 
LLK can optionally be evaluated for an external scale parameter ``ext_scale``.""" @@ -122,7 +122,7 @@ def get_llk(self,penalized:bool=True,ext_scale:float or None=None): if isinstance(self.family,Gaussian) == False: mu = self.family.link.fi(self.pred) if self.family.twopar: - scale = self.__scale + scale = self.scale if not ext_scale is None: scale = ext_scale return self.family.llk(self.formula.y_flat[self.formula.NOT_NA_flat],mu,scale) - pen @@ -233,16 +233,16 @@ def get_reml(self): if not isinstance(self.family,Gaussian): raise TypeError("REML score can currently only be obtained for Gaussian additive models. It can be computed via the REML function in mssm.src.python.utils when H=X.T@W@X is formed manually.") - if self.__coef is None or self.formula.penalties is None: + if self.coef is None or self.formula.penalties is None: raise ValueError("Model needs to be estimated before evaluating the REML score. Call model.fit()") scale = self.family.scale if self.family.twopar: - scale = self.__scale # Estimated scale + scale = self.scale # Estimated scale X = self.get_mmat() llk = self.get_llk(False) - reml = REML(llk,(X.T@X).tocsc(),self.__coef,scale,self.formula.penalties) + reml = REML(llk,(X.T@X).tocsc(),self.coef,scale,self.formula.penalties) return reml @@ -358,8 +358,8 @@ def fit(self,maxiter=50,conv_tol=1e-7,extend_lambda=True,control_lambda=True,exc len(self.formula.discretize) == 0, progress_bar,n_cores) - self.__coef = coef - self.__scale = scale # ToDo: scale name is used in another context for more general mssm.. + self.coef = coef + self.scale = scale # ToDo: scale name is used in another context for more general mssm.. self.pred = eta self.res = wres self.edf = edf @@ -386,9 +386,9 @@ def sample_post(self,n_ps,use_post=None,deviations=False,seed=None): :param use_post: The indices corresponding to coefficients for which to actually obtain samples. """ if deviations: - post = sample_MVN(n_ps,0,self.__scale,P=None,L=None,LI=self.lvi,use=use_post,seed=seed) + post = sample_MVN(n_ps,0,self.scale,P=None,L=None,LI=self.lvi,use=use_post,seed=seed) else: - post = sample_MVN(n_ps,self.__coef,self.__scale,P=None,L=None,LI=self.lvi,use=use_post,seed=seed) + post = sample_MVN(n_ps,self.coef,self.scale,P=None,L=None,LI=self.lvi,use=use_post,seed=seed) return post @@ -399,7 +399,7 @@ def __adjust_CI(self,n_ps,b,predi_mat,use_terms,alpha,seed): self.coef +- b gives point-wise interval, so for interval to be whole-interval 1-alpha% of posterior samples should fall completely within these boundaries. - From section 6.10 in Wood (2017) we have that *coef | y, lambda ~ N(coef,V), where V is self.lvi.T @ self.lvi * self.__scale. + From section 6.10 in Wood (2017) we have that *coef | y, lambda ~ N(coef,V), where V is self.lvi.T @ self.lvi * self.scale. Implication is that deviations [*coef - coef] | y, lambda ~ N(0,V). In line with definition above, 1-alpha% of predi_mat@[*coef - coef] should fall within [b,-b]. Wood (2017) suggests to find a so that [a*b,a*-b] achieves this. 
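The whole-interval adjustment described above can be sketched in a few lines: draw coefficient deviations from N(0,V), map them through the prediction matrix, and pick the factor a such that 1-alpha of the sampled curves lie entirely inside [-a*b, a*b]. The snippet below is only a dense-matrix illustration of that idea (the helper name and the explicitly formed covariance matrix ``V`` are assumptions made for the example), not the implementation used by ``__adjust_CI``.

```python
import numpy as np

def whole_interval_factor(predi_mat, V, b, n_ps=10000, alpha=0.05, seed=None):
    # Hypothetical helper: find the factor a such that [pred - a*b, pred + a*b]
    # contains 1-alpha of the posterior curves as a whole, not just point-wise.
    rng = np.random.default_rng(seed)
    # Coefficient deviations [*coef - coef] | y, lambda ~ N(0, V)
    devs = rng.multivariate_normal(np.zeros(V.shape[0]), V, size=n_ps)
    curves = devs @ predi_mat.T          # (n_ps, n_predictions)
    # Smallest per-sample factor that keeps the whole curve inside [-a*b, a*b]
    req = np.max(np.abs(curves) / b, axis=1)
    return np.quantile(req, 1 - alpha)
```

Multiplying ``b`` by the returned factor then turns the point-wise CI into an (approximately) whole-interval one.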
@@ -521,12 +521,12 @@ def predict(self, use_terms, n_dat,alpha=0.05,ci=False,whole_interval=False,n_ps state_est,use_only=use_terms) # Now we calculate the prediction - pred = predi_mat @ self.__coef + pred = predi_mat @ self.coef # Optionally calculate the boundary for a 1-alpha CI if ci: # Wood (2017) 6.10 - c = predi_mat @ self.lvi.T @ self.lvi * self.__scale @ predi_mat.T + c = predi_mat @ self.lvi.T @ self.lvi * self.scale @ predi_mat.T c = c.diagonal() b = scp.stats.norm.ppf(1-(alpha/2)) * np.sqrt(c) @@ -582,10 +582,10 @@ def predict_diff(self,dat1,dat2,use_terms,alpha=0.05,whole_interval=False,n_ps=1 pmat_diff = pmat1 - pmat2 # Predicted difference - diff = pmat_diff @ self.__coef + diff = pmat_diff @ self.coef # Difference CI - c = pmat_diff @ self.lvi.T @ self.lvi * self.__scale @ pmat_diff.T + c = pmat_diff @ self.lvi.T @ self.lvi * self.scale @ pmat_diff.T c = c.diagonal() b = scp.stats.norm.ppf(1-(alpha/2)) * np.sqrt(c) @@ -596,3 +596,229 @@ def predict_diff(self,dat1,dat2,use_terms,alpha=0.05,whole_interval=False,n_ps=1 b = self.__adjust_CI(n_ps,b,pmat_diff,use_terms,alpha,seed) return diff,b + +class GAMLSS(GAMM): + """ + Class to fit Generalized Additive Mixed Models of Location Scale and Shape. + + References: + - Rigby, R. A., & Stasinopoulos, D. M. (2005). Generalized Additive Models for Location, Scale and Shape. + - Wood, S. N., & Fasiolo, M. (2017). A generalized Fellner-Schall method for smoothing parameter optimization with application to Tweedie location, scale and shape models. https://doi.org/10.1111/biom.12666 + - Wood, S. N. (2011). Fast stable restricted maximum likelihood and marginal likelihood estimation of semiparametric generalized linear models: Estimation of Semiparametric Generalized Linear Models. https://doi.org/10.1111/j.1467-9868.2010.00749.x + - Wood, Pya, & Säfken (2016). Smoothing Parameter and Model Selection for General Smooth Models. + - Wood, S. N. (2017). Generalized Additive Models: An Introduction with R, Second Edition (2nd ed.). + + Parameters: + :param formulas: A list of formulas for the GAMMLSS model - one formula per parameter of the response distribution. + :type formulas: [Formula] + :param family: A GAMLSS family. Currently only ``GAUMLSS`` (Normal) is implemented. + :type family: GAMLSSFamily + """ + def __init__(self, formulas: [Formula], family: GAMLSSFamily): + super().__init__(formulas[0], family) + self.formulas = copy.deepcopy(formulas) # self.formula can hold formula for single parameter later on for predictions. + self.overall_lvi = None + self.__overall_coef = None + self.__overall_preds = None # etas + + + def get_pars(self): + """ Returns a list containing the coef vector for each parameter of the distribution. Will contain Nones before fitting.""" + return self.__overall_coef + + + def get_llk(self,penalized:bool=True): + """Get the (penalized) log-likelihood of the estimated model given the training data.""" + + pen = 0 + if penalized: + pen = self.penalty + if self.__overall_preds is not None: + mus = [self.family.links[i].fi(self.__overall_preds[i]) for i in range(self.family.n_par)] + return self.family.llk(self.formulas[0].y_flat[self.formulas[0].NOT_NA_flat],*mus) - pen + + return None + + def get_mmat(self): + raise NotImplementedError("Not yet supported.") + + def print_smooth_terms(self,pen_cutoff=0.2): + raise NotImplementedError("Not yet supported.") + + def get_reml(self): + raise NotImplementedError("Not yet supported.") + + def fit(self,max_outer=50,max_inner=50,min_inner=1,conv_tol=1e-7,extend_lambda=True,progress_bar=True,n_cores=10): + """ + Fit the specified model.
Additional keyword arguments not listed below should not be modified unless you really know what you are doing. + + Parameters: + + :param max_outer: The maximum number of fitting iterations. + :type max_outer: int,optional + :param max_inner: The maximum number of fitting iterations to use by the inner Newton step for coefficients. + :type max_inner: int,optional + :param min_inner: The minimum number of fitting iterations to use by the inner Newton step for coefficients. + :type min_inner: int,optional + :param conv_tol: The relative (change in penalized deviance is compared against ``conv_tol`` * previous penalized deviance) criterion used to determine convergence. + :type conv_tol: float,optional + :param extend_lambda: Whether lambda proposals should be accelerated or not. Can lower the number of new smoothing penalty proposals necessary. Enabled by default. + :type extend_lambda: bool,optional + :param progress_bar: Whether progress should be displayed (convergence info and time estimate). Defaults to True. + :type progress_bar: bool,optional + :param n_cores: Number of cores to use during parts of the estimation that can be done in parallel. Defaults to 10. + :type n_cores: int,optional + """ + + # Get y + y = self.formulas[0].y_flat[self.formulas[0].NOT_NA_flat] + + # Build penalties and model matrices for all formulas + Xs = [] + for form in self.formulas: + mod = GAMM(form,family=Gaussian()) + form.build_penalties() + Xs.append(mod.get_mmat()) + + # Fit mean model + if self.family.mean_init_fam is not None: + mean_model = GAMM(self.formulas[0],family=self.family.mean_init_fam) + mean_model.fit(progress_bar=False,restart=True) + m_coef,_ = mean_model.get_pars() + else: + m_coef = scp.stats.norm.rvs(size=self.formulas[0].n_coef).reshape(-1,1) + + # Get GAMLSS penalties + shared_penalties = embed_shared_penalties(self.formulas) + gamlss_pen = [pen for pens in shared_penalties for pen in pens] + + # Start with much weaker penalty than for GAMs + for pen_i in range(len(gamlss_pen)): + gamlss_pen[pen_i].lam = 0.01 + + # Initialize overall coefficients + form_n_coef = [form.n_coef for form in self.formulas] + coef = np.concatenate((m_coef.reshape(-1,1), + *[np.ones((self.formulas[ix].n_coef)).reshape(-1,1) for ix in range(1,self.family.n_par)])) + coef_split_idx = form_n_coef[:-1] + + coef,etas,mus,wres,LV,total_edf,term_edfs,penalty = solve_gammlss_sparse(self.family,y,Xs,form_n_coef,coef,coef_split_idx, + gamlss_pen,max_outer,max_inner,min_inner,conv_tol, + extend_lambda,progress_bar,n_cores) + + self.__overall_coef = coef + self.__overall_preds = etas + self.res = wres + self.edf = total_edf + self.term_edf = term_edfs + self.penalty = penalty + self.coef_split_idx = coef_split_idx + self.overall_lvi = LV + + + def predict(self, par, use_terms, n_dat, alpha=0.05, ci=False, whole_interval=False, n_ps=10000, seed=None): + """ + Make a prediction using the fitted model for new data ``n_dat`` using only the terms indexed by ``use_terms`` and for distribution parameter ``par``. + + References: + - Wood, S. N. (2017). Generalized Additive Models: An Introduction with R, Second Edition (2nd ed.). + - Simpson, G. (2016). Simultaneous intervals for smooths revisited. + + Parameters: + + :param par: The index corresponding to the parameter for which to make the prediction (e.g., 0 = mean) + :type par: int + :param use_terms: The indices corresponding to the terms that should be used to obtain the prediction or ``None`` + in which case all terms will be used. 
+ :type use_terms: list[int] or None + :param n_dat: A pandas DataFrame containing new data for which to make the prediction. Importantly, all variables present in + the data used to fit the model also need to be present in this DataFrame. Additionally, factor variables must only include levels + also present in the data used to fit the model. If you want to exclude a specific factor from the prediction (for example the factor + subject) don't include the terms that involve it in the ``use_terms`` argument. + :type n_dat: pd.DataFrame + :param alpha: The alpha level to use for the standard error calculation. Specifically, 1 - (``alpha``/2) will be used to determine the critical cut-off value according to a N(0,1). + :type alpha: float, optional + :param ci: Whether the standard error ``se`` for credible interval (CI; see Wood, 2017) calculation should be returned. The CI is then [``pred`` - ``se``, ``pred`` + ``se``] + :type ci: bool, optional + :param whole_interval: Whether or not to adjuste the point-wise CI to behave like whole-interval (based on Wood, 2017; section 6.10.2 and Simpson, 2016). Defaults to False. The CI is then [``pred`` - ``se``, ``pred`` + ``se``] + :type whole_interval: bool, optional + :param n_ps: How many samples to draw from the posterior in case the point-wise CI is adjusted to behave like a whole-interval CI. + :type n_ps: int, optional + :param seed: Can be used to provide a seed for the posterior sampling step in case the point-wise CI is adjusted to behave like a whole-interval CI. + :type seed: int or None, optional + + + Returns: + :return: A tuple with 3 entries. The first entry is the prediction ``pred`` based on the new data ``n_dat``. The second entry is the model + matrix built for ``n_dat`` that was post-multiplied with the model coefficients to obtain ``pred``. The third entry is ``None`` if ``ci``==``False`` else + the standard error ``se`` in the prediction. + :rtype: tuple + + """ + # Prepare so that we can just call gamm.predict() + self.formula = self.formulas[par] + split_coef = np.split(self.__overall_coef,self.coef_split_idx) + self.coef = split_coef[par] + self.scale=1 + start = 0 + + end = self.coef_split_idx[0] + for pari in range(0,par): + start = end + end += self.coef_split_idx[pari] + self.lvi = self.overall_lvi[:,start:end] + + return super().predict(use_terms, n_dat, alpha, ci, whole_interval, n_ps, seed) + + def predict_diff(self, dat1, dat2, par, use_terms, alpha=0.05, whole_interval=False, n_ps=10000, seed=None): + """ + Get the difference in the predictions for two datasets and for distribution parameter ``par``. Useful to compare a smooth estimated for + one level of a factor to the smooth estimated for another level of a factor. In that case, ``dat1`` and + ``dat2`` should only differ in the level of said factor. + + References: + - Wood, S. N. (2017). Generalized Additive Models: An Introduction with R, Second Edition (2nd ed.). + - Simpson, G. (2016). Simultaneous intervals for smooths revisited. + - ``get_difference`` function from ``itsadug`` R-package: https://rdrr.io/cran/itsadug/man/get_difference.html + + Parameters: + + :param dat1: A pandas DataFrame containing new data for which to make the prediction. Importantly, all variables present in + the data used to fit the model also need to be present in this DataFrame. Additionally, factor variables must only include levels + also present in the data used to fit the model. 
If you want to exclude a specific factor from the prediction (for example the factor + subject) don't include the terms that involve it in the ``use_terms`` argument. + :type n_dat: pd.DataFrame + :param dat2: A second pandas DataFrame for which to also make a prediction. The difference in the prediction between this ``dat1`` will be returned. + :type dat2: pd.DataFrame + :param par: The index corresponding to the parameter for which to make the prediction (e.g., 0 = mean) + :type par: int + :param use_terms: The indices corresponding to the terms that should be used to obtain the prediction or ``None`` + in which case all terms will be used. + :type use_terms: list[int] or None + :param alpha: The alpha level to use for the standard error calculation. Specifically, 1 - (``alpha``/2) will be used to determine the critical cut-off value according to a N(0,1). + :type alpha: float, optional + :param whole_interval: Whether or not to adjuste the point-wise CI to behave like whole-interval (based on Wood, 2017; section 6.10.2 and Simpson, 2016). Defaults to False. + :type whole_interval: bool, optional + :param n_ps: How many samples to draw from the posterior in case the point-wise CI is adjusted to behave like a whole-interval CI. + :type n_ps: int, optional + :param seed: Can be used to provide a seed for the posterior sampling step in case the point-wise CI is adjusted to behave like a whole-interval CI. + :type seed: int or None, optional + + Returns: + :return: A tuple with 2 entries. The first entry is the predicted difference (between the two data sets ``dat1`` & ``dat2``) ``diff``. The second entry is the standard error ``se`` of the predicted difference. The difference CI is then [``diff`` - ``se``, ``diff`` + ``se``] + :rtype: tuple + """ + # Prepare so that we can just call gamm.predict_diff() + self.formula = self.formulas[par] + split_coef = np.split(self.__overall_coef,self.coef_split_idx) + self.coef = split_coef[par] + self.scale=1 + start = 0 + + end = self.coef_split_idx[0] + for pari in range(0,par): + start = end + end += self.coef_split_idx[pari] + self.lvi = self.overall_lvi[:,start:end] + + return super().predict_diff(dat1, dat2, use_terms, alpha, whole_interval, n_ps, seed) \ No newline at end of file diff --git a/src/mssm/src/python/exp_fam.py b/src/mssm/src/python/exp_fam.py index bae08ba..f8de40d 100644 --- a/src/mssm/src/python/exp_fam.py +++ b/src/mssm/src/python/exp_fam.py @@ -52,6 +52,34 @@ def dy1(self,mu): """ return 1 / ((1 - mu) * mu) +class Identity(Link): + def f(self, mu): + # Canonical link for normal distribution + # mu = eta + return mu + + def fi(self,eta): + return eta + + def dy1(self,mu): + return np.ones_like(mu) + + def dy2(self,mu): + return np.zeros_like(mu) + +class LOG(Link): + # Log link + def f(self,mu): + return np.log(mu) + + def fi(self,eta): + return np.exp(eta) + + def dy1(self,mu): + return 1/mu + + def dy2(self,mu): + return -1*(1/np.power(mu,2)) def est_scale(res,rows_X,total_edf): """ @@ -161,4 +189,38 @@ def deviance(self,y,mu): # Based on Faraway (2016) res = y - mu rss = res.T @ res - return rss[0,0] \ No newline at end of file + return rss[0,0] + +class GAMLSSFamily: + def __init__(self,pars:int,links:[Link]) -> None: + self.n_par = pars + self.links = links + self.d1 = [] # list with functions to evaluate derivative of llk with respect to corresponding mean + self.d2 = [] # list with function to evaluate pure second derivative of llk with respect to corresponding mean + self.d2m = [] # list with functions to evaluate mixed 
second derivative of llk. Order is 12,13,1k,23,24,... + self.mean_init_fam:Family or None = None # Family to fit for the mean model to initialize coef. + + def llk(self,y,*args): + # log-likelihood of y under this family + pass + + def lp(self,y,*args): + # Log-probability of observing every value in y under this family + pass + +class GAUMLSS(GAMLSSFamily): + def __init__(self, links: [Link]) -> None: + super().__init__(2, links) + + # All derivatives taken from gamlss.dist: https://github.com/gamlss-dev/gamlss.dist + # see also: Rigby, R. A., & Stasinopoulos, D. M. (2005). Generalized Additive Models for Location, Scale and Shape. + self.d1 = [lambda y, mu, sigma: (1/np.power(sigma,2))*(y-mu),lambda y, mu, sigma: (np.power(y-mu,2)-np.power(sigma,2))/(np.power(sigma,3))] + self.d2 = [lambda y, mu, sigma: -(1/np.power(sigma,2)), lambda y, mu, sigma: -(2/np.power(sigma,2))] + self.d2m = [lambda y, mu, sigma: np.zeros_like(y)] + self.mean_init_fam = Gaussian() + + def lp(self,y,mu,sigma): + return scp.stats.norm.logpdf(y,loc=mu,scale=sigma) + + def llk(self,y,mu,sigma): + return sum(self.lp(y,mu,sigma))[0] \ No newline at end of file diff --git a/src/mssm/src/python/formula.py b/src/mssm/src/python/formula.py index f1f3946..cba91f9 100644 --- a/src/mssm/src/python/formula.py +++ b/src/mssm/src/python/formula.py @@ -2353,6 +2353,48 @@ def embed_in_Sj_sparse(pen_data,pen_rows,pen_cols,Sj,SJ_col): return Sj +def embed_shared_penalties(formulas): + """ + Embed penalties from individual model into overall penalties for GAMLSS models. + """ + + shared_penalties = [copy.deepcopy(form.penalties) for form in formulas] + + for fi,form in enumerate(formulas): + for ofi,other_form in enumerate(formulas): + if fi == ofi: + continue + + if ofi < fi: + for lterm in shared_penalties[fi]: + lterm.S_J_emb = scp.sparse.vstack([scp.sparse.csc_array((other_form.n_coef,form.n_coef)), + lterm.S_J_emb]) + lterm.D_J_emb = scp.sparse.vstack([scp.sparse.csc_array((other_form.n_coef,form.n_coef)), + lterm.D_J_emb]) + + lterm.S_J_emb = scp.sparse.hstack([scp.sparse.csc_array((lterm.S_J_emb.shape[0],other_form.n_coef)), + lterm.S_J_emb]) + + lterm.D_J_emb = scp.sparse.hstack([scp.sparse.csc_array((lterm.S_J_emb.shape[0],other_form.n_coef)), + lterm.D_J_emb]) + + lterm.start_index += other_form.n_coef + + elif ofi > fi: + for lterm in shared_penalties[fi]: + lterm.S_J_emb = scp.sparse.vstack([lterm.S_J_emb, + scp.sparse.csc_array((other_form.n_coef,form.n_coef))]) + + lterm.D_J_emb = scp.sparse.vstack([lterm.D_J_emb, + scp.sparse.csc_array((other_form.n_coef,form.n_coef))]) + + lterm.S_J_emb = scp.sparse.hstack([lterm.S_J_emb, + scp.sparse.csc_array((lterm.S_J_emb.shape[0],other_form.n_coef))]) + + lterm.D_J_emb = scp.sparse.hstack([lterm.D_J_emb, + scp.sparse.csc_array((lterm.S_J_emb.shape[0],other_form.n_coef))]) + + return shared_penalties def build_linear_term_matrix(ci,n_y,has_intercept,lti,lterm,var_types,var_map,factor_levels,ridx,cov_flat,use_only): """Parameterize model matrix for a linear term.""" diff --git a/src/mssm/src/python/gamm_solvers.py b/src/mssm/src/python/gamm_solvers.py index da95e4d..4c1e453 100644 --- a/src/mssm/src/python/gamm_solvers.py +++ b/src/mssm/src/python/gamm_solvers.py @@ -1,7 +1,7 @@ import numpy as np import scipy as scp import warnings -from .exp_fam import Family,Gaussian,est_scale +from .exp_fam import Family,Gaussian,est_scale,GAMLSSFamily from .penalties import PenType,id_dist_pen,translate_sparse,dataclass from .formula import 
build_sparse_matrix_from_formula,setup_cache,clear_cache,cpp_solvers,pd,Formula,mp,repeat,os,map_csc_to_eigen,math,tqdm,sys from functools import reduce @@ -1503,3 +1503,366 @@ def solve_gamm_sparse2(formula:Formula,penalties,col_S,family:Family, clear_cache(CACHE_DIR,SHOULD_CACHE) return coef,eta,wres,scale,InvCholXXS,total_edf,term_edfs,penalty,fit_info + + +################################################ GAMMLSS code ################################################ + +def deriv_transform_mu_eta(y,means,family:GAMLSSFamily): + """ + Compute derivatives (first and second order) of llk with respect to each mean for all observations following steps outlined by Wood, Pya, & Säfken (2016) + + References: + - Wood, Pya, & Säfken (2016). Smoothing Parameter and Model Selection for General Smooth Models. + """ + d1 = [fd1(y,*means) for fd1 in family.d1] + d2 = [fd2(y,*means) for fd2 in family.d2] + d2m = [fd2m(y,*means) for fd2m in family.d2m] + + # Link derivatives + ld1 = [family.links[mui].dy1(means[mui]) for mui in range(len(means))] + ld2 = [family.links[mui].dy2(means[mui]) for mui in range(len(means))] + + # Transform first order derivatives via A.1 in Wood, Pya, & Säfken (2016) + """ + WPS (2016) provide that $l_{\eta}$ is obtained as $l^i_{\mu}/h'(\mu^i)$ - where $h'$ is the derivative of the link function $h$. + This follows from applying the chain rule and the inversion rule of derivatives + $\frac{\partial llk(h^{-1}(\eta))}{\partial \eta} = \frac{\partial llk(\mu)}{\partial \mu} \frac{\partial h^{-1}(\eta)}{\partial \eta} = \frac{\partial llk(\mu)}{\partial \mu}\frac{1}{\frac{\partial h(\mu)}{\mu}}$. + """ + d1eta = [d1[mui]/ld1[mui] for mui in range(len(means))] + + # Pure second order derivatives are transformed via A.2 in WPS (2016) + """ + For second derivatives we need pure and mixed. Computation of $l^\mathbf{i}_{\eta^l,\eta^m}$ in general is again obtained by applying the steps outlined for first order and provided by WPS (2016) + for the pure case. For the mixed case it is even simpler: We need $\frac{\partial^2 llk(h_1^{-1}(\eta^1),h_2^{-1}(\eta^2))}{\partial \eta^1 \partial \eta^2}$, + which is $\frac{\partial llk /\ \partial \eta^1}{\partial \eta^2}$. With the first partial being equal to + $\frac{\partial llk}{\partial \mu^1}\frac{1}{\frac{\partial h_1(\mu^1)}{\mu^1}}$ (see comment above for first order) we now have + $\frac{\partial \frac{\partial llk}{\partial \mu^1}\frac{1}{\frac{\partial h_1(\mu^1)}{\mu^1}}}{\partial \eta^2}$. + + We now apply the product rule (the second term in the sum disappears, because + $\partial \frac{1}{\frac{\partial h_1(\mu^1)}{\mu^1}} /\ \partial \eta^2 = 0$, this is not the case for pure second derivatives as shown in WPS, 2016) + to get $\frac{\partial \frac{\partial llk}{\partial \mu^1}}{\partial \eta^2} \frac{1}{\frac{\partial h_1(\mu^1)}{\mu^1}}$. + We can now again rely on the same steps taken to get the first derivatives (chain rule + inversion rule) to get + $\frac{\partial \frac{\partial llk}{\partial \mu^1}}{\partial \eta^2} = + \frac{\partial^2 llk}{\partial \mu^1 \partial \mu^2}\frac{1}{\frac{\partial h_2(\mu^2)}{\mu^2}}$. + + Thus, $\frac{\partial llk /\ \partial \eta^1}{\partial \eta^2} = + \frac{\partial^2 llk}{\partial \mu^1 \partial \mu^2}\frac{1}{\frac{\partial h_2(\mu^2)}{\mu^2}}\frac{1}{\frac{\partial h_1(\mu^1)}{\mu^1}}$. 
+ """ + d2eta = [d2[mui]/np.power(ld1[mui],2) - d1[mui]*ld2[mui]/np.power(ld1[mui],3) for mui in range(len(means))] + + # Mixed second derivatives thus also are transformed as proposed by WPS (2016) + d2meta = [] + mixed_idx = 0 + for mui in range(len(means)): + for muj in range(len(means)): + if muj <= mui: + continue + + d2meta.append(d2m[mixed_idx] * (1/ld1[mui]) * (1/ld1[muj])) + mixed_idx += 1 + + return d1eta,d2eta,d2meta + +def deriv_transform_eta_beta(d1eta,d2eta,d2meta,Xs): + """ + Further transforms derivatives of llk with respect to eta to get derivatives of llk with respect to coefficients + Based on section 3.2 and Appendix A in Wood, Pya, & Säfken (2016) + + References: + - Wood, Pya, & Säfken (2016). Smoothing Parameter and Model Selection for General Smooth Models. + """ + + # Gradient: First order partial derivatives of llk with respect to coefficients + """ + WPS (2016) provide $l^j_{\beta^l} = l^\mathbf{i}_{\eta^l}\mathbf{X}^l_{\mathbf{i},j}$. See ```deriv_transform_mu_eta```. + """ + grad = [] + + for etai in range(len(d1eta)): + # Naive: + # for j in range(Xs[etai].shape[1]): + # grad.append(np.sum([d1eta[etai][i]*Xs[etai][i,j] for i in range(Xs[etai].shape[0])])) + # but this is just product of d1eta[etai][:] and Xs[etai], so we can skip inner loop + grad.extend((d1eta[etai].T @ Xs[etai]).T) + + # Hessian: Second order partial derivatives of llk with respect to coefficients + """ + WPS (2016) provide in general: + $l^{j,k}_{\beta^l,\beta^m} = l^\mathbf{i}_{\eta^l,\eta^m}\mathbf{X}^l_{\mathbf{i},j}\mathbf{X}^m_{\mathbf{i},k}$. + See ```deriv_transform_mu_eta```. + """ + + mixed_idx = 0 + + hr_idx = 0 + h_rows = [] + h_cols = [] + h_vals = [] + for etai in range(len(d1eta)): + hc_idx = 0 + for etaj in range(len(d1eta)): + + if etaj < etai: + hc_idx += Xs[etaj].shape[1] + continue + + if etai == etaj: + # Pure 2nd + d2 = d2eta[etai] + else: + # Mixed partial + d2 = d2meta[mixed_idx] + mixed_idx += 1 + + + for coefi in range(Xs[etai].shape[1]): + for coefj in range(Xs[etaj].shape[1]): + + if hc_idx+coefj < hr_idx+coefi: + continue + + # Naive: + # d2beta = np.sum([d2[i]*Xs[etai][i,coefi]*Xs[etaj][i,coefj] for i in range(Xs[etai].shape[0])]) + # But this is again just a dot product, preceded by element wise multiplication. In principle we + # could even skip these loops but that might get a bit tricky with sparse matrix set up- for now + # I just leave it like this... + d2beta = ((d2*Xs[etai][:,[coefi]]).T @ Xs[etaj][:,[coefj]])[0,0] + + h_rows.append(hr_idx+coefi) + h_cols.append(hc_idx+coefj) + h_vals.append(d2beta) + if hr_idx+coefi != hc_idx+coefj: # Symmetric 2nd deriv.. + h_rows.append(hc_idx+coefj) + h_cols.append(hr_idx+coefi) + h_vals.append(d2beta) + + hc_idx += Xs[etaj].shape[1] + hr_idx += Xs[etaj].shape[1] + + hessian = scp.sparse.csc_array((h_vals,(h_rows,h_cols))) + return np.array(grad).reshape(-1,1),hessian + +def newton_coef_smooth(coef,grad,H,S_emb): + """ + Follows sections 3.1.2 and 3.14 in WPS (2016) to update the coefficients of the GAMLSS model via a + newton step. + 1) Computes gradient of the penalized likelihood (grad - S_emb@coef) + 2) Computes negative Hessian of the penalized likelihood (-1*H + S_emb) and it's inverse. + 3) Uses these two to compute the Netwon step. + 4) Step size control - happens outside + + References: + - Wood, Pya, & Säfken (2016). Smoothing Parameter and Model Selection for General Smooth Models. 
+ """ + + pgrad = np.array([grad[i] - (S_emb[i,:]@coef)[0] for i in range(len(grad))]) + nH = -1*H + S_emb + + # Diagonal pre-conditioning as suggested by WPS (2016) + D = scp.sparse.diags(np.abs(nH.diagonal())**0.5) + nH2 = (D@nH@D).tocsc() + + # Compute V, inverse of nH + eps = 0 + code = 1 + while code != 0: + + Lp, Pr, code = cpp_cholP(nH2+eps*scp.sparse.identity(nH2.shape[1],format='csc')) + LVp = compute_Linv(Lp,10) + LV = apply_eigen_perm(Pr,LVp) + V = LV.T @ LV + + if code == 0: + continue + + if eps == 0: + eps += 1e-14 + else: + eps *= 2 + + # Undo conditioning. + V = D@V@D + + # Update coef + n_coef = coef + (V@pgrad) + + return n_coef,V,eps + +def correct_coef_step_gen_smooth(family,y,Xs,coef,next_coef,coef_split_idx,c_llk,S_emb): + """ + Apply step size correction to Newton update for general smooth + models and GAMLSS models, as discussed by WPS (2016). + + References: + - Wood, Pya, & Säfken (2016). Smoothing Parameter and Model Selection for General Smooth Models. + """ + # Update etas and mus + next_split_coef = np.split(next_coef,coef_split_idx) + next_etas = [Xs[i]@next_split_coef[i] for i in range(family.n_par)] + next_mus = [family.links[i].fi(next_etas[i]) for i in range(family.n_par)] + + # Step size control for newton step. + next_llk = family.llk(y,*next_mus) + + # Evaluate improvement of penalized llk under new and old coef - but in both + # cases for current lambda (see Wood, Li, Shaddick, & Augustin; 2017) + next_pen_llk = next_llk - next_coef.T@S_emb@next_coef + prev_llk_cur_pen = c_llk - coef.T@S_emb@coef + n_checks = 0 + while next_pen_llk < prev_llk_cur_pen: + if n_checks > 30: + next_coef = coef + + # Half it if we do not observe an increase in penalized likelihood (WPS, 2016) + next_coef = (coef + next_coef)/2 + next_split_coef = np.split(next_coef,coef_split_idx) + + # Update etas and mus again + next_etas = [Xs[i]@next_split_coef[i] for i in range(family.n_par)] + + next_mus = [family.links[i].fi(next_etas[i]) for i in range(family.n_par)] + + # Step size control for newton step. Half it if we do not + # observe an increase in penalized likelihood (WPS, 2016) + next_llk = family.llk(y,*next_mus) + next_pen_llk = next_llk - next_coef.T@S_emb@next_coef + n_checks += 1 + + return next_coef,next_split_coef,next_mus,next_etas,next_llk,next_pen_llk + + +def solve_gammlss_sparse(family,y,Xs,form_n_coef,coef,coef_split_idx,gamlss_pen, + max_outer=50,max_inner=30,min_inner=1,conv_tol=1e-7, + extend_lambda=True,progress_bar=True,n_c=10): + """ + Fits a GAMLSS model, following steps outlined by Wood, Pya, & Säfken (2016). + + References: + - Wood, Pya, & Säfken (2016). Smoothing Parameter and Model Selection for General Smooth Models. 
+ """ + # total number of coefficients + n_coef = np.sum(form_n_coef) + + extend_by = initialize_extension("nesterov",gamlss_pen) + was_extended = [False for _ in enumerate(gamlss_pen)] + + split_coef = np.split(coef,coef_split_idx) + + # Initialize etas and mus + etas = [Xs[i]@split_coef[i] for i in range(family.n_par)] + mus = [family.links[i].fi(etas[i]) for i in range(family.n_par)] + + # Build current penalties + S_emb,S_pinv,FS_use_rank = compute_S_emb_pinv_det(n_coef,gamlss_pen,"svd") + c_llk = family.llk(y,*mus) + c_pen_llk = c_llk - coef.T@S_emb@coef + + iterator = range(max_outer) + if progress_bar: + iterator = tqdm(iterator,desc="Fitting",leave=True) + + for outer in iterator: + + # Update coefficients: + for inner in range(max_inner): + + # Get derivatives with respect to eta + d1eta,d2eta,d2meta = deriv_transform_mu_eta(y,mus,family) + + # Get derivatives with respect to coef + grad,H = deriv_transform_eta_beta(d1eta,d2eta,d2meta,Xs) + + # Update coef and perform step size control + if outer > 0 or inner > 0: + # Update Coefficients + next_coef,V,eps = newton_coef_smooth(coef,grad,H,S_emb) + + # Prepare to check convergence + prev_llk_cur_pen = c_llk - coef.T@S_emb@coef + + # Perform step length control + coef,split_coef,mus,etas,c_llk,c_pen_llk = correct_coef_step_gen_smooth(family,y,Xs,coef,next_coef,coef_split_idx,c_llk,S_emb) + + if np.abs(c_pen_llk - prev_llk_cur_pen) < conv_tol*np.abs(c_pen_llk): + break + + if eps <= 0 and outer > 0 and inner >= (min_inner-1): + break # end inner loop and immediately optimize lambda again. + else: + # Simply accept next coef step on first iteration + coef,V,_ = newton_coef_smooth(coef,grad,H,S_emb) + split_coef = np.split(coef,coef_split_idx) + etas = [Xs[i]@split_coef[i] for i in range(family.n_par)] + mus = [family.links[i].fi(etas[i]) for i in range(family.n_par)] + c_llk = family.llk(y,*mus) + c_pen_llk = c_llk - coef.T@S_emb@coef + + # Check overall convergence + if outer > 0: + + if progress_bar: + iterator.set_description_str(desc="Fitting - Conv.: " + "{:.2e}".format((np.abs(prev_pen_llk - c_pen_llk) - conv_tol*np.abs(c_pen_llk))[0,0]), refresh=True) + + if np.abs(prev_pen_llk - c_pen_llk) < conv_tol*np.abs(c_pen_llk): + if progress_bar: + iterator.set_description_str(desc="Converged!", refresh=True) + iterator.close() + break + + # We need the penalized likelihood of the model at this point for convergence control (step 2 in Wood, 2017 based on step 4 in Wood, Goude, & Shaw, 2016) + prev_pen_llk = c_pen_llk + + # Compute cholesky of V without pre-conditioning. + LVp, Pr, code = cpp_cholP(scp.sparse.csc_array(V)) + if code != 0: + raise ValueError("Failed to solve cholesky of V.") + LV = apply_eigen_perm(Pr,LVp).T + + # Given new coefficients compute lgdetDs and bsbs - needed for EFS step + lgdetDs = [] + bsbs = [] + lam_delta = [] + for lti,lTerm in enumerate(gamlss_pen): + + lt_rank = None + if FS_use_rank[lti]: + lt_rank = lTerm.rank + + lgdetD,bsb = compute_lgdetD_bsb(lt_rank,lTerm.lam,S_pinv,lTerm.S_J_emb,coef) + dLam = step_fellner_schall_sparse(lgdetD,(LV@lTerm.D_J_emb).power(2).sum(),bsb[0,0],lTerm.lam,1) + if extend_lambda: + dLam,extend_by,was_extended = extend_lambda_step(lti,lTerm.lam,dLam,extend_by,was_extended,"nesterov") + lTerm.lam += dLam + + lam_delta.append(dLam) + lgdetDs.append(lgdetD) + bsbs.append(bsb) + + # Compute approximate!!! gradient of REML with respect to lambda + # to check if step size needs to be reduced (part of step 6 in Wood, 2017). 
+ total_edf,term_edfs, Bs = calculate_edf(None,None,LV,gamlss_pen,lgdetDs,n_coef,n_c) + lam_grad = [grad_lambda(lgdetDs[lti],Bs[lti],bsbs[lti],1) for lti in range(len(gamlss_pen))] + lam_grad = np.array(lam_grad).reshape(-1,1) + check = lam_grad.T @ lam_delta + + # Step size control for smoothing parameters. Not obvious, because we only have an approximate REML score + # and an approximate derivative - we drop the last term involving the derivative of the negative penalized + # Hessian with respect to the smoothing parameters (see section 4 in Wood & Fasiolo, 2017). However, what we + # can do is at least undo the acceleration if we overshoot on the approximate derivative... + if check[0] < 0: + for lti,lTerm in enumerate(gamlss_pen): + if was_extended[lti]: + lTerm.lam -= extend_by["acc"][lti] + + # Build new penalties + S_emb,S_pinv,FS_use_rank = compute_S_emb_pinv_det(n_coef,gamlss_pen,"svd") + + #print([lterm.lam for lterm in gamlss_pen]) + + # "Residuals" + wres = y - Xs[0]@split_coef[0] + + # Total penalty + penalty = coef.T@S_emb@coef + + return coef,etas,mus,wres,LV,total_edf,term_edfs,penalty \ No newline at end of file
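As a sanity check on the chain-rule transform implemented in ``deriv_transform_mu_eta``, the first-order identity $l_\eta = l_\mu / h'(\mu)$ can be verified numerically for the log-linked standard deviation of the ``GAUMLSS`` family. The values below are arbitrary and the snippet is purely illustrative of the step described in the comments above.

```python
import numpy as np
from scipy import stats

# Check l_eta = l_mu / h'(mu) for the sigma parameter of a Normal model (log link).
y, mu = 0.3, 0.0
eta = 1.2                    # linear predictor for sigma
sigma = np.exp(eta)          # inverse of the log link

# d llk / d sigma, as in GAUMLSS.d1[1]
d1_sigma = ((y - mu)**2 - sigma**2) / sigma**3
# Transform to the eta scale: divide by h'(sigma) = 1/sigma for the log link
d1_eta = d1_sigma / (1 / sigma)

# Central finite difference of the log-likelihood with respect to eta
h = 1e-6
num = (stats.norm.logpdf(y, mu, np.exp(eta + h)) -
       stats.norm.logpdf(y, mu, np.exp(eta - h))) / (2 * h)

print(np.isclose(d1_eta, num))  # True
```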
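To close, a hypothetical end-to-end sketch of how the new ``GAMLSS`` class could be used with the ``GAUMLSS`` family added in this diff. The simulated data, the term choices, and the exact ``Formula`` constructor call are assumptions made for illustration and may differ from the released API.

```python
import numpy as np
import pandas as pd

from mssm.models import GAMLSS
from mssm.src.python.exp_fam import GAUMLSS, Identity, LOG
from mssm.src.python.formula import Formula, lhs
from mssm.src.python.terms import i, f

# Simulated data: mean and standard deviation of y both vary smoothly with x.
rng = np.random.default_rng(1)
x = rng.uniform(0, 1, 500)
y = np.sin(2 * np.pi * x) + rng.normal(0, np.exp(-1 + x), 500)
dat = pd.DataFrame({"y": y, "x": x})

# One formula per distribution parameter (here: mean and standard deviation),
# each with an intercept and a smooth of x.
form_mean = Formula(lhs("y"), [i(), f(["x"])], data=dat)
form_sd = Formula(lhs("y"), [i(), f(["x"])], data=dat)

# Identity link for the mean, log link to keep the standard deviation positive.
family = GAUMLSS([Identity(), LOG()])

model = GAMLSS([form_mean, form_sd], family)
model.fit()

# Prediction of the mean (par=0) over a grid of new data.
new_dat = pd.DataFrame({"x": np.linspace(0, 1, 100)})
mu_hat, X_pred, se = model.predict(0, None, new_dat, ci=True)
```

Passing ``par=1`` instead would return the prediction for the standard deviation on the scale of its linear predictor (the log scale here).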