diff --git a/build/lib/scientisttools/__init__.py b/build/lib/scientisttools/__init__.py
deleted file mode 100644
index 67370a0..0000000
--- a/build/lib/scientisttools/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-from scientisttools.version import __version__
-
-__name__ = "scientisttools"
-__author__ = 'Duverier DJIFACK ZEBAZE'
-__email__ = 'duverierdjifack@gmail.com'
\ No newline at end of file
diff --git a/build/lib/scientisttools/clustering.py b/build/lib/scientisttools/clustering.py
deleted file mode 100644
index e69de29..0000000
diff --git a/build/lib/scientisttools/dashboard.py b/build/lib/scientisttools/dashboard.py
deleted file mode 100644
index 8b13789..0000000
--- a/build/lib/scientisttools/dashboard.py
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/build/lib/scientisttools/datasets.py b/build/lib/scientisttools/datasets.py
deleted file mode 100644
index e69de29..0000000
diff --git a/build/lib/scientisttools/decomposition.py b/build/lib/scientisttools/decomposition.py
deleted file mode 100644
index fe4c0e1..0000000
--- a/build/lib/scientisttools/decomposition.py
+++ /dev/null
@@ -1,3396 +0,0 @@
-# -*- coding: utf-8 -*-
-
-##################################### Load libraries
-import functools
-from functools import reduce
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-from mapply.mapply import mapply
-import pingouin as pg
-import statsmodels.formula.api as smf
-from scipy.spatial.distance import pdist,squareform
-from scipy import linalg
-from scipy.sparse import issparse
-import scipy.stats as st
-import scipy as sp
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.utils.validation import check_array
-from sklearn.metrics import mean_squared_error
-from scientisttools.pyplot import plotPPCA,plotCA,plotPCA,plotMCA
-from scientisttools.utils import (
-    orthonormalize,
-    random_orthonormal,
-    weighted_mean,
-    solve_weighted,
-    check_array_with_weights,
-    global_kmo_index,
-    per_item_kmo_index,
-    from_dummies)
-
-####################################################################################################
-#           PRINCIPAL COMPONENTS ANALYSIS (PCA)
-####################################################################################################
-
-class PCA(BaseEstimator,TransformerMixin):
-    """Principal Component Analysis
-
-    This class inherits from the sklearn BaseEstimator and TransformerMixin classes.
-
-    This is a standard Principal Component Analysis implementation
-    based on the Singular Value Decomposition.
-
-    Performs Principal Component Analysis (PCA) with supplementary
-    individuals, supplementary quantitative variables and supplementary
-    categorical variables.
-
-    Parameters
-    ----------
-    normalize : bool, default = True
-        - If True : the data are scaled to unit variance.
-        - If False : the data are not scaled to unit variance.
-
-    n_components : int, float or None, default = None
-        Number of components to keep.
-        - If n_components is None, keep all the components.
-        - If 0 <= n_components < 1, select the number of components such
-          that the amount of variance that needs to be explained is
-          greater than the percentage specified by n_components.
-        - If 1 <= n_components :
-            - If n_components is int, select a number of components
-              equal to n_components.
-            - If n_components is float, select the largest number of
-              components less than n_components.
-
-    row_labels : array of strings or None, default = None
-        - If row_labels is an array of strings : this array provides the
-          row labels.
-          If the shape of the array doesn't match with the number of
-          rows : labels are automatically computed for each row.
-        - If row_labels is None : labels are automatically computed for
-          each row.
-
-    col_labels : array of strings or None, default = None
-        - If col_labels is an array of strings : this array provides the
-          column labels.
-          If the shape of the array doesn't match with the number of
-          columns : labels are automatically computed for each column.
-        - If col_labels is None : labels are automatically computed for
-          each column.
-
-    row_sup_labels : array of strings or None, default = None
-        This array provides the supplementary individuals labels.
-
-    quanti_sup_labels : array of strings or None, default = None
-        This array provides the supplementary quantitative variables labels.
-
-    quali_sup_labels : array of strings or None, default = None
-        This array provides the supplementary categorical variables labels.
-
-    graph : bool, default = False
-        If True a graph is displayed.
-
-    figsize : tuple of int, default = None
-        Width, height in inches.
-
-    Attributes
-    ----------
-    n_components_ : int
-        The estimated number of components.
-
-    row_labels_ : array of strings
-        Labels for the rows.
-
-    col_labels_ : array of strings
-        Labels for the columns.
-
-    row_sup_labels_ : array of strings or None
-        Labels of the supplementary individuals.
-
-    quanti_sup_labels_ : array of strings or None
-        Labels of the supplementary quantitative variables.
-
-    quali_sup_labels_ : array of strings or None
-        Labels of the supplementary categorical variables.
-
-    mod_sup_labels_ : list of strings
-        Labels of the supplementary categories.
-
-    short_sup_labels_ : list of strings
-        Short labels of the supplementary categories.
-
-    eig_ : array of float
-        A 4 x n_components_ matrix containing all the eigenvalues
-        (1st row), the difference (2nd row), the percentage of variance
-        (3rd row) and the cumulative percentage of variance (4th row).
-
-    eigen_vectors_ : array of float
-        Eigenvectors extracted from the Principal Components Analysis.
-
-    row_coord_ : ndarray of shape (n_rows,n_components_)
-        A n_rows x n_components_ matrix containing the row coordinates.
-
-    col_coord_ : ndarray of shape (n_columns,n_components_)
-        A n_columns x n_components_ matrix containing the column coordinates.
-
-    row_contrib_ : ndarray of shape (n_rows,n_components_)
-        A n_rows x n_components_ matrix containing the row contributions.
-
-    col_contrib_ : ndarray of shape (n_columns,n_components_)
-        A n_columns x n_components_ matrix containing the column contributions.
-
-    row_cos2_ : ndarray of shape (n_rows,n_components_)
-        A n_rows x n_components_ matrix containing the row cosines.
-
-    col_cos2_ : ndarray of shape (n_columns,n_components_)
-        A n_columns x n_components_ matrix containing the column cosines.
-
-    col_cor_ : ndarray of shape (n_columns,n_components_)
-        A n_columns x n_components_ matrix containing the correlations
-        between variables (= columns) and axes.
-
-    means_ : ndarray of shape (n_columns,)
-        The mean for each variable (= for each column).
-
-    std_ : ndarray of shape (n_columns,)
-        The standard deviation for each variable (= for each column).
-
-    ss_col_coord_ : ndarray of shape (n_columns,)
-        The sum of squares of the column coordinates.
-
-    model_ : string
-        The model fitted = 'pca'
-    """
-
-    def __init__(self,
-                 normalize=True,
-                 n_components=None,
-                 row_labels=None,
-                 col_labels=None,
-                 row_sup_labels=None,
-                 quanti_sup_labels=None,
-                 quali_sup_labels=None,
-                 graph=False,
-                 figsize=None):
-        self.normalize = normalize
-        self.n_components = n_components
-        self.row_labels = row_labels
-        self.col_labels = col_labels
-        self.row_sup_labels = row_sup_labels
-        self.quanti_sup_labels = quanti_sup_labels
-        self.quali_sup_labels = quali_sup_labels
-        self.graph = graph
-        self.figsize = figsize
-
-    def fit(self,X,y=None):
-        """Fit the model to X
-
-        Parameters
-        ----------
-        X : DataFrame of float, shape (n_rows, n_columns)
-
-        y : None
-            y is ignored
-
-        Returns
-        -------
-        self : object
-            Returns the instance itself
-        """
-
-        # Check if sparse matrix
-        if issparse(X):
-            raise TypeError("PCA does not support sparse input.")
-        # Check if X is an instance of pd.DataFrame class
-        elif not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        # Extract supplementary rows
-        self.row_sup_labels_ = self.row_sup_labels
-        if self.row_sup_labels_ is not None:
-            _X = X.drop(index = self.row_sup_labels_)
-            row_sup = X.loc[self.row_sup_labels_,:]
-        else:
-            _X = X
-
-        # Extract supplementary numeric or categorical columns
-        self.quanti_sup_labels_ = self.quanti_sup_labels
-        self.quali_sup_labels_ = self.quali_sup_labels
-        if ((self.quali_sup_labels_ is not None) and (self.quanti_sup_labels_ is not None)):
-            X_ = _X.drop(columns = self.quali_sup_labels_).drop(columns = self.quanti_sup_labels_)
-            if self.row_sup_labels_ is not None:
-                row_sup = row_sup.drop(columns = self.quali_sup_labels_).drop(columns = self.quanti_sup_labels_)
-        elif self.quali_sup_labels_ is not None:
-            X_ = _X.drop(columns = self.quali_sup_labels_)
-            if self.row_sup_labels_ is not None:
-                row_sup = row_sup.drop(columns = self.quali_sup_labels_)
-        elif self.quanti_sup_labels_ is not None:
-            X_ = _X.drop(columns = self.quanti_sup_labels_)
-            if self.row_sup_labels_ is not None:
-                row_sup = row_sup.drop(columns = self.quanti_sup_labels_)
-        else:
-            X_ = _X
-
-        # Store data and active data
-        self.data_ = X
-        self.active_data_ = X_
-
-        # Supplementary initialization
-        self.row_sup_coord_ = None
-        self.row_sup_cos2_ = None
-
-        # Additional continuous variables
-        self.col_sup_coord_ = None
-        self.col_sup_cos2_ = None
-        self.col_sup_ftest_ = None
-
-        # Additional categories
-        self.mod_sup_coord_ = None
-        self.mod_sup_cos2_ = None
-        self.mod_sup_disto_ = None
-        self.mod_sup_vtest_ = None
-
-        # Additional categorical variables
-        self.quali_sup_eta2_ = None
-
-        # Pearson correlation
-        self.col_corr_ = np.array(X_.corr(method="pearson"))
-
-        # Partial correlation between variables
-        self.col_pcorr_ = np.array(X_.pcorr())
-
-        # Compute SVD
-        self._computed_svd(X_.values)
-
-        # Compute supplementary quantitative variables statistics
-        if self.quanti_sup_labels_ is not None:
-            self._compute_quanti_sup_stats(_X[self.quanti_sup_labels_])
-
-        # Compute supplementary qualitative variables statistics
-        if self.quali_sup_labels_ is not None:
-            self._compute_quali_sup_stats(X=_X[self.quali_sup_labels_])
-
-        # Compute supplementary rows statistics
-        if self.row_sup_labels_ is not None:
-            self._compute_row_sup_stats(X=row_sup)
-
-        if self.graph:
-            fig, axe = plt.subplots(1,2,figsize=self.figsize)
-            plotPCA(self,choice="ind",repel=True,ax=axe[0])
-            plotPCA(self,choice="var",repel=True,ax=axe[1],xlim=(-1.1,1.1),ylim=(-1.1,1.1))
-
-        return self
-
-    def _computed_svd(self,X):
-        """Compute a Singular Value Decomposition
-
-        This function computes :
-            n_components_ : number of components kept.
-            eig_ : eigenvalues.
-            eigen_vectors_ : eigenvectors.
-            row_coord_ : row coordinates.
-            col_coord_ : column coordinates.
-            _compute_stats :
-            row_labels_ : row labels.
-            col_labels_ : column labels.
-            row_infos : row information (distance, weight, inertia).
-            inertia_ : inertia.
-            data_ : X
-            normalized_data_ : Z
-            bartlett_sphericity_test_ : Bartlett sphericity test
-            kaiser_threshold_ : Kaiser threshold.
-            kaiser_proportion_threshold_ : Kaiser proportional threshold
-            kss_threshold_ : Karlis - Saporta - Spinaki threshold.
-            broken_stick_threshold_ : Broken stick threshold
-
-        Parameters
-        ----------
-        X : DataFrame of float, shape (n_rows,n_columns)
-            Training data, where n_rows is the number of rows and
-            n_columns is the number of columns.
-            X is a table of numeric values.
-
-        Returns
-        -------
-        None
-        """
-
-        self.n_rows_, self.n_cols_ = X.shape
-
-        # Set row labels
-        self.row_labels_ = self.row_labels
-        if ((self.row_labels_ is None) or (len(self.row_labels_) != self.n_rows_)):
-            self.row_labels_ = ["row_" + str(i+1) for i in np.arange(0,self.n_rows_)]
-
-        # Set col labels
-        self.col_labels_ = self.col_labels
-        if ((self.col_labels_ is None) or (len(self.col_labels_) != self.n_cols_)):
-            self.col_labels_ = ["col_" + str(k+1) for k in np.arange(0,self.n_cols_)]
-
-        # Initializations - scale data
-        self.means_ = np.mean(X, axis=0).reshape(1,-1)
-        if self.normalize:
-            self.std_ = np.std(X,axis=0,ddof=0).reshape(1,-1)
-            Z = (X - self.means_)/self.std_
-        else:
-            Z = X - self.means_
-
-        # Row information
-        row_disto = np.apply_along_axis(func1d=lambda x : np.sum(x**2),arr=Z,axis=1)
-        row_weight = np.ones(self.n_rows_)/self.n_rows_
-        row_inertia = row_disto*row_weight
-        row_infos = np.c_[np.sqrt(row_disto),row_weight,row_inertia]
-
-        # Total inertia
-        inertia = np.sum(row_inertia)
-
-        # Singular Value Decomposition
-        U, delta, V_T = np.linalg.svd(Z,full_matrices=False)
-
-        # Eigenvalues
-        eigen_values = delta**2/self.n_rows_
-        difference = np.insert(-np.diff(eigen_values),len(eigen_values)-1,np.nan)
-        proportion = 100*eigen_values/np.sum(eigen_values)
-        cumulative = np.cumsum(proportion)
-
-        # Set n_components_
-        self.n_components_ = self.n_components
-        if self.n_components_ is None:
-            self.n_components_ = len(eigen_values)
-        elif (self.n_components_ >= 0) and (self.n_components_ < 1):
-            i = 0
-            threshold = 100 * self.n_components_
-            while cumulative[i] < threshold:
-                i = i + 1
-            self.n_components_ = i
-        elif ((self.n_components_ >= 1)
-              and (self.n_components_ <= len(eigen_values))
-              and (isinstance(self.n_components_, int))):
-            self.n_components_ = int(np.trunc(self.n_components_))
-        elif ((self.n_components_ >= 1)
-              and (self.n_components_ <= len(eigen_values))
-              and (isinstance(self.n_components_, float))):
-            self.n_components_ = int(np.floor(self.n_components_))
-        else:
-            self.n_components_ = len(eigen_values)
-
-        # Row coordinates
-        row_coord = U * delta.reshape(1,-1)
-
-        # Column coordinates
-        col_coord = V_T.T.dot(np.diag(np.sqrt(eigen_values)))
-
-        # Fisher significance test of the correlations
-        col_ftest = np.apply_along_axis(func1d=lambda x : (1/2)*np.sqrt(self.n_rows_-3)*np.log((1+x)/(1-x)),axis=0,arr=col_coord)
-        self.ss_col_coord_ = (np.sum(col_coord ** 2, axis=1)).reshape(-1, 1)
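# Editor's illustrative sketch (not part of the deleted file): the eigenvalue
# table above follows from the SVD of the standardized data Z, with
# lambda_k = delta_k**2 / n. All names below are hypothetical toy data.
import numpy as np
rng = np.random.default_rng(0)
Z = rng.normal(size=(25, 4))
Z = (Z - Z.mean(axis=0)) / Z.std(axis=0, ddof=0)
U, delta, V_T = np.linalg.svd(Z, full_matrices=False)
eigen_values = delta**2 / Z.shape[0]
cumulative = np.cumsum(100 * eigen_values / eigen_values.sum())
# Smallest number of axes explaining at least 80% of the inertia, mirroring
# the 0 <= n_components < 1 branch above.
n_components = int(np.argmax(cumulative >= 80) + 1)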
-
-        # Correlation between variables and axes
-        col_cor = np.transpose(np.corrcoef(x=row_coord,y=Z,rowvar=False)[:self.n_cols_,self.n_cols_:])
-
-        # Store all information
-        self.eig_ = np.array([eigen_values[:self.n_components_],
-                              difference[:self.n_components_],
-                              proportion[:self.n_components_],
-                              cumulative[:self.n_components_]])
-
-        # Bartlett statistic
-        bartlett_stats = -(self.n_rows_-1-(2*self.n_cols_+5)/6)*np.sum(np.log(eigen_values))
-
-        # Broken stick threshold
-        broken_stick_threshold = np.flip(np.cumsum(1/np.arange(self.n_cols_,0,-1)))
-
-        # Karlis - Saporta - Spinaki threshold
-        kss = 1 + 2*np.sqrt((self.n_cols_-1)/(self.n_rows_-1))
-
-        # Store all information
-        self.eigen_vectors_ = V_T.T[:,:self.n_components_]
-        # Factor coordinates for rows
-        self.row_coord_ = row_coord[:,:self.n_components_]
-
-        # Factor coordinates for columns
-        self.col_coord_ = col_coord[:,:self.n_components_]
-        self.col_cor_ = col_cor[:,:self.n_components_]
-        self.col_ftest_ = col_ftest[:,:self.n_components_]
-
-        self.row_infos_ = row_infos
-        self.inertia_ = inertia
-        self.normalized_data_ = Z
-        self.dim_index_ = ["Dim."+str(x+1) for x in np.arange(0,self.n_components_)]
-
-        # Add eigenvalue threshold information
-        self.bartlett_sphericity_test_ = dict({
-            "statistic" : bartlett_stats,
-            "p-value" : 1-st.chi2.cdf(bartlett_stats,df=(self.n_cols_*(self.n_cols_-1)/2)),
-            "dof" : self.n_cols_*(self.n_cols_-1)/2
-        })
-        self.kaiser_threshold_ = np.mean(eigen_values)
-        self.kaiser_proportion_threshold_ = 100/inertia
-        self.kss_threshold_ = kss
-        self.broken_stick_threshold_ = broken_stick_threshold[:self.n_components_]
-
-        # Compute stats : contribution and cos2
-        self._compute_stats()
-
-        # Store model name
-        self.model_ = "pca"
-
-    def _compute_stats(self):
-        """Compute statistics :
-            row_contrib_ : row contributions.
-            col_contrib_ : column contributions.
-            row_cos2_ : row cosines.
-            col_cos2_ : column cosines.
-        """
-        # Row and col contributions
-        row_contrib = 100 * ((1/self.n_rows_)*(self.row_coord_**2)*(1/self.eig_[0].T))
-        col_contrib = 100 * (self.col_coord_ ** 2) * (1/self.eig_[0].T)
-
-        # Row and col cos2
-        row_cos2 = ((self.row_coord_ ** 2)/ (np.linalg.norm(self.normalized_data_, axis=1).reshape(-1, 1) ** 2))
-        col_cos2 = (self.col_coord_ ** 2) / self.ss_col_coord_
-        self.ss_col_coord_ = None
-
-        # Store row and col contrib and cos2
-        self.row_contrib_ = row_contrib[:, :self.n_components_]
-        self.col_contrib_ = col_contrib[:, :self.n_components_]
-        self.row_cos2_ = row_cos2[:, :self.n_components_]
-        self.col_cos2_ = col_cos2[:, :self.n_components_]
-
-    def _compute_quanti_sup_stats(self,X,y=None):
-        """Compute supplementary continuous variables statistics
-
-        Parameters
-        ----------
-        self : An instance of class PCA
-        X : DataFrame (n_rows,n_columns)
-        y : None
-            y is ignored
-
-        Return
-        ------
-        col_sup_corr_ : Pearson correlation between new and old continuous variables
-        col_sup_coord_ : Supplementary continuous coordinates
-        col_sup_cos2_ : Supplementary continuous cosines
-        col_sup_ftest_ : Supplementary continuous Fisher test
-        """
-
-        # Test if X is a DataFrame
-        if isinstance(X,pd.Series):
-            X = X.to_frame()
-        elif not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        # Correlation between new and old continuous variables
-        col_sup_corr = np.zeros((len(X.columns),len(self.col_labels_)))
-        for i, lab in enumerate(X.columns):
-            for j, name in enumerate(self.col_labels_):
-                col_sup_corr[i,j] = st.pearsonr(X[lab],self.active_data_[name]).statistic
-
-        # Supplementary quantitative coordinates
-        col_sup_coord = np.transpose(np.corrcoef(x=self.row_coord_,y=X.values,rowvar=False)[:self.n_components_,self.n_components_:])
-
-        # Fisher significance test
-        col_sup_ftest = np.apply_along_axis(func1d=lambda x : (1/2)*np.sqrt(self.n_rows_-3)*np.log((1+x)/(1-x)),axis=0,arr=col_sup_coord)
-
-        # Supplementary quantitative cos2
-        col_sup_cos2 = np.apply_along_axis(func1d=lambda x : x**2,arr = col_sup_coord,axis=0)
-
-        # Store supplementary quantitative information
-        self.col_sup_corr_ = col_sup_corr
-        self.col_sup_coord_ = col_sup_coord[:,:self.n_components_]
-        self.col_sup_cos2_ = col_sup_cos2[:,:self.n_components_]
-        self.col_sup_ftest_ = col_sup_ftest[:,:self.n_components_]
-
-        # Supplementary continuous labels
-        self.col_sup_labels_ = X.columns
-
-        return dict({"corr" : pd.DataFrame(self.col_sup_corr_,index=self.quanti_sup_labels_,columns=self.col_labels_),
-                     "coord" : pd.DataFrame(self.col_sup_coord_,index=self.col_sup_labels_,columns=self.dim_index_),
-                     "cos2" : pd.DataFrame(self.col_sup_cos2_,index=self.col_sup_labels_,columns=self.dim_index_),
-                     "ftest" : pd.DataFrame(self.col_sup_ftest_,index=self.col_sup_labels_,columns=self.dim_index_)
-                     })
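# Editor's illustrative sketch (not part of the deleted file): supplementary
# quantitative variables do not shape the axes; their coordinates are simply
# their correlations with the fitted row coordinates. Hypothetical toy data:
import numpy as np
import scipy.stats as st
rng = np.random.default_rng(1)
row_coord = rng.normal(size=(25, 2))    # stands in for self.row_coord_
x_sup = rng.normal(size=25)             # one supplementary variable
coord = np.array([st.pearsonr(x_sup, row_coord[:, k]).statistic
                  for k in range(row_coord.shape[1])])
cos2 = coord**2                         # quality of representation per axis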
-
-    def _compute_quali_sup_stats(self,X,y=None):
-        """Compute statistics for supplementary categorical variables
-
-        Parameters
-        ----------
-        self : An instance of class PCA
-        X : DataFrame (n_rows,n_columns)
-        y : None
-            y is ignored
-
-        Return
-        ------
-        mod_sup_coord_ : Supplementary categories coordinates
-        mod_sup_cos2_ : Supplementary categories cosines
-        mod_sup_disto_ : Supplementary categories distance
-        """
-        # Test if X is a DataFrame
-        if isinstance(X,pd.Series):
-            X = X.to_frame()
-        elif not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        # Dummy variables
-        dummies = pd.concat((pd.get_dummies(X[cols],prefix=cols,prefix_sep='_') for cols in X.columns),axis=1)
-        mod_sup_stats = dummies.agg(func=[np.sum,np.mean]).T
-        n_k = dummies.sum(axis=0)
-        p_k = dummies.mean(axis=0)
-        mod_sup_labels = dummies.columns
-        short_sup_labels = list([x.split("_",1)[-1] for x in mod_sup_labels])
-
-        # Supplementary categories coordinates (barycenters of their individuals)
-        mod_sup_coord = pd.concat((pd.concat((pd.DataFrame(self.row_coord_,index=self.row_labels_,
-                                                           columns=self.dim_index_),dummies[cols]),axis=1)
-                                   .groupby(cols).mean().iloc[1,:].to_frame(name=cols).T for cols in dummies.columns),axis=0)
-
-        # Supplementary categories v-test
-        mod_sup_vtest = mapply(mapply(mod_sup_coord,lambda x : x/np.sqrt((self.n_rows_-n_k)/((self.n_rows_-1)*n_k)),
-                                      axis=0,progressbar=False),
-                               lambda x : x/np.sqrt(self.eig_[0]),axis=1,progressbar=False)
-
-        # Conditional means of the standardized data Z
-        mz_g = pd.concat((pd.concat((pd.DataFrame(self.normalized_data_,index=self.row_labels_,
-                                                  columns=self.col_labels_),dummies[cols]),axis=1)
-                          .groupby(cols).mean().iloc[1,:].to_frame(name=cols).T for cols in dummies.columns),axis=0)
-
-        # Distance of the categories from the origin
-        mod_sup_disto = mapply(mz_g,lambda x : np.sum(x**2),axis=1,progressbar=False)
-
-        # Supplementary categories cos2
-        mod_sup_cos2 = mapply(mod_sup_coord,lambda x : x**2,axis=0,progressbar=False).div(mod_sup_disto,axis="index")
-
-        # Correlation ratio (eta2) of the supplementary categorical variables
-        quali_sup_eta2 = pd.concat((mapply(mod_sup_coord.loc[filter(lambda x: x.startswith(cols),mod_sup_coord.index),:],
-                                           lambda x : x**2,axis=1,progressbar=False)
-                                    .mul(p_k.loc[filter(lambda x: x.startswith(cols),mod_sup_coord.index)],axis="index")
-                                    .div(self.eig_[0],axis="columns")
-                                    .sum(axis=0).to_frame(name=cols).T for cols in X.columns),axis=0)
-
-        # Supplementary categories information
-        self.mod_sup_stats_ = np.array(mod_sup_stats)
-        self.mod_sup_disto_ = np.array(mod_sup_disto)
-        self.mod_sup_coord_ = np.array(mod_sup_coord)
-        self.mod_sup_cos2_ = np.array(mod_sup_cos2)
-        self.mod_sup_vtest_ = np.array(mod_sup_vtest)
-
-        self.mod_sup_labels_ = mod_sup_labels
-        self.short_sup_labels_ = short_sup_labels
-
-        # Supplementary qualitative variables
-        self.quali_sup_eta2_ = quali_sup_eta2
-
-        return dict({"stats" : pd.DataFrame(self.mod_sup_stats_,columns=["n(k)","p(k)"],index=self.mod_sup_labels_),
-                     "coord" : pd.DataFrame(self.mod_sup_coord_,index=self.mod_sup_labels_,columns=self.dim_index_),
-                     "cos2" : pd.DataFrame(self.mod_sup_cos2_,index=self.mod_sup_labels_,columns=self.dim_index_),
-                     "dist" : pd.DataFrame(self.mod_sup_disto_,index=self.mod_sup_labels_,columns=["dist"]),
-                     "eta2" : pd.DataFrame(self.quali_sup_eta2_,index=self.quali_sup_labels_,columns=self.dim_index_),
-                     "vtest" : pd.DataFrame(self.mod_sup_vtest_,index=self.mod_sup_labels_,columns=self.dim_index_)
-                     })
-
-    def _compute_row_sup_stats(self,X,y=None):
-        """Compute supplementary individuals coordinates
-
-        Parameters
-        ----------
-        X : DataFrame, shape (n_rows_sup, n_columns)
-            New data, where n_rows_sup is the number of supplementary
-            row points and n_columns is the number of columns.
-            X rows correspond to supplementary row points that are
-            projected on the axes.
-            X is a table containing numeric values.
-
-        y : None
-            y is ignored
-
-        Returns
-        -------
-        row_sup_coord_ : DataFrame of float, shape (n_rows_sup, n_components_)
-            Coordinates of the projections of the supplementary
-            row points on the axes.
-
-        row_sup_cos2_ : DataFrame of float, shape (n_rows_sup, n_components_)
-            Cosines of the projection of the supplementary row points.
-        """
-        if not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        if self.normalize:
-            Z = (X - self.means_)/self.std_
-        else:
-            Z = X - self.means_
-
-        row_sup_coord = np.array(Z.dot(self.eigen_vectors_))
-        row_sup_cos2 = ((row_sup_coord ** 2)/ (np.linalg.norm(Z, axis=1).reshape(-1, 1) ** 2))
-
-        # Store all information
-        self.row_sup_coord_ = row_sup_coord[:,:self.n_components_]
-        self.row_sup_cos2_ = row_sup_cos2[:,:self.n_components_]
-
-        return dict({"coord" : row_sup_coord[:,:self.n_components_],
-                     "cos2" : row_sup_cos2[:,:self.n_components_]})
-
-    def transform(self,X,y=None):
-        """Apply the dimensionality reduction on X
-
-        X is projected on the first axes previously extracted from a training set.
-
-        Parameters
-        ----------
-        X : DataFrame of float, shape (n_rows_sup, n_columns)
-            New data, where n_rows_sup is the number of supplementary
-            row points and n_columns is the number of columns.
-            X rows correspond to supplementary row points that are
-            projected on the axes.
-            X is a table containing numeric values.
-
-        y : None
-            y is ignored
-
-        Returns
-        -------
-        X_new : DataFrame of float, shape (n_rows_sup, n_components_)
-            Coordinates of the projections of the supplementary
-            row points on the axes.
-        """
-        if not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        if self.normalize:
-            Z = (X - self.means_)/self.std_
-        else:
-            Z = X - self.means_
-        return np.array(Z.dot(self.eigen_vectors_))[:,:self.n_components_]
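# Editor's minimal usage sketch (not part of the deleted file): projecting new
# rows with transform(); the DataFrame and column names are hypothetical.
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.default_rng(2).normal(size=(30, 4)),
                  columns=["a", "b", "c", "d"])
pca = PCA(normalize=True, n_components=2).fit(df)
coords = pca.transform(df.iloc[:5])   # internally (X - means_)/std_ @ eigen_vectors_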
-
-    def fit_transform(self,X,y=None):
-        """Fit the model with X and apply the dimensionality reduction on X.
-
-        Parameters
-        ----------
-        X : pd.DataFrame, shape (n_samples, n_features)
-            New data, where n_samples is the number of samples
-            and n_features is the number of features.
-
-        Returns
-        -------
-        X_new : array-like, shape (n_samples, n_components)
-        """
-        if not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        self.fit(X)
-
-        return self.row_coord_
-
-
-##########################################################################################
-#           PARTIAL PRINCIPAL COMPONENTS ANALYSIS (PPCA)
-##########################################################################################
-
-class PartialPCA(BaseEstimator,TransformerMixin):
-    """
-    Partial Principal Components Analysis
-    """
-    def __init__(self,
-                 n_components=None,
-                 normalize=True,
-                 row_labels=None,
-                 col_labels=None,
-                 partial_labels=None,
-                 graph = False,
-                 figsize=None):
-        self.n_components = n_components
-        self.normalize = normalize
-        self.row_labels = row_labels
-        self.col_labels = col_labels
-        self.partial_labels = partial_labels
-        self.graph = graph
-        self.figsize = figsize
-
-    def fit(self,X,y=None):
-        """Fit the model to X
-        """
-        if not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        self.n_rows_, self.n_cols_ = X.shape
-        self.data_ = X
-
-        self._compute_stats(X)
-        self._compute_svds(X)
-
-        if self.graph:
-            fig,(axe1,axe2) = plt.subplots(1,2,figsize=self.figsize)
-            plotPPCA(self,choice="ind",ax=axe1)
-            plotPPCA(self,choice="var",ax=axe2)
-
-        return self
-
-    def _compute_stats(self,X,y=None):
-        """Compute KMO indices and correlation matrices
-        """
-        if not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        global_kmo = global_kmo_index(X)
-        per_var_kmo = per_item_kmo_index(X)
-        corr = X.corr(method="pearson")
-        pcorr = X.pcorr()
-
-        self.global_kmo_index_ = global_kmo
-        self.partial_kmo_index_ = per_var_kmo
-        self.pearson_correlation_ = corr
-        self.partial_correlation_ = pcorr
-
-    def _compute_svds(self,X,y=None):
-        """Regress the active variables on the partial variables, then run a PCA on the residuals
-        """
-        if not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        self.partial_labels_ = self.partial_labels
-        X = X.drop(columns = self.partial_labels_)
-
-        # Extract coefficients and intercept
-        coef = pd.DataFrame(np.zeros((len(self.partial_labels_)+1,X.shape[1])),
-                            index = ["intercept"]+self.partial_labels_,columns=X.columns)
-        rsquared = pd.DataFrame(np.zeros((1,X.shape[1])),index = ["R squared"],columns=X.columns)
-        rmse = pd.DataFrame(np.zeros((1,X.shape[1])),index = ["RMSE"],columns=X.columns)
-        E = pd.DataFrame(np.zeros((self.n_rows_,X.shape[1])),index=X.index,columns=X.columns)  # Regression residuals
-
-        for lab in X.columns:
-            res = smf.ols(formula="{}~{}".format(lab,"+".join(self.partial_labels_)), data=self.data_).fit()
-            coef.loc[:,lab] = res.params.values
-            rsquared.loc[:,lab] = res.rsquared
-            rmse.loc[:,lab] = mean_squared_error(self.data_[lab],res.fittedvalues,squared=False)
-            E.loc[:,lab] = res.resid
-
-        # Standardized coefficients
-        normalized_data = mapply(self.data_,lambda x : (x - x.mean())/x.std(),axis=0,progressbar=False)
-        normalized_coef = pd.DataFrame(np.zeros((len(self.partial_labels_),X.shape[1])),
-                                       index = self.partial_labels_,columns=X.columns)
-        for lab in X.columns:
-            normalized_coef.loc[:,lab] = smf.ols(formula="{}~{}".format(lab,"+".join(self.partial_labels_)),data=normalized_data).fit().params[1:]
-
-        # Correlation matrix of the residuals
-        resid_corr = E.corr(method="pearson")
-
-        # Raw correlation matrix
-        R = X.corr(method="pearson")
-
-        # PCA on the residuals
-        self.row_labels_ = self.row_labels
-        my_pca = PCA(normalize=self.normalize,n_components=self.n_components,row_labels=self.row_labels_,col_labels=E.columns).fit(E)
-
-        self.resid_corr_ = resid_corr
-
-        self.n_components_ = my_pca.n_components_
-
-        self.eig_ = my_pca.eig_
-        self.eigen_vectors_ = my_pca.eigen_vectors_
-        self.inertia_ = my_pca.inertia_
-        self.dim_index_ = my_pca.dim_index_
-
-        self.row_coord_ = my_pca.row_coord_
-        self.row_contrib_ = my_pca.row_contrib_
-        self.row_cos2_ = my_pca.row_cos2_
-        self.row_infos_ = my_pca.row_infos_
-
-        self.col_coord_ = my_pca.col_coord_
-        self.col_cor_ = my_pca.col_cor_
-        self.col_ftest_ = my_pca.col_ftest_
-        self.col_cos2_ = my_pca.col_cos2_
-        self.col_contrib_ = my_pca.col_contrib_
-
-        self.bartlett_sphericity_test_ = my_pca.bartlett_sphericity_test_
-        self.kaiser_proportion_threshold_ = my_pca.kaiser_proportion_threshold_
-        self.kaiser_threshold_ = my_pca.kaiser_threshold_
-        self.broken_stick_threshold_ = my_pca.broken_stick_threshold_
-        self.kss_threshold_ = my_pca.kss_threshold_
-        self.col_labels_ = my_pca.col_labels_
-
-        self.rsquared_ = rsquared
-        self.rmse_ = rmse
-        self.coef_ = coef
-        self.normalized_coef_ = normalized_coef
-        self.normalized_data_ = normalized_data
-        self.resid_ = E
-        self.R_ = R
-
-        self.model_ = "ppca"
-
-    def fit_transform(self,X,y=None):
-        """Fit the model with X and apply the dimensionality reduction on X.
-
-        Parameters
-        ----------
-        X : pd.DataFrame, shape (n_samples, n_features)
-            New data, where n_samples is the number of samples
-            and n_features is the number of features.
-
-        Returns
-        -------
-        X_new : array-like, shape (n_samples, n_components)
-        """
-        self.fit(X)
-
-        return self.row_coord_
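# Editor's illustrative sketch (not part of the deleted file): the core of
# PartialPCA is an OLS regression of each active variable on the partial
# variables, followed by an ordinary PCA on the residuals. Toy data below;
# all names are hypothetical.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
rng = np.random.default_rng(3)
data = pd.DataFrame(rng.normal(size=(50, 3)), columns=["y1", "y2", "z"])
resid = pd.DataFrame(index=data.index)
for lab in ["y1", "y2"]:
    resid[lab] = smf.ols(f"{lab} ~ z", data=data).fit().resid  # remove the effect of z
pca_on_resid = PCA(normalize=True).fit(resid)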
-
-    def transform(self,X,y=None):
-        """Apply the Partial Principal Components Analysis reduction on X
-
-        X is projected on the first axes previously extracted from a training set.
-
-        Parameters
-        ----------
-        X : DataFrame of float, shape (n_rows_sup, n_columns)
-            New data, where n_rows_sup is the number of supplementary
-            row points and n_columns is the number of columns.
-            X rows correspond to supplementary row points that are
-            projected on the axes.
-            X is a table containing numeric values.
-
-        y : None
-            y is ignored
-
-        Returns
-        -------
-        X_new : DataFrame of float, shape (n_rows_sup, n_components_)
-            Coordinates of the projections of the supplementary
-            row points on the axes.
-        """
-        if not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        raise NotImplementedError("This method is not implemented yet.")
-
-#############################################################################################
-#           WEIGHTED PRINCIPAL COMPONENTS ANALYSIS (WPCA)
-#############################################################################################
-
-class WPCA(BaseEstimator, TransformerMixin):
-    """Weighted Principal Component Analysis
-
-    This is a direct implementation of weighted PCA based on the eigenvalue
-    decomposition of the weighted covariance matrix following
-    Delchambre (2014) [1]_.
-
-    Parameters
-    ----------
-    n_components : int (optional)
-        Number of components to keep. If not specified, all components are kept.
-    xi : float (optional)
-        Degree of weight enhancement.
-    regularization : float (optional)
-        Control the strength of ridge regularization used to compute the
-        transform.
-    copy_data : boolean, optional, default True
-        If True, X and weights will be copied; else, they may be overwritten.
-
-    Attributes
-    ----------
-    components_ : array, [n_components, n_features]
-        Principal axes in feature space, representing the directions of
-        maximum variance in the data.
-    explained_variance_ : array, [n_components]
-        The amount of variance explained by each of the selected components.
-    explained_variance_ratio_ : array, [n_components]
-        Percentage of variance explained by each of the selected components.
-    mean_ : array, [n_features]
-        Per-feature empirical mean, estimated from the training set.
-
-    See Also
-    --------
-    - PCA
-    - sklearn.decomposition.PCA
-
-    References
-    ----------
-    .. [1] Delchambre, L. MNRAS 2014 446 (2): 3545-3555 (2014)
-           http://arxiv.org/abs/1412.4533
-    """
-    def __init__(self, n_components=None, xi=0, regularization=None,
-                 copy_data=True):
-        self.n_components = n_components
-        self.xi = xi
-        self.regularization = regularization
-        self.copy_data = copy_data
-
-    def _center_and_weight(self, X, weights, fit_mean=False):
-        """Compute centered and weighted version of X.
-
-        If fit_mean is True, then also save the mean to self.mean_
-        """
-        X, weights = check_array_with_weights(X, weights, dtype=float,
-                                              copy=self.copy_data)
-
-        if fit_mean:
-            self.mean_ = weighted_mean(X, weights, axis=0)
-
-        # now let X <- (X - mean) * weights
-        X -= self.mean_
-
-        if weights is not None:
-            X *= weights
-        else:
-            weights = np.ones_like(X)
-
-        return X, weights
-
-    def fit(self, X, y=None, weights=None):
-        """Compute principal components for X
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Training data, where n_samples is the number of samples
-            and n_features is the number of features.
-        weights : array-like, shape (n_samples, n_features)
-            Non-negative weights encoding the reliability of each measurement.
-            Equivalent to the inverse of the Gaussian error bar.
-
-        Returns
-        -------
-        self : object
-            Returns the instance itself.
-        """
-        # let X <- (X - mean) * weights
-        X, weights = self._center_and_weight(X, weights, fit_mean=True)
-        self._fit_precentered(X, weights)
-        return self
-
-    def _fit_precentered(self, X, weights):
-        """fit pre-centered data"""
-        if self.n_components is None:
-            n_components = X.shape[1]
-        else:
-            n_components = self.n_components
-
-        # TODO: filter NaN warnings
-        covar = np.dot(X.T, X)
-        covar /= np.dot(weights.T, weights)
-        covar[np.isnan(covar)] = 0
-
-        # enhance weights if desired
-        if self.xi != 0:
-            Ws = weights.sum(0)
-            covar *= np.outer(Ws, Ws) ** self.xi
-
-        # compute only the n_components largest eigenvalues/vectors by index range
-        eigvals = (X.shape[1] - n_components, X.shape[1] - 1)
-        evals, evecs = linalg.eigh(covar, subset_by_index=eigvals)
-        self.components_ = evecs[:, ::-1].T
-        self.explained_variance_ = evals[::-1]
-        self.explained_variance_ratio_ = evals[::-1] / covar.trace()
-
-    def transform(self, X, weights=None):
-        """Apply dimensionality reduction on X.
-
-        X is projected on the first principal components previously extracted
-        from a training set.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            New data, where n_samples is the number of samples
-            and n_features is the number of features.
-        weights : array-like, shape (n_samples, n_features)
-            Non-negative weights encoding the reliability of each measurement.
-            Equivalent to the inverse of the Gaussian error bar.
-
-        Returns
-        -------
-        X_new : array-like, shape (n_samples, n_components)
-        """
-        X, weights = self._center_and_weight(X, weights, fit_mean=False)
-        return self._transform_precentered(X, weights)
-
-    def _transform_precentered(self, X, weights):
-        """transform pre-centered data"""
-        # TODO: parallelize this?
-        Y = np.zeros((X.shape[0], self.components_.shape[0]))
-        for i in range(X.shape[0]):
-            cW = self.components_ * weights[i]
-            cWX = np.dot(cW, X[i])
-            cWc = np.dot(cW, cW.T)
-            if self.regularization is not None:
-                cWc += np.diag(self.regularization / self.explained_variance_)
-            Y[i] = np.linalg.solve(cWc, cWX)
-        return Y
-
-    def fit_transform(self, X, y=None, weights=None):
-        """Fit the model with X and apply the dimensionality reduction on X.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            New data, where n_samples is the number of samples
-            and n_features is the number of features.
-        weights : array-like, shape (n_samples, n_features)
-            Non-negative weights encoding the reliability of each measurement.
-            Equivalent to the inverse of the Gaussian error bar.
-
-        Returns
-        -------
-        X_new : array-like, shape (n_samples, n_components)
-        """
-        X, weights = self._center_and_weight(X, weights, fit_mean=True)
-        self._fit_precentered(X, weights)
-        return self._transform_precentered(X, weights)
-
-    def inverse_transform(self, X):
-        """Transform data back to its original space.
-
-        Returns an array X_original whose transform would be X.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_components)
-            Data in transformed representation.
-
-        Returns
-        -------
-        X_original : array-like, shape (n_samples, n_features)
-        """
-        X = check_array(X)
-        return self.mean_ + np.dot(X, self.components_)
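# Editor's illustrative sketch (not part of the deleted file): the weighted
# covariance that WPCA diagonalizes, C = (Xc*W)^T (Xc*W) / (W^T W), with the
# products taken elementwise in the weights. Toy data, hypothetical names;
# an unweighted mean is used here for brevity.
import numpy as np
from scipy import linalg
rng = np.random.default_rng(4)
X = rng.normal(size=(40, 3))
W = rng.uniform(0.5, 1.0, size=X.shape)      # per-measurement weights
Xc = (X - X.mean(axis=0)) * W
covar = np.dot(Xc.T, Xc) / np.dot(W.T, W)
evals, evecs = linalg.eigh(covar)
components = evecs[:, ::-1].T                # descending eigenvalue order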
-
-    def reconstruct(self, X, weights=None):
-        """Reconstruct the data using the PCA model
-
-        This is equivalent to calling transform followed by inverse_transform.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_components)
-            Data in transformed representation.
-        weights : array-like, shape (n_samples, n_features)
-            Non-negative weights encoding the reliability of each measurement.
-            Equivalent to the inverse of the Gaussian error bar.
-
-        Returns
-        -------
-        X_reconstructed : ndarray, shape (n_samples, n_features)
-            Reconstructed version of X
-        """
-        return self.inverse_transform(self.transform(X, weights=weights))
-
-    def fit_reconstruct(self, X, weights=None):
-        """Fit the model and reconstruct the data using the PCA model
-
-        This is equivalent to calling fit_transform()
-        followed by inverse_transform().
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Data to fit and reconstruct.
-        weights : array-like, shape (n_samples, n_features)
-            Non-negative weights encoding the reliability of each measurement.
-            Equivalent to the inverse of the Gaussian error bar.
-
-        Returns
-        -------
-        X_reconstructed : ndarray, shape (n_samples, n_features)
-            Reconstructed version of X
-        """
-        return self.inverse_transform(self.fit_transform(X, weights=weights))
-
-#####################################################################################
-#           EXPECTATION - MAXIMIZATION PRINCIPAL COMPONENTS ANALYSIS (EMPCA)
-#####################################################################################
-
-class EMPCA(BaseEstimator, TransformerMixin):
-    """Expectation-Maximization PCA
-
-    This is an iterative implementation of weighted PCA based on an
-    Expectation-Maximization approach, following Bailey (2012) [1]_.
-
-    Parameters
-    ----------
-    n_components : int (optional)
-        Number of components to keep. If not specified, all components are kept.
-    max_iter : int (default=100)
-        Maximum number of Expectation-Maximization iterations.
-    random_state : int or None
-        Seed for the random initialization of eigenvectors.
-
-    Attributes
-    ----------
-    components_ : array, [n_components, n_features]
-        Principal axes in feature space, representing the directions of
-        maximum variance in the data.
-    explained_variance_ : array, [n_components]
-        The amount of variance explained by each of the selected components.
-    explained_variance_ratio_ : array, [n_components]
-        Percentage of variance explained by each of the selected components.
-    mean_ : array, [n_features]
-        Per-feature empirical mean, estimated from the training set.
-
-    See Also
-    --------
-    - PCA
-    - WPCA
-    - sklearn.decomposition.PCA
-
-    References
-    ----------
-    .. [1] Bailey, S. PASP 124:919 (2012)
-           http://arxiv.org/abs/1208.4122
-    """
-    def __init__(self, n_components=None, max_iter=100, random_state=None):
-        self.n_components = n_components
-        self.max_iter = max_iter
-        self.random_state = random_state
-
-    def _Estep(self, data, weights, eigvec):
-        """E-step: solve for coeff given eigvec"""
-        if weights is None:
-            return np.dot(data, eigvec.T)
-        else:
-            return np.array([solve_weighted(eigvec.T, data[i], weights[i])
-                             for i in range(data.shape[0])])
-
-    def _Mstep(self, data, weights, eigvec, coeff):
-        """M-step: solve for eigvec given coeff"""
-        w2 = 1 if weights is None else weights ** 2
-
-        for i in range(eigvec.shape[0]):
-            # remove contribution of previous eigenvectors from data
-            d = data - np.dot(coeff[:, :i], eigvec[:i])
-            c = coeff[:, i:i + 1]
-            eigvec[i] = np.dot(c.T, w2 * d) / np.dot(c.T, w2 * c)
-            # orthonormalize computed vectors: in theory not necessary,
-            # but numerically it's a good idea
-            # TODO: perhaps do this more efficiently?
-            eigvec[:i + 1] = orthonormalize(eigvec[:i + 1])
-        return eigvec
-
-    def fit_transform(self, X, y=None, weights=None):
-        """Fit the model with X and apply the dimensionality reduction on X.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            New data, where n_samples is the number of samples
-            and n_features is the number of features.
-        weights : array-like, shape (n_samples, n_features)
-            Non-negative weights encoding the reliability of each measurement.
-            Equivalent to the inverse of the Gaussian error bar.
-
-        Returns
-        -------
-        X_new : array-like, shape (n_samples, n_components)
-        """
-        X, weights = check_array_with_weights(X, weights)
-
-        if self.n_components is None:
-            n_components = X.shape[1]
-        else:
-            n_components = self.n_components
-
-        self.mean_ = weighted_mean(X, weights, axis=0)
-        X_c = X - self.mean_
-
-        eigvec = random_orthonormal(n_components, X.shape[1],
-                                    random_state=self.random_state)
-
-        # TODO: add a convergence check
-        for k in range(self.max_iter):
-            coeff = self._Estep(X_c, weights, eigvec)
-            eigvec = self._Mstep(X_c, weights, eigvec, coeff)
-        coeff = self._Estep(X_c, weights, eigvec)
-
-        self.components_ = eigvec
-        self.explained_variance_ = (coeff ** 2).sum(0) / X.shape[0]
-
-        if weights is None:
-            total_var = X_c.var(0).sum()
-        else:
-            XW = X_c * weights
-            total_var = np.sum((XW ** 2).sum(0) / (weights ** 2).sum(0))
-        self.explained_variance_ratio_ = (self.explained_variance_ / total_var)
-        return coeff
-
-    def fit(self, X, y=None, weights=None):
-        """Compute principal components for X
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Training data, where n_samples is the number of samples
-            and n_features is the number of features.
-        weights : array-like, shape (n_samples, n_features)
-            Non-negative weights encoding the reliability of each measurement.
-            Equivalent to the inverse of the Gaussian error bar.
-
-        Returns
-        -------
-        self : object
-            Returns the instance itself.
-        """
-        self.fit_transform(X, weights=weights)
-        return self
-
-    def transform(self, X, weights=None):
-        """Apply dimensionality reduction on X.
-
-        X is projected on the first principal components previously extracted
-        from a training set.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            New data, where n_samples is the number of samples
-            and n_features is the number of features.
-        weights : array-like, shape (n_samples, n_features)
-            Non-negative weights encoding the reliability of each measurement.
-            Equivalent to the inverse of the Gaussian error bar.
-
-        Returns
-        -------
-        X_new : array-like, shape (n_samples, n_components)
-        """
-        X, weights = check_array_with_weights(X, weights)
-
-        X_c = X - self.mean_
-        if weights is not None:
-            assert X.shape == weights.shape
-            X_c[weights == 0] = 0
-        return self._Estep(X_c, weights, self.components_)
-
-    def inverse_transform(self, X):
-        """Transform data back to its original space.
-
-        Returns an array X_original whose transform would be X.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_components)
-            Data in transformed representation.
-
-        Returns
-        -------
-        X_original : array-like, shape (n_samples, n_features)
-        """
-        X = check_array(X)
-        return self.mean_ + np.dot(X, self.components_)
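# Editor's illustrative sketch (not part of the deleted file): one unweighted
# EM iteration for a single component, mirroring _Estep/_Mstep above.
import numpy as np
rng = np.random.default_rng(5)
Xc = rng.normal(size=(40, 3))
Xc -= Xc.mean(axis=0)
v = rng.normal(size=3)
v /= np.linalg.norm(v)                  # random initial eigenvector
for _ in range(100):
    c = Xc @ v                          # E-step: coefficients given eigenvector
    v = (c @ Xc) / (c @ c)              # M-step: eigenvector given coefficients
    v /= np.linalg.norm(v)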
-
-    def reconstruct(self, X, weights=None):
-        """Reconstruct the data using the PCA model
-
-        This is equivalent to calling transform followed by inverse_transform.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_components)
-            Data in transformed representation.
-        weights : array-like, shape (n_samples, n_features)
-            Non-negative weights encoding the reliability of each measurement.
-            Equivalent to the inverse of the Gaussian error bar.
-
-        Returns
-        -------
-        X_reconstructed : ndarray, shape (n_samples, n_features)
-            Reconstructed version of X
-        """
-        return self.inverse_transform(self.transform(X, weights=weights))
-
-    def fit_reconstruct(self, X, weights=None):
-        """Fit the model and reconstruct the data using the PCA model
-
-        This is equivalent to calling fit_transform()
-        followed by inverse_transform().
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Data to fit and reconstruct.
-        weights : array-like, shape (n_samples, n_features)
-            Non-negative weights encoding the reliability of each measurement.
-            Equivalent to the inverse of the Gaussian error bar.
-
-        Returns
-        -------
-        X_reconstructed : ndarray, shape (n_samples, n_features)
-            Reconstructed version of X
-        """
-        return self.inverse_transform(self.fit_transform(X, weights=weights))
-
-##############################################################################################
-#           EXPLORATORY FACTOR ANALYSIS (EFA)
-##############################################################################################
-
-class EFA(BaseEstimator,TransformerMixin):
-    """Exploratory Factor Analysis
-
-    This class inherits from the sklearn BaseEstimator and TransformerMixin classes.
-
-    EFA performs an Exploratory Factor Analysis, given a table of
-    numeric variables; shape = n_rows x n_columns.
-
-    Parameters
-    ----------
-    normalize : bool
-        - If True : the data are scaled to unit variance.
-        - If False : the data are not scaled to unit variance.
-
-    n_components : int or None
-        Number of components to keep.
-
-    row_labels : list of strings or None
-        The list provides the row labels.
-
-    col_labels : list of strings or None
-        The list provides the column labels.
-
-    method : {"principal","harris"}
-        - If method = "principal" : performs Exploratory Factor Analysis using the principal approach.
-        - If method = "harris" : performs Exploratory Factor Analysis using the Harris approach.
-
-    row_sup_labels : list of strings or None
-        The list provides the supplementary row labels.
-
-    quanti_sup_labels : list of strings or None
-        The list provides the supplementary continuous columns.
-
-    quali_sup_labels : list of strings or None
-        The list provides the supplementary categorical variables.
-
-    graph : bool or None
-        - If True : a graph is displayed.
-
-    figsize : tuple of int or None
-    """
-    def __init__(self,
-                 normalize=True,
-                 n_components = None,
-                 row_labels = None,
-                 col_labels = None,
-                 method = "principal",
-                 row_sup_labels = None,
-                 quanti_sup_labels = None,
-                 quali_sup_labels = None,
-                 graph = None,
-                 figsize = None):
-        self.normalize = normalize
-        self.n_components = n_components
-        self.row_labels = row_labels
-        self.col_labels = col_labels
-        self.method = method
-        self.row_sup_labels = row_sup_labels
-        self.quanti_sup_labels = quanti_sup_labels
-        self.quali_sup_labels = quali_sup_labels
-        self.graph = graph
-        self.figsize = figsize
-
-    def fit(self,X,y=None):
-        """Fit the model to X
-
-        Parameters
-        ----------
-        X : DataFrame of float, shape (n_rows, n_columns)
-
-        y : None
-            y is ignored
-
-        Returns
-        -------
-        self : object
-            Returns the instance itself
-        """
-        if not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        # Extract supplementary rows
-        self.row_sup_labels_ = self.row_sup_labels
-        if self.row_sup_labels_ is not None:
-            _X = X.drop(index = self.row_sup_labels_)
-            row_sup = X.loc[self.row_sup_labels_,:]
-        else:
-            _X = X
-
-        # Extract supplementary numeric or categorical columns
-        self.quanti_sup_labels_ = self.quanti_sup_labels
-        self.quali_sup_labels_ = self.quali_sup_labels
-        if ((self.quali_sup_labels_ is not None) and (self.quanti_sup_labels_ is not None)):
-            X_ = _X.drop(columns = self.quali_sup_labels_).drop(columns = self.quanti_sup_labels_)
-            if self.row_sup_labels_ is not None:
-                row_sup = row_sup.drop(columns = self.quali_sup_labels_).drop(columns = self.quanti_sup_labels_)
-        elif self.quali_sup_labels_ is not None:
-            X_ = _X.drop(columns = self.quali_sup_labels_)
-            if self.row_sup_labels_ is not None:
-                row_sup = row_sup.drop(columns = self.quali_sup_labels_)
-        elif self.quanti_sup_labels_ is not None:
-            X_ = _X.drop(columns = self.quanti_sup_labels_)
-            if self.row_sup_labels_ is not None:
-                row_sup = row_sup.drop(columns = self.quanti_sup_labels_)
-        else:
-            X_ = _X
-
-        self.data_ = X
-
-        # Dimension
-        self.n_rows_, self.n_cols_ = X_.shape
-
-        # Set row labels
-        self.row_labels_ = self.row_labels
-        if ((self.row_labels_ is None) or (len(self.row_labels_) != self.n_rows_)):
-            self.row_labels_ = ["row_" + str(i+1) for i in np.arange(0,self.n_rows_)]
-
-        # Set col labels
-        self.col_labels_ = self.col_labels
-        if ((self.col_labels_ is None) or (len(self.col_labels_) != self.n_cols_)):
-            self.col_labels_ = ["col_" + str(k+1) for k in np.arange(0,self.n_cols_)]
-
-        # Initialization
-        self.uniqueness_ = None
-        self.row_sup_coord_ = None
-        self.col_sup_coord_ = None
-
-        self.estimated_communality_ = None
-        self.col_coord_ = None
-        self.col_contrib_ = None
-        self.explained_variance_ = None
-        self.percentage_variance_ = None
-        self.factor_score_ = None
-        self.factor_fidelity_ = None
-        self.row_coord_ = None
-
-        # Correlation matrix
-        self.correlation_matrix_ = X_.corr(method="pearson")
-
-        # R squared (squared multiple correlations)
-        self.initial_communality_ = np.array([1 - (1/x) for x in np.diag(np.linalg.inv(self.correlation_matrix_))])
-        # Total inertia
-        self.inertia_ = np.sum(self.initial_communality_)
-
-        # Scale data
-        self.means_ = np.mean(X_.values, axis=0).reshape(1,-1)
-        if self.normalize:
-            self.std_ = np.std(X_.values,axis=0,ddof=0).reshape(1,-1)
-            Z = (X_ - self.means_)/self.std_
-        else:
-            Z = X_ - self.means_
-
-        self.normalized_data_ = Z
-
-        if self.method == "principal":
-            self._compute_principal(X_)
-        elif self.method == "harris":
-            self._compute_harris(X_)
-
-        # Compute supplementary rows statistics
-        if self.row_sup_labels_ is not None:
-            self._compute_row_sup_stats(X=row_sup)
-
-        self.model_ = "efa"
-
-        return self
-
-    def _compute_eig(self,X):
-        """Compute the eigen decomposition
-        """
-        # Eigen decomposition
-        eigenvalue, eigenvector = np.linalg.eigh(X)
-
-        # Sort eigenvalues in descending order
-        eigen_values = np.flip(eigenvalue)
-        difference = np.insert(-np.diff(eigen_values),len(eigen_values)-1,np.nan)
-        proportion = 100*eigen_values/np.sum(eigen_values)
-        cumulative = np.cumsum(proportion)
-
-        # Set n_components_
-        self.n_components_ = self.n_components
-        if self.n_components_ is None:
-            self.n_components_ = (eigenvalue > 0).sum()
-
-        self.eig_ = np.array([eigen_values[:self.n_components_],
-                              difference[:self.n_components_],
-                              proportion[:self.n_components_],
-                              cumulative[:self.n_components_]])
-
-        self.eigen_vectors_ = eigenvector
-        return eigenvalue, eigenvector
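# Editor's illustrative note (not part of the deleted file): np.linalg.eigh
# returns eigenvalues in ascending order, hence the np.flip / np.fliplr calls
# used throughout this class.
import numpy as np
A = np.array([[2.0, 0.5],
              [0.5, 1.0]])
vals, vecs = np.linalg.eigh(A)                          # ascending order
vals_desc, vecs_desc = np.flip(vals), np.fliplr(vecs)   # descending, columns matched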
-
-    def _compute_principal(self,X):
-        """Compute EFA using the principal approach
-        """
-        # Compute Pearson correlation matrix
-        corr_prim = X.corr(method="pearson")
-
-        # Fill diagonal with initial communality
-        np.fill_diagonal(corr_prim.values,self.initial_communality_)
-
-        # Eigen decomposition
-        eigen_value,eigen_vector = self._compute_eig(corr_prim)
-        eigen_value = np.flip(eigen_value)
-        eigen_vector = np.fliplr(eigen_vector)
-
-        # Compute column coordinates
-        col_coord = eigen_vector*np.sqrt(eigen_value)
-        self.col_coord_ = col_coord[:,:self.n_components_]
-
-        # Restored (explained) variance
-        explained_variance = np.sum(np.square(self.col_coord_),axis=0)
-
-        # Estimated communality
-        estimated_communality = np.sum(np.square(self.col_coord_),axis=1)
-
-        # Percentage explained per variable
-        percentage_variance = estimated_communality/self.initial_communality_
-
-        # Factor scores
-        factor_score = np.dot(np.linalg.inv(X.corr(method="pearson")),self.col_coord_)
-
-        # Variable contributions
-        col_contrib = np.square(factor_score)/np.sum(np.square(factor_score),axis=0)
-
-        # Factor fidelity
-        factor_fidelity = np.sum(factor_score*self.col_coord_,axis=0)
-
-        # Row coordinates
-        row_coord = np.dot(self.normalized_data_,factor_score)
-
-        # Broken stick threshold
-        broken_stick_threshold = np.flip(np.cumsum(1/np.arange(self.n_cols_,0,-1)))
-
-        # Karlis - Saporta - Spinaki threshold
-        kss = 1 + 2*np.sqrt((self.n_cols_-1)/(self.n_rows_-1))
-
-        # Store all results
-        self.estimated_communality_ = estimated_communality
-
-        self.col_contrib_ = col_contrib[:,:self.n_components_]
-        self.explained_variance_ = explained_variance
-        self.percentage_variance_ = percentage_variance
-        self.factor_score_ = factor_score
-        self.factor_fidelity_ = factor_fidelity
-        self.row_coord_ = row_coord[:,:self.n_components_]
-        self.dim_index_ = ["Dim."+str(x+1) for x in np.arange(0,self.n_components_)]
-
-        # Add eigenvalue threshold information
-        self.kaiser_threshold_ = 1.0
-        self.kaiser_proportion_threshold_ = 100/self.inertia_
-        self.kss_threshold_ = kss
-        self.broken_stick_threshold_ = broken_stick_threshold[:self.n_components_]
-
-    def _compute_harris(self,X):
-        """Compute EFA using the Harris approach
-        """
-        self.uniqueness_ = 1 - self.initial_communality_
-
-        # Save
-        corr_prim = X.corr(method="pearson")
-        np.fill_diagonal(corr_prim.values,self.initial_communality_)
-
-        # New correlation matrix
-        corr_snd = np.zeros((self.n_cols_,self.n_cols_))
-        for k in np.arange(0,self.n_cols_,1):
-            for l in np.arange(0,self.n_cols_,1):
-                corr_snd[k,l] = corr_prim.iloc[k,l]/np.sqrt(self.uniqueness_[k]*self.uniqueness_[l])
-
-        eigen_value,eigen_vector = self._compute_eig(corr_snd)
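# Editor's illustrative sketch (not part of the deleted file): the "principal"
# approach diagonalizes the reduced correlation matrix, i.e. the correlation
# matrix with prior communalities (squared multiple correlations) on the
# diagonal. Toy correlation matrix below.
import numpy as np
R = np.array([[1.0, 0.6, 0.3],
              [0.6, 1.0, 0.4],
              [0.3, 0.4, 1.0]])
h2 = 1 - 1/np.diag(np.linalg.inv(R))    # initial communalities
R_reduced = R.copy()
np.fill_diagonal(R_reduced, h2)
vals, vecs = np.linalg.eigh(R_reduced)
first_loading = np.sqrt(vals[-1]) * vecs[:, -1]   # loadings on the first factor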
-
-    def _compute_row_sup_stats(self,X,y=None):
-        """Compute statistics for supplementary rows
-        """
-        if not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        if self.method == "principal":
-            if self.normalize:
-                Z = (X - self.means_)/self.std_
-            else:
-                Z = X - self.means_
-
-            self.row_sup_coord_ = np.dot(Z,self.factor_score_)[:,:self.n_components_]
-        else:
-            raise NotImplementedError("This method is not implemented yet.")
-
-    def _compute_quanti_sup_stats(self,X,y=None):
-        """Compute quantitative supplementary variables
-        """
-        raise NotImplementedError("This method is not implemented yet.")
-
-    def _compute_quali_sup_stats(self,X,y=None):
-        """Compute qualitative supplementary variables
-        """
-        raise NotImplementedError("This method is not implemented yet.")
-
-    def transform(self,X,y=None):
-        """Apply the dimensionality reduction on X
-
-        X is projected on the first axes previously extracted from a training set.
-
-        Parameters
-        ----------
-        X : DataFrame of float, shape (n_rows_sup, n_columns)
-            New data, where n_rows_sup is the number of supplementary
-            row points and n_columns is the number of columns.
-            X rows correspond to supplementary row points that are
-            projected on the axes.
-            X is a table containing numeric values.
-
-        y : None
-            y is ignored
-
-        Returns
-        -------
-        X_new : DataFrame of float, shape (n_rows_sup, n_components_)
-            Coordinates of the projections of the supplementary
-            row points on the axes.
-        """
-        if not isinstance(X,pd.DataFrame):
-            raise TypeError(
-            f"{type(X)} is not supported. Please convert to a DataFrame with "
-            "pd.DataFrame. For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        if self.method == "principal":
-            if self.normalize:
-                Z = (X - self.means_)/self.std_
-            else:
-                Z = X - self.means_
-            return np.dot(Z,self.factor_score_)[:,:self.n_components_]
-        else:
-            raise NotImplementedError("This method is not implemented yet.")
-
-    def fit_transform(self,X,y=None):
-        """Fit the model with X and apply the dimensionality reduction on X.
-
-        Parameters
-        ----------
-        X : DataFrame, shape (n_samples, n_features)
-            New data, where n_samples is the number of samples
-            and n_features is the number of features.
-
-        Returns
-        -------
-        X_new : array-like, shape (n_samples, n_components)
-        """
-        self.fit(X)
-        return self.row_coord_
-
-################################################################################################
-#           CORRESPONDENCE ANALYSIS (CA)
-################################################################################################
-
-def which(x):
-    """Return the indices of the truthy elements of an iterable."""
-    try:
-        x = list(iter(x))
-    except TypeError as e:
-        raise TypeError("'which' can only be applied to iterables. {}".format(str(e)))
-    return [i for i, v in enumerate(x) if bool(v)]
-
-class CA(BaseEstimator,TransformerMixin):
-    """Correspondence Analysis (CA)
-
-    This class inherits from the sklearn BaseEstimator and TransformerMixin classes.
-
-    CA performs a Correspondence Analysis, given a contingency table
-    containing absolute frequencies; shape = n_rows x n_columns.
-    This implementation only works for dense dataframes.
-
-    Parameters
-    ----------
-    n_components : int, float or None
-        Number of components to keep.
-        - If n_components is None, keep all the components.
-        - If 0 <= n_components < 1, select the number of components such
-          that the amount of variance that needs to be explained is
-          greater than the percentage specified by n_components.
- - If 1 <= n_components : - - If n_components is int, select a number of components - equal to n_components. - - If n_components is float, select the higher number of - components lower than n_components. - - row_labels : list of strings or None - - If row_labels is a list of strings : this array provides the - row labels. - If the shape of the array doesn't match with the number of - rows : labels are automatically computed for each row. - - If row_labels is None : labels are automatically computed for - each row. - - col_labels : list of strings or None - - If col_labels is a list of strings : this array provides the - column labels. - If the shape of the array doesn't match with the number of - columns : labels are automatically computed for each - column. - - If col_labels is None : labels are automatically computed for - each column. - - row_sup_labels : list of strings or None - - If row_sup_labels is a list of strings : this array provides the - supplementary row labels. - - col_sup_labels : list of strings or None - - If col_sup_labels is a list of strings : this array provides the - supplementary columns labels. - - Attributes - ---------- - n_components_ : int - The estimated number of components. - - row_labels_ : array of strings - Labels for the rows. - - col_labels_ : array of strings - Labels for the columns. - - eig_ : array of float - A 4 x n_components_ matrix containing all the eigenvalues - (1st row), difference (2nd row), the percentage of variance (3th row) and the - cumulative percentage of variance (4th row). - - row_coord_ : array of float - A n_rows x n_components_ matrix containing the row coordinates. - - col_coord_ : array of float - A n_columns x n_components_ matrix containing the column - coordinates. - - row_contrib_ : array of float - A n_rows x n_components_ matrix containing the row - contributions. - - col_contrib_ : array of float - A n_columns x n_components_ matrix containing the column - contributions. - - row_cos2_ : array of float - A n_rows x n_components_ matrix containing the row cosines. - - col_cos2_ : array of float - A n_columns x n_components_ matrix containing the column - cosines. - total_ : float - The sum of the absolute frequencies in the X array. - - model_ : string - The model fitted = 'ca' - """ - - def __init__(self, - n_components=None, - row_labels=None, - col_labels=None, - row_sup_labels=None, - col_sup_labels=None, - graph=True, - figsize=None): - self.n_components = n_components - self.row_labels = row_labels - self.col_labels = col_labels - self.row_sup_labels = row_sup_labels - self.col_sup_labels = col_sup_labels - self.graph = graph - self.figsize = figsize - - def fit(self,X,y=None): - """ Fit the model to X - Parameters - ---------- - X : array of float, shape (n_rows, n_columns) - Training data, where n_rows in the number of rows and - n_columns is the number of columns. - X is a contingency table containing absolute frequencies. - - y : None - y is ignored. - Returns - ------- - self : object - Returns the instance itself. - """ - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. 
For more information see: "
-            "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
-        # Extract supplementary rows
-        self.row_sup_labels_ = self.row_sup_labels
-        if self.row_sup_labels_ is not None:
-            _X = X.drop(index = self.row_sup_labels_)
-            row_sup = X.loc[self.row_sup_labels_,:]
-        else:
-            _X = X
-
-        # Extract supplementary columns
-        self.col_sup_labels_ = self.col_sup_labels
-        if self.col_sup_labels is not None:
-            X_= _X.drop(columns = self.col_sup_labels_)
-            col_sup = _X[self.col_sup_labels_]
-            if self.row_sup_labels_ is not None:
-                row_sup = row_sup.drop(columns = self.col_sup_labels_)
-        else:
-            X_ = _X
-
-        self.data_ = X
-
-        # Supplementary initialization
-        self.row_sup_coord_ = None
-        self.row_sup_cos2_ = None
-
-        self.col_sup_coord_ = None
-        self.col_sup_cos2_ = None
-
-        self.n_rows_, self.n_cols_ = X_.shape
-        self.total_ = X_.sum().sum()
-
-        # Compute the Singular Value Decomposition
-        self._compute_svd(X=X_)
-
-        # Compute dependence indicators
-        self._compute_indicators(X_)
-
-        if self.row_sup_labels is not None:
-            self._compute_sup(X=row_sup,row=True)
-
-        if self.col_sup_labels is not None:
-            self._compute_sup(X=col_sup,row=False)
-
-        if self.graph:
-            fig, (axe1,axe2) = plt.subplots(1,2,figsize=self.figsize)
-            plotCA(self,choice = "row",ax=axe1,repel=True)
-            plotCA(self,choice = "col",ax=axe2,repel=True)
-
-        return self
-
-    def _compute_stats(self,rowprob,colprob,rowdisto,coldisto):
-
-        row_contrib = np.apply_along_axis(func1d=lambda x : x/self.eig_[0], axis=1,
-                                          arr=np.apply_along_axis(func1d=lambda x: 100*x**2*rowprob,axis=0,arr=self.row_coord_))
-        col_contrib = np.apply_along_axis(func1d=lambda x : x/self.eig_[0], axis=1,
-                                          arr=np.apply_along_axis(func1d=lambda x: 100*x**2*colprob,axis=0,arr=self.col_coord_))
-
-        # Squared cosines
-        row_cos2 = np.apply_along_axis(func1d=lambda x: x**2/rowdisto, axis = 0, arr=self.row_coord_)
-        col_cos2 = np.apply_along_axis(func1d=lambda x: x**2/coldisto, axis = 0, arr=self.col_coord_)
-
-        self.row_contrib_ = row_contrib[:,:self.n_components_]
-        self.col_contrib_ = col_contrib[:,:self.n_components_]
-        self.row_cos2_ = row_cos2[:,:self.n_components_]
-        self.col_cos2_ = col_cos2[:,:self.n_components_]
-
-    def _compute_indicators(self,X):
-        """
-        """
-        # Joint probabilities
-        prob_conj = mapply(X,lambda x : x/self.total_,axis=0,progressbar=False)
-
-        # Row margins (marginal probabilities)
-        row_prob = prob_conj.sum(axis = 1)
-
-        # Column margins (marginal probabilities)
-        col_prob = prob_conj.sum(axis = 0)
-
-        # Row totals
-        row_sum = X.sum(axis=1)
-
-        # Column totals
-        col_sum = X.sum(axis=0)
-
-        # Chi-squared test of independence
-        statistic,pvalue,dof, _ = st.chi2_contingency(X, lambda_=None)
-
-        # Log-likelihood ratio test (G-test)
-        g_test_res = st.chi2_contingency(X, lambda_="log-likelihood")
-
-        # Residuals
-        resid = X - self.expected_freq_
-
-        standardized_resid = pd.DataFrame(self.standardized_resid_,index=self.row_labels_,columns=self.col_labels_)
-
-        adjusted_resid = mapply(mapply(standardized_resid,lambda x : x/np.sqrt(1 - col_prob),axis=1,progressbar=False),
-                                lambda x : x/np.sqrt(1-row_prob),axis=0,progressbar=False)
-
-        chi2_contribution = mapply(standardized_resid,lambda x : 100*(x**2)/statistic,axis=0,progressbar=False)
-
-        # Attraction/repulsion index
-        attraction_repulsion_index = X/self.expected_freq_
-
-        # Row profiles
-        row_prof = mapply(prob_conj,lambda x : x/np.sum(x), axis=1,progressbar=False)
-
-        # Column profiles
-        col_prof = mapply(prob_conj,lambda x : x/np.sum(x), axis=0,progressbar=False)
-
-        # Row distances
-        row_dist = squareform(pdist(row_prof,metric= 
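The dependence indicators assembled in _compute_indicators reduce to a few scipy.stats calls. A hedged sketch on a toy contingency table (all names illustrative, not the library's):

    import numpy as np
    import pandas as pd
    import scipy.stats as st

    N = pd.DataFrame([[20, 10, 5], [5, 15, 10], [10, 5, 20]],
                     index=["r1", "r2", "r3"], columns=["c1", "c2", "c3"])

    stat, pval, dof, expected = st.chi2_contingency(N)
    std_resid = (N - expected) / np.sqrt(expected)   # standardized residuals
    chi2_contribution = 100 * std_resid**2 / stat    # per-cell % contribution to chi2
    attraction = N / expected                        # attraction/repulsion index

    print(f"chi2 = {stat:.2f}, p = {pval:.4f}, dof = {dof}")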
"seuclidean",V=col_prob)**2) - - # Distance entre individus et l'origine - row_disto = mapply(row_prof,lambda x :np.sum((x-col_prob)**2/col_prob),axis = 1,progressbar=False) - - # Poids des observations - row_weight = row_sum/np.sum(row_sum) - # Inertie des lignes - row_inertie = row_disto*row_weight - # Affichage - row_infos = np.c_[row_disto, row_weight, row_inertie] - - ################################################################################### - # Informations sur les profils colonnes - ################################################################################### - - col_dist = squareform(pdist(col_prof.T,metric= "seuclidean",V=row_prob)**2) - - # Distance à l'origine - col_disto = mapply(col_prof.T,lambda x : np.sum((x-row_prob)**2/row_prob),axis = 1,progressbar=False) - - # Poids des colonnes - col_weight = col_sum/np.sum(col_sum) - - # Inertie des colonnes - col_inertie = col_disto*col_weight - # Affichage - col_infos = np.c_[col_disto, col_weight, col_inertie] - - inertia = np.sum(row_inertie) - - # - self._compute_stats(row_prob,col_prob,row_disto,col_disto) - - # Return indicators - self.chi2_test_ = dict({"statistic" : statistic,"pvalue":pvalue,"dof":dof}) - self.log_likelihood_test_ = dict({"statistic" : g_test_res[0],"pvalue":g_test_res[1]}) - self.contingency_association_ = dict({"cramer" : st.contingency.association(X, method="cramer"), - "tschuprow" : st.contingency.association(X, method="tschuprow"), - "pearson" : st.contingency.association(X, method="pearson")}) - self.resid_ = resid - self.row_infos_ = row_infos - self.col_infos_ = col_infos - self.adjusted_resid_ = adjusted_resid - self.chi2_contribution_ = chi2_contribution - self.attraction_repulsion_index_ = attraction_repulsion_index - self.inertia_ = inertia - self.row_dist_ = row_dist - self.col_dist_ = col_dist - - def _compute_svd(self,X): - """"Compute a Singular Value Decomposition - - Then, this function computes : - n_components_ : - """ - # Set row labels - self.row_labels_ = self.row_labels - if (self.row_labels_ is None) or (len(self.row_labels_) != self.n_rows_): - self.row_labels_ = ["row_" + str(i+1) for i in np.arange(0,self.n_rows_)] - - # Set col labels - self.col_labels_ = self.col_labels - if (self.col_labels_ is None) or (len(self.col_labels_) !=self.n_cols_): - self.col_labels_ = ["col_" + str(k+1) for k in np.arange(0,self.n_cols_)] - - # Expected frequency - self.expected_freq_ = st.contingency.expected_freq(X) - - # Standardized resid - self.standardized_resid_ = (X - self.expected_freq_)/np.sqrt(self.expected_freq_) - - # Singular Values Decomposition - U, delta, V_T = np.linalg.svd(self.standardized_resid_/np.sqrt(self.total_),full_matrices=False) - - # Eigenvalues - lamb = delta**2 - - f_max = min(self.n_rows_ -1,self.n_cols_ - 1) - eigen_values = lamb[:f_max] - difference = np.insert(-np.diff(eigen_values),len(eigen_values)-1,np.nan) - proportion = 100*eigen_values/np.sum(eigen_values) - cumulative = np.cumsum(proportion) - - # - self.n_components_ = self.n_components - if self.n_components_ is None: - self.n_components_ = (delta > 1e-16).sum() - - self.eig_ = np.array([eigen_values[:self.n_components_], - difference[:self.n_components_], - proportion[:self.n_components_], - cumulative[:self.n_components_]]) - row_weight = X.sum(axis=1)/self.total_ - col_weight = X.sum(axis=0)/self.total_ - - row_coord = np.apply_along_axis(func1d=lambda x : x/np.sqrt(row_weight),axis=0,arr=U[:,:f_max]*delta[:f_max]) - - col_coord = np.apply_along_axis(func1d=lambda x : 
x/np.sqrt(col_weight),axis=0,arr=V_T[:f_max,:].T*delta[:f_max]) - #self.data_ = np.array(X) - self.row_coord_ = row_coord[:,:self.n_components_] - self.col_coord_ = col_coord[:,:self.n_components_] - self.dim_index_ = ["Dim."+str(i+1) for i in np.arange(0,self.n_components_)] - self.kaiser_threshold_ = np.mean(eigen_values) - self.kaiser_proportion_threshold_ = 100/f_max - self.res_row_dist_ = squareform(pdist(self.row_coord_,metric="sqeuclidean")) - self.res_col_dist_ = squareform(pdist(self.col_coord_,metric="sqeuclidean")) - - self.model_ = "ca" - - def _compute_sup(self,X,row=True): - """Compute row/columns supplementary coordinates - - """ - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - if row: - row_sup_prof = np.apply_along_axis(func1d=lambda x : x/np.sum(x),axis=1,arr=X).dot(self.col_coord_)/np.sqrt(self.eig_[0]) - self.row_sup_coord_ = row_sup_prof[:,:self.n_components_] - else: - col_sup_prof = np.transpose(np.apply_along_axis(func1d=lambda x : x/np.sum(x),axis=0,arr=X)).dot(self.row_coord_)/np.sqrt(self.eig_[0]) - self.col_sup_coord_ = col_sup_prof[:,:self.n_components_] - - - def transform(self,X,y=None,row=True): - """Apply the dimensionality reduction on X - - X is projected on the first axes previous extracted from a training set. - - Parameters - ---------- - X : DataFrame of float, shape (n_rows_sup, n_columns) - New data, where n_row_sup is the number of supplementary - row points and n_columns is the number of columns - X rows correspond to supplementary row points that are - projected on the axes - X is a table containing numeric values - - y : None - y is ignored - - Returns - ------- - X_new : DataFrame of float, shape (n_rows_sup, n_components_) - X_new : coordinates of the projections of the supplementary - row points on the axes. - """ - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - if row: - row_sup_prof = np.apply_along_axis(func1d=lambda x : x/np.sum(x),axis=1,arr=X) - return row_sup_prof.dot(self.col_coord_) / np.sqrt(self.eig_[0]) - else: - col_sup_prof = np.apply_along_axis(func1d=lambda x : x/np.sum(x),axis=0,arr=X) - return col_sup_prof.T.dot(self.row_coord_)/np.sqrt(self.eig_[0]) - - def fit_transform(self,X,y=None): - """Fit the model with X and apply the dimensionality reduction on X. - - Parameters - ---------- - X : pd.DataFrame, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. 
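Stripped of bookkeeping, _compute_svd is the classic generalized SVD of the standardized residuals of the correspondence matrix. A minimal sketch of the same computation, assuming a plain 2-D array of counts (the helper name is ours, not the library's):

    import numpy as np

    def ca_coords(N):
        """Row/column principal coordinates of a contingency table N."""
        # Correspondence matrix and its margins (masses)
        P = N / N.sum()
        r, c = P.sum(axis=1), P.sum(axis=0)
        # Standardized residuals, then a plain SVD
        S = (P - np.outer(r, c)) / np.sqrt(np.outer(r, c))
        U, delta, Vt = np.linalg.svd(S, full_matrices=False)
        eig = delta**2                              # principal inertias (eigenvalues)
        row = (U * delta) / np.sqrt(r)[:, None]     # row principal coordinates
        col = (Vt.T * delta) / np.sqrt(c)[:, None]  # column principal coordinates
        return eig, row, col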
- - y : None - - Returns - ------- - X_new : array-like, shape (n_samples, n_components) - """ - self.fit(X) - - return self.row_coord_ - - -#################################################################################### -# MULTIPLE CORRESPONDENCE ANALYSIS (MCA) -#################################################################################### - -def _mul(*args): - """An internal method to multiply matrices.""" - return functools.reduce(np.dot,args) - -class MCA(BaseEstimator,TransformerMixin): - """Multiple Correspondence Analysis (MCA) - - This class inherits from sklearn BaseEstimator and TransformerMixin class - - This class performs Multiple Correspondence Analysis (MCA) with supplementary - individuals, supplementary quantitative variables and supplementary - categorical variables. - - """ - - - def __init__(self,n_components=None, - row_labels=None, - var_labels=None, - mod_labels= None, - matrix_type="completed", - benzecri=True, - greenacre=True, - tol = 1e-4, - approximate=False, - row_sup_labels = None, - quali_sup_labels = None, - quanti_sup_labels=None, - graph=True, - figsize=None): - self.n_components = n_components - self.row_labels = row_labels - self.var_labels = var_labels - self.mod_labels = mod_labels - self.matrix_type = matrix_type - self.benzecri = benzecri - self.greenacre = greenacre - self.tol = tol - self.approximate = approximate - self.row_sup_labels = row_sup_labels - self.quali_sup_labels = quali_sup_labels - self.quanti_sup_labels = quanti_sup_labels - self.graph = graph - self.figsize = figsize - - def fit(self, X, y=None): - """ - - """ - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - - self.row_sup_labels_ = self.row_sup_labels - if self.row_sup_labels_ is not None: - _X = X.drop(index = self.row_sup_labels_) - row_sup = X.loc[self.row_sup_labels_,:] - else: - _X = X - - # Extract supplementary numeric or categorical columns - self.quanti_sup_labels_ = self.quanti_sup_labels - self.quali_sup_labels_ = self.quali_sup_labels - if ((self.quali_sup_labels_ is not None) and (self.quanti_sup_labels_ is not None)): - X_ = _X.drop(columns = self.quali_sup_labels_).drop(columns = self.quanti_sup_labels_) - if self.row_sup_labels_ is not None: - row_sup = row_sup.drop(columns = self.quali_sup_labels_).drop(columns = self.quanti_sup_labels_) - elif self.quali_sup_labels_ is not None: - X_= _X.drop(columns = self.quali_sup_labels_) - if self.row_sup_labels_ is not None: - row_sup = row_sup.drop(columns = self.quali_sup_labels_) - elif self.quanti_sup_labels_ is not None: - X_ = _X.drop(columns = self.quanti_sup_labels_) - if self.row_sup_labels_ is not None: - row_sup = row_sup.drop(columns = self.quanti_sup_labels_) - else: - X_ = _X - - self.data_ = X - self.original_data_ = None - - # Supplementary initialization - self.row_sup_coord_ = None - self.row_sup_cos2_ = None - - self.quanti_sup_coord_ = None - self.quanti_sup_cos2_ = None - - self.quali_sup_coord_ = None - self.quali_sup_cos2_ = None - self.quali_sup_eta2_ = None - self.quali_sup_disto_ = None - self.quali_sup_vtest_ = None - - self.var_sup_eta2_ = None - - # Benzerci and Greenacre coorection - self.benzecri_correction_ = None - self.greenacre_correction_ = None - - self.var_labels_ = self.var_labels - if ((self.var_labels_ is not None) and (len(X_.columns) < len(self.var_labels_))): - raise 
ValueError("length of 'var_labels' must be less or equal to number of X columns.") - - if self.n_components == 1: - raise ValueError("n_components must be grather than 1.") - - - self._compute_svds(X_) - - # Compute supplementary quantitatives variables statistics - if self.quanti_sup_labels_ is not None: - self._compute_quanti_sup_stats(_X[self.quanti_sup_labels_]) - - # Compute supllementary qualitatives variables statistics - if self.quali_sup_labels_ is not None: - self._compute_quali_sup_stats(X=_X[self.quali_sup_labels_]) - - # Compute supplementrary rows statistics - if self.row_sup_labels_ is not None: - self._compute_row_sup_stats(X=row_sup) - - if self.graph: - fig, (axe1,axe2,axe3) = plt.subplots(1,3,figsize=self.figsize) - plotMCA(self,choice="ind",repel=True,ax=axe1) - plotMCA(self,choice="mod",repel=True,ax=axe2) - plotMCA(self,choice="var",repel=True,ax=axe3,xlim=(0,1),ylim=(0,1)) - - return self - - def _get_dummies(self,X): - """Convert categorical variable into dummy/indicator variables. - Each variable is converted in as many 0/1 variables as there are different values. Columns in the - output are each named after a value; if the input is a DataFrame, the name of the original variable - is prepended to the value. - - Parameters - ---------- - X : Series, or DataFrame - Data of which to get dummy indicators. - - Return - ------ - DataFrame - Dummy-coded data. If data contains other columns than the dummy-coded - one(s), these will be prepended, unaltered, to the result. - """ - dummies = (pd.get_dummies(X[cols],prefix=cols,prefix_sep='_') for cols - in (X.columns if self.var_labels_ is None else self.var_labels_)) - return pd.concat(dummies,axis=1) - - def _compute_disjonctif_table(self,X): - """Compute dummies tables - - """ - self.mod_labels_ = self.mod_labels - if ((self.var_labels_ is None) and (self.mod_labels_ is None)): - raise ValueError("Error : You must pass either 'var_labels' or 'mod_labels'.") - - self.n_rows_ = X.shape[0] - if self.matrix_type == "completed": - self.original_data_ = X - self.disjonctif_ = self._get_dummies(X) - if self.var_labels_ is None: - self.var_labels_ = list(X.columns) - elif self.matrix_type == "disjonctif": - # Chack if duplicate columns - duplicate = {x for x in list(X.columns) if list(X.columns).count(x) > 1} - if len(duplicate)>1: - raise ValueError("Error : 'X' must have unique columns.") - - # Check if underscore <<"_">> in columns - if False in [x.__contains__('_') for x in list(X.columns)]: - raise ValueError("Error : 'X' columns must have '_' to separate 'variable name' with 'modality'.", - "\n see 'https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html'.") - - self.original_data_ = from_dummies(X,sep="_") - self.disjonctif_ = X - if self.var_labels_ is None: - var = list([x.split("_",1)[0] for x in self.disjonctif_.columns]) - self.var_labels_ = reduce(lambda re, x: re+[x] if x not in re else re, var, []) - else: - raise ValueError("Error : Allowed values for the argument matrix_type are : 'completed' or 'disjonctif'.") - - self.mod_labels_ = self.mod_labels - if self.mod_labels_ is None: - self.mod_labels_ = self.disjonctif_.columns - - self.n_mods_ = len(self.mod_labels_) - self.n_vars_ = len(self.var_labels_) - self.short_labels_ = list([x.split("_",1)[-1] for x in self.mod_labels_]) - - def _compute_svds(self,X): - - """ - - """ - - self._compute_disjonctif_table(X) - self._compute_stats() - - # Set row labels - self.row_labels_ = self.row_labels - if ((self.row_labels_ is None) or (len(self.row_labels_) 
!= self.n_rows_)): - self.row_labels_ = ["row_" + str(i+1) for i in np.arange(0,self.n_rows_)] - - - S = self.disjonctif_.sum().sum() - Z = self.disjonctif_/S # Correspondence matrix - self.r_ = Z.sum(axis=1) - self.c_ = Z.sum(axis=0) - - eps = np.finfo(float).eps - self.D_r = np.diag(1/(eps + np.sqrt(self.r_))) - self.D_c = np.diag(1/(eps + np.sqrt(self.c_))) - Z_c = Z - np.outer(self.r_, self.c_) # standardized residuals matrix - - product = self.D_r.dot(Z_c).dot(self.D_c) - self._numitems = len(X) - U, delta, V_T = np.linalg.svd(product) - - eigen_value = delta ** 2 - difference = np.insert(-np.diff(eigen_value),len(eigen_value)-1,np.nan) - proportion = 100*eigen_value/np.sum(eigen_value) - cumulative = np.cumsum(proportion) - - - self.n_components_ = self.n_components - if self.n_components_ is None: - self.n_components_ = self.n_mods_ - self.n_vars_ - - self.dim_index_ = ["Dim."+str(i+1) for i in np.arange(0,self.n_components_)] - - if self.benzecri: - self._benzecri(eigen_value) - - if self.greenacre: - self._greenacre(eigen_value) - - # Row and columns coordinates - row_coord = (self.D_r.dot(U).dot(sp.linalg.diagsvd(delta[:self.n_components_],self._numitems,self.n_components_))) - mod_coord = _mul(self.D_c, V_T.T, sp.linalg.diagsvd(delta[:self.n_components_],len(V_T),self.n_components_)) - - # Store information - self.eig_ = np.array([eigen_value[:self.n_components_], - difference[:self.n_components_], - proportion[:self.n_components_], - cumulative[:self.n_components_]]) - - # Normalized columns coordinates : see (Saporta, p235) or (Husson, 138) - corrected_mod_coord = np.apply_along_axis(func1d=lambda x: x*np.sqrt(self.eig_[0]),axis=1,arr=mod_coord) - - # Row and columns cos2 - row_cos2 = np.apply_along_axis(lambda x : x**2/np.linalg.norm(row_coord,axis=1)**2,axis=0,arr=row_coord) - mod_cos2 = np.apply_along_axis(lambda x : x**2/np.linalg.norm(mod_coord,axis=1)**2,axis=0,arr=mod_coord) - - # Row and columns contribution - row_contrib = np.apply_along_axis(lambda x : 100*x**2/(self.n_rows_*eigen_value[:self.n_components_]),axis=1,arr=row_coord) - mod_contrib = np.apply_along_axis(lambda x : x/eigen_value[:self.n_components_],axis=1, - arr=np.apply_along_axis(lambda x : 100*x**2*self.c_,axis=0,arr=mod_coord)) - - # Valeur test des modalités - dummies_sum = self.disjonctif_.sum(axis=0) - mod_vtest = np.apply_along_axis(func1d=lambda x : x*np.sqrt(((self.n_rows_ - 1)*dummies_sum)/(self.n_rows_ - dummies_sum)), - axis=0,arr=mod_coord) - - # Qualitative informations - mod_coord_df = pd.DataFrame(mod_coord,index=self.mod_labels_,columns=self.dim_index_) - dummies_mean = self.disjonctif_.mean(axis=0) - var_eta2 = mapply(mod_coord_df,lambda x : x**2,axis=0,progressbar=False).mul(dummies_mean, axis='index') - var_eta2 = pd.concat((mapply(var_eta2.loc[filter(lambda x: x.startswith(cols),var_eta2.index),:],lambda x : np.sum(x), - axis=0,progressbar=False).to_frame(name=cols).T for cols in self.var_labels_),axis=0) - - # Cosinus carrés des variables qualitatives - denom = np.array([len(np.unique(self.original_data_[[col]]))-1 for col in self.var_labels_]) - var_cos2 = var_eta2.div(denom,axis="index") - var_contrib = mapply(var_eta2,lambda x : 100*x/self.eig_[0],axis=1,progressbar=False) - - # Store all informations - self.row_coord_ = row_coord - self.row_cos2_ = row_cos2 - self.row_contrib_ = row_contrib - - self.mod_coord_ = mod_coord - self.corrected_mod_coord_ = corrected_mod_coord - self.mod_cos2_ = mod_cos2 - self.mod_contrib_ = mod_contrib - self.mod_vtest_ = mod_vtest - - # Inertia - 
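The test value attached to each category above has a simple closed form: under random allocation of its n_k carriers, a category's coordinate is approximately Gaussian, so |v| > 2 flags a coordinate significantly far from the origin at roughly the 5% level. A sketch (function name illustrative):

    import numpy as np

    def category_vtest(coord, n_k, n):
        # coord: category coordinate on one axis, n_k: category count, n: sample size
        return coord * np.sqrt(((n - 1) * n_k) / (n - n_k))

    print(category_vtest(0.35, n_k=40, n=100))   # ~2.84 -> significant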
self.inertia_ = self.n_mods_/self.n_vars_ - 1 - - # Eigenvalue threshold - self.kaiser_threshold_ = 1/self.n_vars_ - self.kaiser_proportion_threshold_ = 100/self.inertia_ - - ## Ajout des informations sur les variables - self.var_eta2_ = np.array(var_eta2) - self.var_cos2_ = np.array(var_cos2) - self.var_contrib_ = np.array(var_contrib) - - self.model_ = "mca" - - def _compute_stats(self): - - """ - - - """ - - chi2_stats = np.zeros(shape=(self.n_vars_,self.n_vars_)) - chi2_pvalue = np.zeros(shape=(self.n_vars_,self.n_vars_)) - for i in np.arange(0,self.n_vars_): - for j in np.arange(0,self.n_vars_): - tab = pd.crosstab(self.original_data_.iloc[:,i],self.original_data_.iloc[:,j]) - chi = st.chi2_contingency(tab) - chi2_stats[i,j],chi2_pvalue[i,j]= chi[0],chi[1] - - self.chi2_test_ = dict({"statistic": pd.DataFrame(chi2_stats,index=self.var_labels_,columns=self.var_labels_), - "pvalue" : pd.DataFrame(chi2_pvalue,index=self.var_labels_,columns=self.var_labels_) - }) - - # Marke ligne - row_marge = self.disjonctif_.sum(axis=0) - - # Profil individu moyen - ind_moyen = row_marge/(self.n_rows_*self.n_vars_) - - # Distance du chi2 entre les individus - row_dist = squareform(pdist(self.disjonctif_/self.n_vars_,metric="seuclidean",V=ind_moyen)**2) - - # Distance des observations à l'origine - row_disto = mapply(self.disjonctif_,lambda x : np.sum((1/ind_moyen)*(x/self.n_vars_ - ind_moyen)**2),axis=1,progressbar=False) - - # Poids des observations - row_weight = np.ones(self.n_rows_)/self.n_rows_ - - # Inertie des observations - row_inertia = row_disto*row_weight - - row_infos = np.c_[np.sqrt(row_disto), row_weight, row_inertia] - - ######################################################################################################### - # Informations sur les modalités - ######################################################################################################### - - # Distance chi2 entre les modalités - dummies_weight = self.disjonctif_.div(row_marge,axis="columns") - mod_dist = self.n_rows_*squareform(pdist(dummies_weight.T,metric="sqeuclidean")) - - # Distance des modalités à l'origine - mod_disto = mapply(dummies_weight,lambda x : np.sum(self.n_rows_*(x-row_weight)**2),axis = 0,progressbar=False) - - # Poids des modalités - mod_weight = ind_moyen - - # Inertie des modalités - mod_inertia = mod_disto * mod_weight - - mod_infos = np.c_[np.sqrt(mod_disto),mod_weight,mod_inertia] - - ######################################################################################################### - # Informations sur les variables - ######################################################################################################### - - # Inertia for the variables - var_inertia = np.c_[np.array([(len(np.unique(self.original_data_[col]))-1)/self.n_vars_ for col in self.original_data_.columns])] - - ######################################################################################################### - # Store all informations - ######################################################################################################### - - # Store informations - self.row_dist_ = row_dist - self.row_infos_ = row_infos - self.mod_dist_ = mod_dist - self.mod_infos_ = mod_infos - self.var_inertia_ = var_inertia - - def _benzecri(self,X): - """Compute Benzécri correction - - """ - # save eigen value grather than threshold - lambd = X[X>(1/self.n_vars_)] - - if len(lambd) > 0: - # Apply benzecri correction - lambd_tilde = ((self.n_vars_/(self.n_vars_-1))*(lambd - 1/self.n_vars_))**2 - - # 
Cumulative percentage - s_tilde = 100*(lambd_tilde/np.sum(lambd_tilde)) - - # Benzecri correction - self.benzecri_correction_ = pd.DataFrame(np.c_[lambd_tilde,s_tilde,np.cumsum(s_tilde)], - columns=["eigenvalue","proportion","cumulative"], - index = list(["Dim."+str(x+1) for x in np.arange(0,len(lambd))])) - - def _greenacre(self,X): - """Compute Greenacre correction - - """ - # save eigen value grather than threshold - lambd = X[X>(1/self.n_vars_)] - - if len(lambd) > 0: - lambd_tilde = ((self.n_vars_/(self.n_vars_-1))*(lambd - 1/self.n_vars_))**2 - - s_tilde_tilde = self.n_vars_/(self.n_vars_-1)*(np.sum(X**2)-(self.n_mods_-self.n_vars_)/(self.n_vars_**2)) - - tau = 100*(lambd_tilde/s_tilde_tilde) - - self.greenacre_correction_ = pd.DataFrame(np.c_[lambd_tilde,tau,np.cumsum(tau)], - columns=["eigenvalue","proportion","cumulative"], - index = list(["Dim."+str(x+1) for x in np.arange(0,len(lambd))])) - - def _compute_row_sup_stats(self,X, y=None): - """ Apply the dimensionality reduction on X. X is projected on - the first axes previous extracted from a training set. - Parameters - ---------- - X : array of string, int or float, shape (n_rows_sup, n_vars) - New data, where n_rows_sup is the number of supplementary - row points and n_vars is the number of variables. - X is a data table containing a category in each cell. - Categories can be coded by strings or numeric values. - X rows correspond to supplementary row points that are - projected onto the axes. - - y : None - y is ignored. - Returns - ------- - X_new : array of float, shape (n_rows_sup, n_components_) - X_new : coordinates of the projections of the supplementary - row points onto the axes. - """ - if self.matrix_type == "completed": - n_rows = X.shape[0] - n_cols = len(self.mod_labels_) - Y = np.zeros((n_rows,n_cols)) - for i in np.arange(0,n_rows,1): - values = [self.var_labels_[k] +"_"+str(X.iloc[i,k]) for k in np.arange(0,self.n_vars_)] - for j in np.arange(0,n_cols,1): - if self.mod_labels_[j] in values: - Y[i,j] = 1 - row_sup_dummies = pd.DataFrame(Y,columns=self.mod_labels_,index=X.index) - else: - row_sup_dummies = X - row_sup_profil = (mapply(row_sup_dummies,lambda x : x/np.sum(x),axis=1,progressbar=False) - .dot(self.mod_coord_)/np.sqrt(self.eig_[0])) - - self.row_sup_coord_ = np.array(row_sup_profil) - self.row_sup_cos2_ = np.apply_along_axis(lambda x : x**2/np.linalg.norm(self.row_sup_coord_,axis=1)**2, - axis=0,arr=self.row_sup_coord_) - - dict({"coord" : self.row_sup_coord_, - "cos2" : self.row_sup_cos2_}) - - def _compute_quali_sup_stats(self,X,y=None): - """Find the supplementary categorical columns factor - - """ - # Test if X is a DataFrame - if isinstance(X,pd.Series): - X = X.to_frame() - elif not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. 
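Benzécri and Greenacre share the same rescaling of the indicator-matrix eigenvalues; they differ only in the total used for the percentages. A compact sketch of the two formulas above (helper name ours), with Q variables and J categories:

    import numpy as np

    def corrected_inertia(eig, Q, J):
        lam = eig[eig > 1 / Q]                          # keep eigenvalues > 1/Q
        lam_t = ((Q / (Q - 1)) * (lam - 1 / Q)) ** 2    # corrected pseudo-eigenvalues
        benzecri = 100 * lam_t / lam_t.sum()            # % of corrected inertia
        S = (Q / (Q - 1)) * ((eig ** 2).sum() - (J - Q) / Q ** 2)
        greenacre = 100 * lam_t / S                     # % of average off-diagonal inertia
        return lam_t, benzecri, greenacre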
For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - dummies = pd.concat((pd.get_dummies(X,prefix=cols,prefix_sep = "_") for cols in X.columns),axis=1) - mod_sup_stats = dummies.agg(func=[np.sum,np.mean]).T - - n_k = dummies.sum(axis=0) - p_k = dummies.mean(axis=0) - - mod_sup_labels = dummies.columns - short_sup_labels = list([x.split("_",1)[-1] for x in mod_sup_labels]) - - mod_sup_coord = mapply(dummies,lambda x : x/np.sum(x),axis=0,progressbar=False).T.dot(self.row_coord_)/np.sqrt(self.eig_[0]) - - # Rapport de corrélation - """ - quali_sup_eta2 = pd.concat(((mapply(mod_sup_coord,lambda x : x**2,axis=0,progressbar=False).mul(p_k,axis="index") - .loc[filter(lambda x: x.startswith(cols),mod_sup_coord.index),:] - .sum(axis=0).to_frame(name=cols).T.div(self.eig_[0])) for cols in X.columns),axis=0) - """ - - mod_sup_cos2 = mapply(mod_sup_coord,lambda x: x**2/np.linalg.norm(mod_sup_coord,axis=1)**2,axis=0,progressbar=False) - - mod_sup_disto = (1/p_k)-1 - mod_sup_vtest = mapply(mod_sup_coord,lambda x : x*np.sqrt(((self.n_rows_-1)*n_k.values)/(self.n_rows_ - n_k.values)),axis=0,progressbar=False) - - # Store supplementary categories informations - self.mod_sup_coord_ = np.array(mod_sup_coord) - self.mod_sup_cos2_ = np.array(mod_sup_cos2) - self.mod_sup_disto_ = np.array(mod_sup_disto) - self.mod_sup_stats_ = np.array(mod_sup_stats) - self.mod_sup_vtest_ = np.array(mod_sup_vtest) - - self.mod_sup_labels_ = mod_sup_labels - self.short_sup_labels_ = short_sup_labels - - return dict({"coord" : mod_sup_coord, - "cos2" : mod_sup_cos2, - "dist" : mod_sup_disto.to_frame("Dist"), - "stats" : mod_sup_stats, - "vtest" : mod_sup_vtest}) - - def _compute_quanti_sup_stats(self,X,y=None): - """Find the supplementary quantitative columns factor - - """ - - # Test if X is a DataFrame - if isinstance(X,pd.Series): - X = X.to_frame() - elif not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - - # Supplementary quantitatives coordinates - quanti_sup_coord = np.transpose(np.corrcoef(x=self.row_coord_,y=X.values,rowvar=False)[:self.n_components_,self.n_components_:]) - - # Supplementary quantitatives cos2 - quanti_sup_cos2 = np.apply_along_axis(func1d=lambda x : x**2,arr = quanti_sup_coord,axis=0) - - # Store supplementary quantitatives informations - self.quanti_sup_coord_ = quanti_sup_coord[:,:self.n_components_] - self.quanti_sup_cos2_ = quanti_sup_cos2[:,:self.n_components_] - - return dict({"coord" : quanti_sup_coord[:,:self.n_components_], - "cos2" : quanti_sup_cos2[:,:self.n_components_]}) - - def transform(self,X,y=None): - """ Apply the dimensionality reduction on X. X is projected on - the first axes previous extracted from a training set. - Parameters - ---------- - X : array of string, int or float, shape (n_rows_sup, n_vars) - New data, where n_rows_sup is the number of supplementary - row points and n_vars is the number of variables. - X is a data table containing a category in each cell. - Categories can be coded by strings or numeric values. - X rows correspond to supplementary row points that are - projected onto the axes. - - y : None - y is ignored. - Returns - ------- - X_new : array of float, shape (n_rows_sup, n_components_) - X_new : coordinates of the projections of the supplementary - row points onto the axes. 
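Supplementary categories are placed at the barycentre of the individuals carrying them, shrunk by 1/sqrt(eigenvalue) on each axis. A sketch, assuming a 0/1 dummy DataFrame aligned with the training rows (names illustrative):

    import numpy as np

    def sup_category_coord(dummies, row_coord, eig):
        # Column profiles: each category column sums to 1 over individuals
        profile = dummies / dummies.sum(axis=0)
        # Barycentre of row coordinates, scaled per axis by 1/sqrt(lambda)
        return profile.T.dot(row_coord) / np.sqrt(eig)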
- """ - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - #self._compute_row_sup_stats(X) - if self.matrix_type == "completed": - n_rows = X.shape[0] - n_cols = len(self.mod_labels_) - Y = np.zeros((n_rows,n_cols)) - for i in np.arange(0,n_rows,1): - values = [self.var_labels_[k] +"_"+str(X.iloc[i,k]) for k in np.arange(0,self.n_vars_)] - for j in np.arange(0,n_cols,1): - if self.mod_labels_[j] in values: - Y[i,j] = 1 - row_sup_dummies = pd.DataFrame(Y,columns=self.mod_labels_,index=X.index) - else: - row_sup_dummies = X - row_sup_coord = (mapply(row_sup_dummies,lambda x : x/np.sum(x),axis=1,progressbar=False) - .dot(self.mod_coord_)/np.sqrt(self.eig_[0])) - - row_sup_coord = np.array(row_sup_coord) - return row_sup_coord - - def fit_transform(self,X,y=None): - """Fit the model with X and apply the dimensionality reduction on X. - - Parameters - ---------- - X : pd.DataFrame, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. - - y : None - y is ignored - - Returns - ------- - X_new : array-like, shape (n_samples, n_components) - """ - - self.fit(X) - return self.row_coord_ - - - -############################################################################################# -# FACTOR ANALYSIS OF MIXED DATA (FAMD) -############################################################################################# - -class FAMD(BaseEstimator,TransformerMixin): - """Factor Analysis of Mixed Data - - Performs Factor Analysis of Mixed Data (FAMD) with supplementary - individuals, supplementary quantitative variables and supplementary - categorical variables. 
- - Parameters: - ----------- - see scientisttools.decomposition.PCA and scientisttools.decomposition.MCA - - """ - def __init__(self, - normalize=True, - n_components=None, - row_labels=None, - quanti_labels=None, - quali_labels=None, - row_sup_labels=None, - quanti_sup_labels=None, - quali_sup_labels=None, - graph=False, - figsize=None): - self.normalize =normalize - self.n_components = n_components - self.row_labels = row_labels - self.quanti_labels = quanti_labels - self.quali_labels = quali_labels - self.row_sup_labels = row_sup_labels - self.quanti_sup_labels = quanti_sup_labels - self.quali_sup_labels = quali_sup_labels - self.graph = graph - self.figsize= figsize - - def fit(self,X): - """ - - - """ - if not isinstance(X,pd.DataFrame): - raise ValueError("Error : 'X' must be a data.frame") - - # Extract supplementary rows - self.row_sup_labels_ = self.row_sup_labels - if self.row_sup_labels_ is not None: - _X = X.drop(index = self.row_sup_labels_) - row_sup = X.loc[self.row_sup_labels_,:] - else: - _X = X - - # Extract supplementary numeric or categorical columns - self.quali_sup_labels_ = self.quali_sup_labels - self.quanti_sup_labels_ = self.quanti_sup_labels - if ((self.quali_sup_labels_ is not None) and (self.quanti_sup_labels_ is not None)): - X_ = _X.drop(columns = self.quali_sup_labels_).drop(columns = self.quanti_sup_labels_) - if self.row_sup_labels_ is not None: - row_sup = row_sup.drop(columns = self.quali_sup_labels_).drop(columns = self.quanti_sup_labels_) - elif self.quali_sup_labels_ is not None: - X_= _X.drop(columns = self.quali_sup_labels_) - if self.row_sup_labels_ is not None: - row_sup = row_sup.drop(columns = self.quali_sup_labels_) - elif self.quanti_sup_labels_ is not None: - X_ = _X.drop(columns = self.quanti_sup_labels_) - if self.row_sup_labels_ is not None: - row_sup = row_sup.drop(columns = self.quanti_sup_labels_) - else: - X_ = _X - - # Save initial data - self.data_ = X - - # Supplementary initialization - self.row_sup_coord_ = None - self.row_sup_cos2_ = None - - # Additional information for supplementary continuous variables - self.col_sup_coord_ = None - self.col_sup_cos2_ = None - self.col_sup_ftest_ = None - - # Additionnal informations supplementary categories - self.mod_sup_stats_ = None - self.mod_sup_coord_ = None - self.mod_sup_cos2_ = None - self.mod_sup_disto_ = None - self.mod_sup_vtest_ = None - - #Additionnal informations for supplementary categorical informations - self.quali_sup_eta2_ = None - - # Compute statistics - self.n_rows_ = X_.shape[0] - X_quant = X_.select_dtypes(include=np.number) - X_qual = X_.select_dtypes(include=["object"]) - - #Initialize lables - self.row_labels_ = self.row_labels - if self.row_labels_ is None: - self.row_labels_ = X.index - - self.quali_labels_ = self.quali_labels - if self.quali_labels_ is None: - self.quali_labels_ = X_qual.columns - - self.quanti_labels_ = self.quanti_labels - if self.quanti_labels_ is None: - self.quanti_labels_ = X_quant.columns - - self.quanti_data_ = X_quant - self.quali_data_ = X_qual - - # Pearson correlation between continuous variables - self.col_corr_ = np.array(X_quant.corr(method="pearson")) - - # Partial correlation between continuous variables - self.col_pcorr_ = np.array(X_quant.pcorr()) - - chi2_stats = np.zeros(shape=(len(self.quali_labels_),len(self.quali_labels_))) - chi2_pvalue = np.zeros(shape=(len(self.quali_labels_),len(self.quali_labels_))) - for i,lab1 in enumerate(self.quali_labels_): - for j,lab2 in enumerate(self.quali_labels_): - tab = 
pd.crosstab(X_.iloc[:,i],X_.iloc[:,j]) - chi = st.chi2_contingency(tab) - chi2_stats[i,j],chi2_pvalue[i,j]= chi[0],chi[1] - - self.chi2_test_ = dict({"statistic": pd.DataFrame(chi2_stats,index=self.quali_labels_,columns=self.quali_labels_), - "pvalue" : pd.DataFrame(chi2_pvalue,index=self.quali_labels_,columns=self.quali_labels_) - }) - - # Normalisation des variables qualitatives - dummies = pd.concat((pd.get_dummies(X_qual[cols],prefix=cols,prefix_sep='_') for cols in self.quali_labels_),axis=1) - - n_k = dummies.sum(axis=0) - self.dummies_means_ = dummies.mean(axis=0) - self.dummies_std_ = np.sqrt(self.dummies_means_) - mod_stats = dummies.agg(func=[np.sum,np.mean]).T - - # Centrage et réduction - self.means_ = np.mean(X_quant.values, axis=0).reshape(1,-1) - if self.normalize: - self.std_ = np.std(X_quant.values,axis=0,ddof=0).reshape(1,-1) - Z1 = (X_quant - self.means_)/self.std_ - else: - Z1 = X_quant - self.means_ - - Z2 = mapply(dummies,lambda x: x/np.sqrt(self.dummies_means_.values),axis = 1,progressbar=False) - - Z = pd.concat([Z1,Z2],axis=1) - - # Distance between individuals - row_dist = squareform(pdist(Z,metric='sqeuclidean')) - - # Distance between individuals and inertia center - row_disto = (mapply(Z1,lambda x:np.sum(x**2),axis=1,progressbar=False) + - mapply(dummies,lambda x:np.sum(1/self.dummies_means_.values*(x-self.dummies_means_.values)**2), - axis=1,progressbar=False)) - # Individuals weight - row_weight = np.ones(self.n_rows_)/self.n_rows_ - - # Individuals inertia - row_inertie = row_disto*row_weight - - row_infos = np.c_[np.sqrt(row_disto),row_weight,row_inertie] - - ################################ - dummies_weight = dummies.div(n_k,axis="columns") - - mod_dist = self.n_rows_*squareform(pdist(dummies_weight.T,metric="sqeuclidean")) - - # Distance à l'origine - mod_disto = mapply(dummies_weight,lambda x : np.sum(self.n_rows_*(x-row_weight)**2),axis=0,progressbar=False) - - # Poids des modalités - mod_weight = n_k/(self.n_rows_*dummies.shape[1]) - - # Inertie des lignes - mod_inertie = mod_disto*mod_weight - - mod_infos = np.c_[np.sqrt(mod_disto), mod_weight, mod_inertie] - - self.row_infos_ = row_infos - self.mod_infos_ = mod_infos - self.row_dist_ = row_dist - self.mod_dist_ = mod_dist - self.mod_stats_ = np.array(mod_stats) - self.normalized_data_ = Z - self.mod_labels_ = dummies.columns - self.short_labels_ = list([x.split("_",1)[-1] for x in dummies.columns]) - - self._compute_svd(X=Z,Xq=X_qual,Iq=n_k) - - if self.row_sup_labels_ is not None: - self._compute_row_sup_stats(X=row_sup) - - if self.quanti_sup_labels_ is not None: - self._compute_quanti_sup_stats(X=_X[self.quanti_sup_labels_]) - - if self.quali_sup_labels_ is not None: - self._compute_quali_sup_stats(X=_X[self.quali_sup_labels_]) - return self - - def _compute_svd(self,X,Xq,Iq): - """Compute Singular Value Decomposition - - - - """ - - f_max = X.shape[1] - len(self.quali_labels_) - - self.n_components_ = self.n_components - if self.n_components_ is None: - self.n_components_ = f_max - elif not isinstance(self.n_components_,int): - raise ValueError("Error : 'n_components' must be an integer.") - elif self.n_components_ <= 0: - raise ValueError("Error : 'n_components' must be positive integers") - elif self.n_components_ > f_max: - raise ValueError(f"Error : 'n_components' must be less or equal to {f_max}") - - self.dim_index_ = ["Dim."+str(x+1) for x in np.arange(0,self.n_components_)] - - res = PCA(normalize=False,n_components=self.n_components_,row_labels=X.index,col_labels=X.columns).fit(X) - - 
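The heart of FAMD is the common rescaling that lets a single non-normalized PCA handle both variable types: numeric columns are standardized, each 0/1 indicator is divided by the square root of its proportion p_k, and PCA's centering does the rest. A standalone sketch under those assumptions (helper name ours):

    import numpy as np
    import pandas as pd

    def famd_scale(X):
        num = X.select_dtypes(include=np.number)
        cat = X.select_dtypes(exclude=np.number)
        Z1 = (num - num.mean()) / num.std(ddof=0)        # standardized numerics
        D = pd.concat([pd.get_dummies(cat[c], prefix=c, prefix_sep="_", dtype=float)
                       for c in cat.columns], axis=1)
        Z2 = D / np.sqrt(D.mean())                        # indicator / sqrt(p_k)
        return pd.concat([Z1, Z2], axis=1)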
########### Store all informations - self.eig_ = res.eig_ - self.eigen_vectors_ = res.eigen_vectors_ - - ####### Row - Cos2 & contrib - row_cos2 = np.apply_along_axis(func1d=lambda x : x**2/(self.row_infos_[:,0]**2),axis=0,arr=res.row_coord_) - row_contrib = np.apply_along_axis(func1d=lambda x : 100*x**2/(self.n_rows_*res.eig_[0]),axis=1,arr=res.row_coord_) - - # Row informations - self.row_coord_ = res.row_coord_ - self.row_contrib_ = row_contrib - self.row_cos2_ = row_cos2 - self.res_row_dist_ = squareform(pdist(self.row_coord_,metric="sqeuclidean")) - - # Coordinates for quantitatives columns - var_mod_coord = pd.DataFrame(res.col_coord_,index=X.columns,columns=self.dim_index_) - col_coord = var_mod_coord.loc[self.quanti_labels_,:] - - ####### Quantitative columns - Cos2 & Contrib - col_cos2 = mapply(col_coord,lambda x : x**2, axis=1,progressbar=False) - col_contrib = mapply(col_coord,lambda x : 100*x**2/res.eig_[0],axis=1,progressbar=False) - - # Test de significativité de Fisher - col_ftest = mapply(col_coord,lambda x : (1/2)*np.sqrt(self.n_rows_-3)*np.log((1+x)/(1-x)),axis=0,progressbar=False) - - # Quantitatives informations - self.col_coord_ = np.array(col_coord) - self.col_cos2_ = np.array(col_cos2) - self.col_contrib_ = np.array(col_contrib) - self.col_ftest_ = np.array(col_ftest) - - # Continuous labels - self.col_labels_ = self.quanti_labels_ - - # Modality informations - mod_coord = self._correct_modality(X=Xq) - - coord_mod = var_mod_coord.loc[self.mod_labels_,:] - - mod_cos2 = mapply(mod_coord,lambda x : x**2/(self.mod_infos_[:,0]**2), axis=0,progressbar=False) - mod_contrib = mapply(coord_mod,lambda x : 100*x**2/res.eig_[0],axis = 1,progressbar=False) - mod_vtest = mapply(mapply(mod_coord,lambda x : x*np.sqrt(((self.n_rows_-1)*Iq.values)/(self.n_rows_-Iq.values)), - axis=0,progressbar=False), - lambda x : x/np.sqrt(res.eig_[0]),axis=1,progressbar=False) - - # Qualitative informations - var_eta2 = pd.concat((mapply(coord_mod.loc[filter(lambda x: x.startswith(cols),coord_mod.index),:], - lambda x : x**2,axis=1,progressbar=False).sum().to_frame(name=cols).T for cols in self.quali_labels_),axis=0) - - # Cosinus carrés des variables qualitatives - denom = np.array([len(np.unique(Xq[[col]]))-1 for col in self.quali_labels_]) - var_cos2 = var_eta2.div(denom,axis="index") - var_contrib = mapply(var_eta2,lambda x : 100*x/res.eig_[0],axis=1,progressbar=False) - - # Modality informations - self.coord_mod_ = np.array(coord_mod) - self.mod_coord_ = np.array(mod_coord) - self.mod_cos2_ = np.array(mod_cos2) - self.mod_contrib_ = np.array(mod_contrib) - self.mod_vtest_ = np.array(mod_vtest) - - # Information sur les variables qualitatives - self.var_mod_coord_ = np.array(var_mod_coord) - self.var_eta2_ = np.array(var_eta2) - self.var_cos2_ = np.array(var_cos2) - self.var_contrib_ = np.array(var_contrib) - - self.model_ = "famd" - - def _correct_modality(self,X): - """ - - - """ - # Test if X is a DataFrame - if isinstance(X,pd.Series): - X = X.to_frame() - elif not isinstance(X,pd.DataFrame): - raise ValueError("Error : 'X' must be a DataFrame.") - - # Modified modality coordinates - dummies = pd.concat((pd.get_dummies(X[cols],prefix=cols,prefix_sep='_') for cols in X.columns),axis=1) - modified_mod_coord = pd.concat((pd.concat((pd.DataFrame(self.row_coord_,index=self.row_labels_, - columns=self.dim_index_),dummies[cols]),axis=1) - .groupby(cols).mean().iloc[1,:].to_frame(name=cols).T for cols in dummies.columns),axis=0) - - return modified_mod_coord - - def _compute_row_sup_stats(self,X): 
- """Compute supplementary individuals coordinates - - Parameters - ---------- - X : DataFrame, shape (n_rows_sup, n_columns) - New data, where n_row_sup is the number of supplementary - row points and n_columns is the number of columns - X rows correspond to supplementary row points that are - projected on the axes - X is a table containing numeric values - - y : None - y is ignored - - Returns - ------- - X_new : DataFrame of float, shape (n_rows_sup, n_components_) - X_new : coordinates of the projections of the supplementary - row points on the axes. - - """ - - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - - X_sup_quant = X[self.quanti_labels_] - X_sup_qual = X[self.quali_labels_] - - if self.normalize: - Z1 = (X_sup_quant - self.means_)/self.std_ - else: - Z1 = X_sup_quant - self.means_ - - # Standardscale Categorical Variable - n_rows = X_sup_qual.shape[0] - n_cols = len(self.mod_labels_) - Y = np.zeros((n_rows,n_cols)) - for i in np.arange(0,n_rows,1): - values = [self.quali_labels_[k] +"_"+str(X_sup_qual.iloc[i,k]) for k in np.arange(0,len(self.quali_labels_))] - for j in np.arange(0,n_cols,1): - if self.mod_labels_[j] in values: - Y[i,j] = 1 - row_sup_dummies = pd.DataFrame(Y,columns=self.mod_labels_,index=X.index) - - # New normalized Data - Z2 = mapply(row_sup_dummies,lambda x : (x - self.dummies_means_)/self.dummies_std_,axis=1,progressbar=False) - - # Supplementary individuals coordinates - row_sup_coord = np.dot(pd.concat([Z1,Z2],axis=1),self.eigen_vectors_) - - # Supplementary individuals distance to inertia - row_sup_disto = (mapply(Z1,lambda x:np.sum(x**2),axis=1,progressbar=False) + - mapply(row_sup_dummies,lambda x:np.sum(1/self.dummies_means_.values*(x-self.dummies_means_.values)**2), - axis=1,progressbar=False)) - - row_sup_cos2 = np.apply_along_axis(func1d=lambda x : x**2/(row_sup_disto),axis=0,arr=row_sup_coord) - - # Save - self.row_sup_coord_ = row_sup_coord[:,:self.n_components_] - self.row_sup_disto_ = np.sqrt(np.array(row_sup_disto)) - self.row_sup_cos2_ = row_sup_cos2 - - def _compute_quanti_sup_stats(self,X,y=None): - """Comupute supplementary continuous variables statistics - - Parameters - ---------- - self : An instance of class FAMD - X : DataFrame (n_rows,n_columns) - y : None - y is ignored - - Return - ------ - col_sup_corr_ : Pearson correlation between new continuous variables and old continuous variables - col_sup_coord_ : Supplementary continuous coordinates - col_sup_cos2_ : Supplementary continuous cosines - col_sup_ftest_ : Supplementary continuous Fisher - test - """ - # Test if X is a DataFrame - if isinstance(X,pd.Series): - X = X.to_frame() - elif not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. 
For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - # Correlation between New continuous variables and old continuous variables - col_sup_corr = np.zeros((len(X.columns),len(self.quanti_labels_))) - for i, lab in enumerate(X.columns): - for j, name in enumerate(self.quanti_labels_): - col_sup_corr[i,j] = st.pearsonr(X[lab],self.quanti_data_[name]).statistic - - # Supplementary continuous coordinates - col_sup_coord = np.transpose(np.corrcoef(x=self.row_coord_,y=X.values,rowvar=False)[:self.n_components_,self.n_components_:]) - - # Fisher - test for columns coordinates - col_sup_ftest = np.apply_along_axis(func1d=lambda x : (1/2)*np.sqrt(self.n_rows_-3)*np.log((1+x)/(1-x)),axis=0,arr=col_sup_coord) - - # Supplementary continuous cos2 - col_sup_cos2 = np.apply_along_axis(func1d=lambda x : x**2,arr = col_sup_coord,axis=0) - - # Store supplementary continuous informations - self.col_sup_corr_ = col_sup_corr - self.col_sup_coord_ = col_sup_coord[:,:self.n_components_] - self.col_sup_cos2_ = col_sup_cos2[:,:self.n_components_] - self.col_sup_ftest_ = col_sup_ftest[:,:self.n_components_] - - # Self - self.col_sup_labels_ = X.columns - - return dict({"corr" : pd.DataFrame(self.col_sup_corr_, index=self.col_sup_labels_,columns=self.col_labels_), - "coord" : pd.DataFrame(self.col_sup_coord_,index=self.col_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.col_sup_cos2_, index = self.col_sup_labels_,columns=self.dim_index_), - "ftest" : pd.DataFrame(self.col_sup_ftest_,index = self.col_sup_labels_,columns=self.dim_index_) - }) - - def _compute_quali_sup_stats(self,X,y=None): - """Compute statistics supplementary categorical variables - - Parameters - ---------- - self : An instance of class FAMD - X : DataFrame (n_rows,n_columns) - y : None - y is ignored - - Return - ------ - chi2_sup_test_ : chi-squared test - mod_sup_coord_ : Supplementary categories coordinates - mod_sup_cos2_ : Supplementary categories cosines - mod_sup_disto_ : Supplementary categories distance - mod_sup_stats_ : Statistic for supplementary categories (count and percentage) - """ - # Test if X is a DataFrame - if isinstance(X,pd.Series): - X = X.to_frame() - elif not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. 
For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - # Chi-squared test between old and new categorical variables - chi2_sup_stats = np.zeros(shape=(X.shape[1],len(self.quali_labels_))) - chi2_sup_pvalue = np.zeros(shape=(X.shape[1],len(self.quali_labels_))) - for i in np.arange(0,X.shape[1]): - for j,lab in enumerate(self.quali_labels_): - tab = pd.crosstab(X.iloc[:,i],self.quali_data_[lab]) - chi = st.chi2_contingency(tab) - chi2_sup_stats[i,j],chi2_sup_pvalue[i,j]= chi[0],chi[1] - - # Dummies variables - dummies = pd.concat((pd.get_dummies(X[cols],prefix=cols,prefix_sep='_') for cols in X.columns),axis=1) - mod_sup_stats = dummies.agg(func=[np.sum,np.mean]).T - n_k = dummies.sum(axis=0) - p_k = dummies.mean(axis=0) - mod_sup_labels = dummies.columns - short_sup_labels = list([x.split("_",1)[-1] for x in mod_sup_labels]) - - # Supplementary categories coordinates - mod_sup_coord = pd.concat((pd.concat((pd.DataFrame(self.row_coord_,index=self.row_labels_, - columns=self.dim_index_),dummies[cols]),axis=1) - .groupby(cols).mean().iloc[1,:].to_frame(name=cols).T for cols in dummies.columns),axis=0) - - # Rapport de corrélation - quali_sup_eta2 = pd.concat(((mapply(mod_sup_coord,lambda x : x**2,axis=0,progressbar=False).mul(p_k,axis="index") - .loc[filter(lambda x: x.startswith(cols),mod_sup_coord.index),:] - .sum(axis=0).to_frame(name=cols).T.div(self.eig_[0])) for cols in X.columns),axis=0) - - # Supplementary categories v-test - mod_sup_vtest = mapply(mapply(mod_sup_coord,lambda x : x/np.sqrt((self.n_rows_-n_k)/((self.n_rows_-1)*n_k)), - axis=0,progressbar=False), - lambda x : x/np.sqrt(self.eig_[0]),axis=1,progressbar=False) - - # Moyennes conditionnelles sur la variable Z - mz_g = pd.concat((pd.concat((self.normalized_data_,dummies[cols]),axis=1) - .groupby(cols).mean().iloc[1,:].to_frame(name=cols).T for cols in dummies.columns),axis=0) - - # Distance des modalités à l'origine - mod_sup_disto = mapply(mz_g,lambda x : np.sum(x**2),axis=1,progressbar=False) - - # Supplementary categories cos2 - mod_sup_cos2 = mapply(mod_sup_coord,lambda x : x**2,axis=0,progressbar=False).div(mod_sup_disto,axis="index") - - # Supplementary categories eta2 - correlation - quali_sup_eta2 = pd.concat((mapply(mod_sup_coord.loc[filter(lambda x: x.startswith(cols),mod_sup_coord.index),:], - lambda x : x**2,axis=1,progressbar=False) - .mul(p_k.loc[filter(lambda x: x.startswith(cols),mod_sup_coord.index)],axis="index") - .div(self.eig_[0],axis="columns") - .sum(axis=0).to_frame(name=cols).T for cols in X.columns),axis=0) - - # Supplementary categories informations - self.mod_sup_coord_ = np.array(mod_sup_coord) - self.mod_sup_cos2_ = np.array(mod_sup_cos2) - self.mod_sup_disto_ = np.array(mod_sup_disto) - self.mod_sup_stats_ = np.array(mod_sup_stats) - self.mod_sup_vtest_ = np.array(mod_sup_vtest) - - self.mod_sup_labels_ = mod_sup_labels - self.short_sup_labels_ = short_sup_labels - - # Categorical variables - self.quali_sup_eta2_ = np.array(quali_sup_eta2) - self.chi2_sup_test_ = dict({"statistic" : pd.DataFrame(chi2_sup_stats,index=X.columns,columns=self.quali_labels_), - "pvalue" : pd.DataFrame(chi2_sup_pvalue,index=X.columns,columns=self.quali_labels_) - }) - - return dict({"chi2" : self.chi2_sup_test_, - "coord" : pd.DataFrame(self.mod_sup_coord_,index=self.mod_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.mod_sup_cos2_,index=self.mod_sup_labels_,columns=self.dim_index_), - "dist" : 
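The correlation ratio computed here is the usual eta²: the share of an axis's variance explained by the grouping. Summing p_k * coord_k**2 / lambda over a variable's categories gives the same number when the coordinates are centred barycentres. An equivalent direct sketch (names illustrative):

    import numpy as np

    def eta2(axis_scores, groups):
        groups = np.asarray(groups)
        total = axis_scores.var(ddof=0)
        grand = axis_scores.mean()
        between = sum((groups == g).mean() * (axis_scores[groups == g].mean() - grand) ** 2
                      for g in np.unique(groups))
        return between / total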
pd.DataFrame(self.mod_sup_disto_,index=self.mod_sup_labels_,columns=["dist"]), - "eta2" : pd.DataFrame(self.quali_sup_eta2_,index=self.quali_sup_labels_,columns=self.dim_index_), - "vtest" : pd.DataFrame(self.mod_sup_vtest_,index=self.mod_sup_labels_,columns=self.dim_index_) - }) - - def transform(self,X): - """Apply the dimensionality reduction on X - - X is projected on the first axes previous extracted from a training set. - - Parameters - ---------- - X : DataFrame, shape (n_rows_sup, n_columns) - New data, where n_row_sup is the number of supplementary - row points and n_columns is the number of columns - X rows correspond to supplementary row points that are - projected on the axes - X is a table containing numeric values - - y : None - y is ignored - - Returns - ------- - X_new : DataFrame of float, shape (n_rows_sup, n_components_) - X_new : coordinates of the projections of the supplementary - row points on the axes. - """ - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - # Store continuous and categorical variables - X_sup_quant = X[self.quanti_labels_] - X_sup_qual = X[self.quali_labels_] - - # Standardscaler numerical variable - if self.normalize: - Z1 = (X_sup_quant - self.means_)/self.std_ - else: - Z1 = X_sup_quant - self.means_ - - # Standardscaler categorical Variable - n_rows = X_sup_qual.shape[0] - n_cols = len(self.mod_labels_) - Y = np.zeros((n_rows,n_cols)) - for i in np.arange(0,n_rows,1): - values = [self.quali_labels_[k] +"_"+str(X_sup_qual.iloc[i,k]) for k in np.arange(0,len(self.quali_labels_))] - for j in np.arange(0,n_cols,1): - if self.mod_labels_[j] in values: - Y[i,j] = 1 - row_sup_dummies = pd.DataFrame(Y,columns=self.mod_labels_,index=X.index) - - # New normalized data - Z2 = mapply(row_sup_dummies,lambda x : (x - self.dummies_means_)/self.dummies_std_,axis=1,progressbar=False) - - # Supplementary individuals coordinates - row_sup_coord = np.dot(np.array(pd.concat([Z1,Z2],axis=1)),self.eigen_vectors_) - - return row_sup_coord[:,:self.n_components_] - - def fit_transform(self,X,y=None): - """Fit the model with X and apply the dimensionality reduction on X. - - Parameters - ---------- - X : pd.DataFrame, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. 
- - y : None - y is ignored - - Returns - ------- - X_new : array-like, shape (n_samples, n_components) - """ - - self.fit(X) - return self.row_coord_ - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/build/lib/scientisttools/discriminant_analysis.py b/build/lib/scientisttools/discriminant_analysis.py deleted file mode 100644 index 7c18cc2..0000000 --- a/build/lib/scientisttools/discriminant_analysis.py +++ /dev/null @@ -1,115 +0,0 @@ -# -*- coding: utf-8 -*- -import matplotlib.pyplot as plt -import seaborn as sns -import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin - -########################################################################################## -# CANONICAL DISCRIMINANT ANALYSIS -########################################################################################## - -class CDA(BaseEstimator,TransformerMixin): - """Canonical Discriminant Analysis - - """ - def __init__(self,feature_columns=None,target_columns=None,priors=None,method ="FR"): - self.feature_columns = feature_columns - self.target_columns = target_columns - self.priors = priors - self.method = method - - def fit(self,X,y): - # Compute - self.feature_ = X - self.target_ = y - - if self.feature_columns is None: - raise NotImplementedError("Error : This method is not implemented yet.") - - # Initialize - self.eig_ = None - self.eigen_vectors = None - self.total_variance = None - self.between_variance = None - self.within_variance = None - - if self.method == "FR": - self._computed_fr(X,y) - elif self.method == "GB": - self._computed_gb(X,y) - - return self - - def _computed_fr(self,X,y): - raise NotImplementedError("Error : This method is not implemented yet.") - - def _computed_gb(self,X,y): - raise NotImplementedError("Error : This method is not implemented yet.") - - def _computed_stats(self,X,y): - raise NotImplementedError("Error : This method is not implemented yet.") - - def transform(self,X,y): - raise NotImplementedError("Error : This method is not implemented yet.") - - def predict(self,X): - raise NotImplementedError("Error : This method is not implemented yet.") - - def predict_proba(self,X): - raise NotImplementedError("Error : This method is not implemented yet.") - - def fit_transform(self,X,y): - raise NotImplementedError("Error : This method is not implemented yet.") - - def plot_boxplot(self,ax=None): - if ax is None: - ax =plt.gca() - raise NotImplementedError("Error : This method is not implemented yet.") - - -##################################################################################### -# LINEAR DISCRIMINANT ANALYSOS (LDA) -##################################################################################### - - -class LDA(BaseEstimator,TransformerMixin): - """Linear Discriminant Analysis - - """ - def __init__(self,feature_columns,target_columns): - self.feature_columns = feature_columns - self.target_columns = target_columns - - def fit(self,X,y): - raise NotImplementedError("Error : This method is not implemented yet.") - -###################################################################################### -# QUADRATIC DISCRIMINANT ANALYSIS (QDA) -##################################################################################### - -class QDA(BaseEstimator,TransformerMixin): - """Quadratic Discriminant Analysis - - """ - def __init__(self,features_columns, target_columns,priors=None): - self.features_columns = features_columns - self.target_columns = target_columns - self.priors_ = priors - - def fit(self,X,y): - raise 
NotImplementedError("Error : This method is not implemented yet.") - -##################################################################################### -# LOCAL FISHER DISCRIMINANT ANALYSIS (LFDA) -###################################################################################### - -class LFDA(BaseEstimator,TransformerMixin): - """Local Fisher Discriminant Analysis - - """ - def __init__(self,feature_columns,target_columns): - self.feature_columns = feature_columns - self.target_columns = target_columns - - def fit(self,X,y): - raise NotImplementedError("Error : This method is not implemented yet.") \ No newline at end of file diff --git a/build/lib/scientisttools/extractfactor.py b/build/lib/scientisttools/extractfactor.py deleted file mode 100644 index 6b020c4..0000000 --- a/build/lib/scientisttools/extractfactor.py +++ /dev/null @@ -1,1673 +0,0 @@ -# -*- coding: utf-8 -*- - -import pandas as pd -import numpy as np -from scipy.spatial.distance import pdist,squareform -from scipy.cluster import hierarchy - -def get_ca_row(self)-> dict: - - """ - self. : an instance of class CA - - Returns - ------- - Correspondence Analysis - Results for rows - ========================================================= - Name Description - 1 "coord" "coordinates for the rows" - 2 "cos2" "cos2 for the rows" - 3 "constrib" "contributions of the rows" - 4 "dist" "Rows distance" - 5 "res.dist" "Restitued distance" - 6 "infos" "additionnal informations for the rows:" - - distance between rows and inertia - - weight for the rows - - inertia for the rows - """ - if self.model_ != "ca": - raise ValueError("Error : 'self' must be an instance of class CA.") - df = dict({"coord" : pd.DataFrame(self.row_coord_,index=self.row_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.row_cos2_,index=self.row_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.row_contrib_,index=self.row_labels_,columns=self.dim_index_), - "dist" : pd.DataFrame(self.row_dist_,index=self.row_labels_,columns=self.row_labels_), - "res.dist" : pd.DataFrame(self.res_row_dist_,index=self.row_labels_,columns=self.row_labels_), - "infos" : pd.DataFrame(self.row_infos_,columns= ["d(i,G)","p(i)","I(i,G)"],index=self.row_labels_) - }) - if self.row_sup_labels_ is not None: - df["row_sup"] = dict({ - "coord" : self.row_sup_coord_}) - - return df - -def get_ca_col(self)-> dict: - - """ - self : an instance of class CA - - Returns - ------- - Correspondence Analysis - Results for columns - ========================================================= - Name Description - 1 "coord" "coordinates for the columns" - 2 "cos2" "cos2 for the columns" - 3 "constrib" "contributions of the columns" - 4 "dist" "Columns distance" - 5 "res.dist" "Restitued distance" - 6 "infos" "additionnal informations for the columns :" - - distance between columns and inertia - - weight for the columns - - inertia for the columns - """ - if self.model_ != "ca": - raise ValueError("Error : 'self' must be an object of class CA.") - df = dict({"coord" : pd.DataFrame(self.col_coord_,index = self.col_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.col_cos2_,index = self.col_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.col_contrib_,index = self.col_labels_,columns=self.dim_index_), - "dist" : pd.DataFrame(self.col_dist_,index=self.col_labels_,columns=self.col_labels_), - "res.dist" : pd.DataFrame(self.res_col_dist_,index=self.col_labels_,columns=self.col_labels_), - "infos" : pd.DataFrame(self.col_infos_,columns= 
["d(k,G)","p(k)","I(k,G)"],index=self.col_labels_) - }) - if self.col_sup_labels_ is not None: - df["col_sup"] = dict({ - "coord" : self.col_sup_coord_ - }) - - return df - -def get_ca(self,choice = "row")-> dict: - - """ - self : an instance of class CA - - choice : {"row", "col"}, default= "row" - - Returns - ------- - if choice == "row": - Correspondence Analysis - Results for rows - ========================================================= - Name Description - 1 "coord" "coordinates for the rows" - 2 "cos2" "cos2 for the rows" - 3 "constrib" "contributions of the rows" - 4 "dist" "Rows distance" - 5 "res.dist" "Restitued distance" - 6 "infos" "additionnal informations for the rows:" - - distance between rows and inertia - - weight for the rows - - inertia for the rows - if choice == "col": - Correspondence Analysis - Results for columns - ========================================================= - Name Description - 1 "coord" "coordinates for the columns" - 2 "cos2" "cos2 for the columns" - 3 "constrib" "contributions of the columns" - 4 "dist" "Columns distance" - 5 "res.dist" "Restitued distance" - 6 "infos" "additionnal informations for the columns :" - - distance between columns and inertia - - weight for the columns - - inertia for the columns - """ - if self.model_ != "ca": - raise ValueError("Error : 'self' must be an object of class CA.") - if choice == "row": - return get_ca_row(self) - elif choice == "col": - return get_ca_col(self) - else: - raise ValueError("Error : Allowed values for the argument choice are : 'row' or 'col'.") - -# -*- coding: utf-8 -*- - -def StandardScaler(X): - return (X - X.mean())/X.std(ddof=0) - -def get_dist(X, method = "euclidean",normalize=False,**kwargs) -> dict: - if isinstance(X,pd.DataFrame) is False: - raise ValueError("Error : 'X' must be a DataFrame") - if normalize: - X = X.transform(StandardScaler) - if method in ["pearson","spearman","kendall"]: - corr = X.T.corr(method=method) - dist = corr.apply(lambda cor : 1 - cor,axis=0).values.flatten('F') - else: - dist = pdist(X.values,metric=method,**kwargs) - return dict({"dist" :dist,"labels":X.index}) - - -################### Exploratory factor analysis - -def get_efa_ind(self) -> dict: - - """ - self : an instance of class EFA - - Returns - ------- - Exploratoty Factor Analysis - Results for individuals - =============================================================== - Names Description - 1 "coord" "coordinates for the individuals" - """ - if self.model_ != "efa": - raise ValueError("Error : 'self' must be an object of class EFA.") - - # Store informations - df = dict({ - "coord" : pd.DataFrame(self.row_coord_,index=self.row_labels_,columns=self.dim_index_) - }) - return df - -def get_efa_var(self) -> dict: - - """ - self : an instance of class EFA - - Returns - ------- - Exploratory Factor Analysis - Results for variables - ============================================================== - Names Description - 1 "coord" "coordinates for the variables" - 2 "contrib" "contributions of the variables" - 3 "communality" "Communality of the variables" - 4 "variance" "Percentage of variance" - 5 "fscore" "Factor score" - """ - if self.model_ != "efa": - raise ValueError("Error : 'self' must be an object of class EFA.") - - # Store informations - df = dict({ - "coord" : pd.DataFrame(self.col_coord_,index = self.col_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.col_contrib_,index = self.col_labels_,columns=self.dim_index_), - "communality" : 
pd.DataFrame(np.c_[self.initial_communality_,self.estimated_communality_],columns=["initial","estimated"],index = self.col_labels_), - "variance" : pd.DataFrame(self.percentage_variance_,index=self.col_labels_,columns=["% var."]), - "fscore" : pd.DataFrame(self.factor_score_,index=self.col_labels_, columns=self.dim_index_) - }) - return df - -def get_efa(self,choice = "row")-> dict: - - """ - self : an instance of class EFA - - choice : {"row", "var"}, default= "row" - - Returns - ------- - if choice == "row": - Exploratory Factor Analysis - Results for individuals - =================================================== - Names Description - 1 "coord" "coordinates for the individuals" - - if choice == "var": - Exploratory Factor Analysis - Results for variables - =================================================== - Names Description - 1 "coord" "coordinates for the variables" - 2 "contrib" "contributions of the variables" - 3 "communality" "Communality of the variables" - 4 "variance" "Percentage of variance" - 5 "fscore" "Factor score" - """ - if self.model_ != "efa": - raise ValueError("Error : 'self' must be an object of class EFA.") - if choice == "row": - return get_efa_ind(self) - elif choice == "var": - return get_efa_var(self) - else: - raise ValueError("Allowed values for the argument choice are : 'row' or 'var'.") - -################## Eigenvalues - -def get_eig(self) -> pd.DataFrame: - - """ - self : an instance of class PCA, PartialPCA, CA, MCA, FAMD, MFA,CMDS - - Returns - ------- - eigenvalue, difference, variance percent and cumulative variance of percent - """ - if self.model_ in ["pca","ppca","ca","mca","famd","mfa","cmds"]: - eig = pd.DataFrame(self.eig_.T,columns=["eigenvalue","difference","proportion","cumulative"],index = self.dim_index_) - return eig - else: - raise ValueError("Eroor : 'self' must be an instance of class PCA, PPCA, CA, MCA, FAMD, MFA, CMDS") - -def get_eigenvalue(self) -> pd.DataFrame: - - """ - self : an instance of class PCA, PartialPCA, CA, MCA, FAMD, MFA, MDS - - Returns - ------- - eigenvalue, variance percent and cumulative variance of percent - """ - return get_eig(self) - - -############ Factor analysis of mixed data - -def get_famd_ind(self) -> dict: - """Extract individuals informations - - Parameters - ---------- - self : an instance of class FAMD - - Returns - ------- - Factor Analysis of Mixed Data - Results for individuals - ======================================================= - Names Description - 1 "coord" "Coordinates for the individuals" - 2 "cos2" "Cos2 for the individuals" - 3 "contrib" "Contributions of the individuals" - 4 "infos" "Additionnal informations for the individuals :" - - distance between individuals and inertia - - weight for the individuals - - inertia for the individuals - """ - if self.model_ != "famd": - raise ValueError("Error : 'self' must be an object of class FAMD.") - - # Store informations - df = dict({ - "coord" : pd.DataFrame(self.row_coord_,index=self.row_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.row_cos2_,index=self.row_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.row_contrib_,index=self.row_labels_,columns=self.dim_index_), - "infos" : pd.DataFrame(self.row_infos_,columns= ["d(i,G)","p(i)","I(i,G)"],index=self.row_labels_) - }) - if self.row_sup_labels_ is not None: - df["ind_sup"] = dict({ - "dist" : pd.DataFrame(self.row_sup_disto_,index = self.row_sup_labels_,columns=["Dist"]), - "coord" : 
pd.DataFrame(self.row_sup_coord_,index=self.row_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.row_sup_cos2_,index=self.row_sup_labels_,columns=self.dim_index_) - }) - return df - -def get_famd_col(self) -> dict: - """Extract continuous variables informations - - Parameters - ---------- - self : an instance of class FAMD - - Returns - ------- - Factor Analysis of Mixed Data - Results for continuous variables - ================================================================ - Names Description - 1 "corr" "Pearson correlation between continuous variables" - 2 "pcorr" "Partial correlation between continuous variables" - 3 "coord" "Coordinates for the continuous variables" - 4 "cos2" "Cos2 for the continuous variables" - 5 "contrib" "Contributions of the continuous variables" - 6 "ftest" "Fisher test of the continuous variables" - """ - if self.model_ != "famd": - raise ValueError("Error : 'self' must be an object of class FAMD") - - # Store informations - df = dict({ - "corr" : pd.DataFrame(self.col_corr_,index=self.col_labels_,columns=self.col_labels_), - "pcorr" : pd.DataFrame(self.col_pcorr_,index=self.col_labels_,columns=self.col_labels_), - "coord" : pd.DataFrame(self.col_coord_,index = self.col_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.col_cos2_,index = self.col_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.col_contrib_,index = self.col_labels_,columns=self.dim_index_), - "ftest" : pd.DataFrame(self.col_ftest_,index = self.col_labels_,columns=self.dim_index_) - }) - if self.quanti_sup_labels_ is not None: - # Add supplementary continuous variables informations - df["quanti_sup"] = dict({ - "corr" : pd.DataFrame(self.col_sup_corr_,index=self.col_sup_labels_,columns=self.col_labels_), - "coord" : pd.DataFrame(self.col_sup_coord_,index=self.col_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.col_sup_cos2_,index=self.col_sup_labels_,columns=self.dim_index_), - "ftest" : pd.DataFrame(self.col_sup_ftest_,index=self.col_sup_labels_,columns=self.dim_index_) - }) - return df - -def get_famd_mod(self) -> dict: - """Extract categories informations - - Parameters - ---------- - self : an instance of class FAMD - - Returns - ------- - Factor Analysis of Mixed Data - Results for categories - ====================================================== - Names Description - 1 "stats" "Count and percentage of categories" - 2 "coord" "coordinates for the categories" - 3 "cos2" "cos2 for the categories" - 4 "contrib" "contributions of the categories" - 5 "vtest" "value test of the categories" - 6 "infos" "additionnal informations for the categories :" - - distance between categories and inertia - - weight for the categories - - inertia for the categories - """ - if self.model_ != "famd": - raise ValueError("Error : 'self' must be an object of class FAMD.") - - # Store informations - df = dict({ - "stats" : pd.DataFrame(self.mod_stats_,columns=["n(k)","p(k)"],index=self.mod_labels_), - "coord" : pd.DataFrame(self.mod_coord_,index=self.mod_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.mod_cos2_,index=self.mod_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.mod_contrib_,index=self.mod_labels_,columns=self.dim_index_), - "vtest" : pd.DataFrame(self.mod_vtest_,index=self.mod_labels_,columns=self.dim_index_), - "infos" : pd.DataFrame(self.mod_infos_,columns= ["d(k,G)","p(k)","I(k,G)"],index=self.mod_labels_) - }) - if self.quali_sup_labels_ is not None: - df["quali_sup"] = dict({ - "stats" : 
pd.DataFrame(self.mod_sup_stats_,columns=["n(k)","p(k)"],index=self.mod_sup_labels_), - "coord" : pd.DataFrame(self.mod_sup_coord_,index=self.mod_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.mod_sup_cos2_,index=self.mod_sup_labels_,columns=self.dim_index_), - "dist" : pd.DataFrame(self.mod_sup_disto_,index=self.mod_sup_labels_,columns=["Dist"]), - "vtest" : pd.DataFrame(self.mod_sup_vtest_,index=self.mod_sup_labels_,columns=self.dim_index_) - }) - return df - -def get_famd_var(self): - """Extract categorical variables information - - Parameters - ---------- - self : an instance of class FAMD - - Returns - ------- - Factor Analysis of Mixed Data - Results for categorical variables - ================================================================= - Names Description - 1 "chi2" "chi-squared statistics and p-values" - 2 "eta2" "Correlation ratio" - 3 "cos2" "cos2 for categorical variables" - 4 "contrib" "contributions of categorical variables" - """ - - if self.model_ != "famd": - raise ValueError("Error : 'self' must be an object of class FAMD.") - - df = dict({ - "chi2" : self.chi2_test_, - "eta2" : pd.DataFrame(self.var_eta2_,index=self.quali_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.var_cos2_,index=self.quali_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.var_contrib_,index=self.quali_labels_,columns=self.dim_index_) - }) - if self.quali_sup_labels_ is not None: - df["quali_sup"] = dict({ - "chi2" : self.chi2_sup_stats_, - "eta2" : pd.DataFrame(self.quali_sup_eta2_,index=self.quali_sup_labels_,columns=self.dim_index_), - }) - return df - - -def get_famd(self,choice = "ind")-> dict: - """Extract Factor Analysis of Mixed Data information - - Parameters - ---------- - self : an instance of class FAMD - - choice : {"ind","var","mod","col"}, default= "ind" - - Returns - ------- - if choice == "ind": - Factor Analysis of Mixed Data - Results for individuals - =================================================== - Names Description - 1 "coord" "Coordinates for the individuals" - 2 "cos2" "Cos2 for the individuals" - 3 "contrib" "Contributions of the individuals" - 4 "infos" "Additional information for the individuals :" - - distance between individuals and inertia - - weight for the individuals - - inertia for the individuals - - if choice == "col": - Factor Analysis of Mixed Data - Results for continuous variables - ================================================================== - Names Description - 1 "corr" "Pearson correlation between continuous variables" - 2 "pcorr" "Partial correlation between continuous variables" - 3 "coord" "Coordinates for the continuous variables" - 4 "cos2" "Cos2 for the continuous variables" - 5 "contrib" "Contributions of the continuous variables" - 6 "ftest" "Fisher test of the continuous variables" - if choice == "mod": - Factor Analysis of Mixed Data - Results for the categories of the qualitative variables - =============================================================================== - Names Description - 1 "stats" "Count and percentage of categories" - 2 "coord" "coordinates for the categories" - 3 "cos2" "cos2 for the categories" - 4 "contrib" "contributions of the categories" - 5 "vtest" "value test of the categories" - 6 "infos" "additional information for the categories :" - - distance between categories and inertia - - weight for the categories - - inertia for the categories - if choice == "var": - Factor Analysis of Mixed Data - Results for variables - 
===================================================== - Names Description - 1 "chi2" "chi-squared statistics and p-values" - 2 "eta2" "Correlation ratio" - 3 "cos2" "cos2 for categorical variables" - 4 "contrib" "contributions of categorical variables" - """ - if self.model_ != "famd": - raise ValueError("Error : 'self' must be an object of class FAMD.") - if choice == "ind": - return get_famd_ind(self) - elif choice == "col": - return get_famd_col(self) - elif choice == "mod": - return get_famd_mod(self) - elif choice == "var": - return get_famd_var(self) - else: - raise ValueError("Allowed values for the argument choice are : 'ind','var','mod' and 'col'.") - - -############# Hierarchical - -def get_hclust(X, method='single', metric='euclidean', optimal_ordering=False): - Z = hierarchy.linkage(X,method=method, metric=metric) - if optimal_ordering: - order = hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z,X)) - else: - order = hierarchy.leaves_list(Z) - return dict({"order":order,"height":Z[:,2],"method":method, - "merge":Z[:,:2],"n_obs":Z[:,3],"data":X}) - - -########## Multiple Correspondence Analysis - -def get_mca_ind(self) -> dict: - """ - self : an instance of class MCA - - Returns - ------- - Multiple Correspondence Analysis - Results for individuals - =============================================================== - Names Description - 1 "coord" "coordinates for the individuals" - 2 "cos2" "cos2 for the individuals" - 3 "contrib" "contributions of the individuals" - 4 "infos" "additionnal informations for the individuals :" - - distance between individuals and inertia - - weight for the individuals - - inertia for the individuals - """ - if self.model_ != "mca": - raise ValueError("Error : 'self' must be an object of class MCA.") - - # Store informations - df = dict({ - "coord" : pd.DataFrame(self.row_coord_,index=self.row_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.row_cos2_,index=self.row_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.row_contrib_,index=self.row_labels_,columns=self.dim_index_), - "infos" : pd.DataFrame(self.row_infos_,columns= ["d(i,G)","p(i)","I(i,G)"],index=self.row_labels_) - }) - if self.row_sup_labels_ is not None: - df["ind_sup"] = dict({ - "coord" : pd.DataFrame(self.row_sup_coord_,index=self.row_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.row_sup_cos2_,index=self.row_sup_labels_,columns=self.dim_index_) - }) - return df - -def get_mca_mod(self) -> dict: - - """ - self : an instance of class MCA - - Returns - ------- - Multiple Correspondence Analysis - Results for categories - ===================================================================== - Names Description - 1 "coord" "coordinates for the categories" - 2 "corrected_coord" "Coorected coordinates for the categories" - 3 "cos2" "cos2 for the categories" - 4 "contrib" "contributions of the categories" - 5 "infos" "additionnal informations for the categories :" - - distance between categories and inertia - - weight for the categories - - inertia for the categories - """ - if self.model_ != "mca": - raise ValueError("Error : 'self' must be an object of class MCA.") - - # Store informations - df = dict({ - "coord" : pd.DataFrame(self.mod_coord_,index=self.mod_labels_,columns=self.dim_index_), - "corrected_coord" : pd.DataFrame(self.corrected_mod_coord_,index=self.mod_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.mod_cos2_,index=self.mod_labels_,columns=self.dim_index_), - "contrib" : 
pd.DataFrame(self.mod_contrib_,index=self.mod_labels_,columns=self.dim_index_), - "vtest" : pd.DataFrame(self.mod_vtest_,index = self.mod_labels_,columns=self.dim_index_), - "infos" : pd.DataFrame(self.mod_infos_,columns= ["d(k,G)","p(k)","I(k,G)"],index=self.mod_labels_) - }) - if self.quali_sup_labels_ is not None: - df["sup"] = dict({ - "stats" : pd.DataFrame(self.mod_sup_stats_, index = self.mod_sup_labels_,columns = ["n(k)","p(k)"]), - "coord" : pd.DataFrame(self.mod_sup_coord_, index =self.mod_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.mod_sup_cos2_, index =self.mod_sup_labels_,columns=self.dim_index_), - "dist" : pd.DataFrame(self.mod_sup_disto_, index = self.mod_sup_labels_,columns=["Dist"]), - "vtest" : pd.DataFrame(self.mod_sup_vtest_, index =self.mod_sup_labels_,columns=self.dim_index_) - }) - return df - -def get_mca_var(self) -> dict: - """ - self : an instance of class MCA - - Returns - ------- - Multiple Correspondence Analysis - Results for categorical variables - ===================================================================== - Names Description - 1 "chi2" "chi-squared tests and p-values" - 2 "inertia" "Categorical variables inertia" - 3 "eta2" "Correlation ratio" - 4 "cos2" "cosines of the categorical variables" - 5 "contrib" "contributions of the categorical variables" - """ - - if self.model_ != "mca": - raise ValueError("Error : 'self' must be an object of class MCA.") - - df = dict({ - "chi2" : self.chi2_test_, - "inertia" : pd.DataFrame(self.var_inertia_,index=self.var_labels_,columns=["I(j,G)"]), - "eta2" : pd.DataFrame(self.var_eta2_,index=self.var_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.var_cos2_,index=self.var_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.var_contrib_,index=self.var_labels_,columns=self.dim_index_) - }) - if ((self.quanti_sup_labels_ is not None) and (self.quali_sup_labels_ is not None)): - df["quanti_sup"] = dict({ - "coord" : pd.DataFrame(self.quanti_sup_coord_,index=self.quanti_sup_labels_,columns=self.dim_index_) - }) - df["quali_sup"] = dict({ - "eta2" : pd.DataFrame(self.quali_sup_eta2_,index=self.quali_sup_labels_,columns=self.dim_index_), - }) - elif self.quanti_sup_labels_ is not None: - df["quanti_sup"] = dict({ - "coord" : pd.DataFrame(self.quanti_sup_coord_,index=self.quanti_sup_labels_,columns=self.dim_index_) - }) - elif self.quali_sup_labels_ is not None: - df["quali_sup"] = dict({ - "eta2" : pd.DataFrame(self.quali_sup_eta2_,index=self.quali_sup_labels_,columns=self.dim_index_), - }) - - return df - -def get_mca(self,choice="ind") -> dict: - """ - - Parameters - ---------- - self : an instance of class MCA - choice : {'ind','mod','var'} - - Returns - ------- - if choice == "ind": - Multiple Correspondence Analysis - Results for individuals - =============================================================== - Names Description - 1 "coord" "coordinates for the individuals" - 2 "cos2" "cos2 for the individuals" - 3 "contrib" "contributions of the individuals" - 4 "infos" "additional information for the individuals :" - - distance between individuals and inertia - - weight for the individuals - - inertia for the individuals - elif choice == "mod": - Multiple Correspondence Analysis - Results for categories - ===================================================================== - Names Description - 1 "coord" "coordinates for the categories" - 2 "corrected_coord" "Corrected coordinates for the categories" - 3 "cos2" "cos2 for the categories" - 4 "contrib" "contributions of the 
categories" - 5 "infos" "additionnal informations for the categories :" - - distance between categories and inertia - - weight for the categories - - inertia for the categories - elif choice == "var": - Multiple Correspondence Analysis - Results for categories variables - ===================================================================== - Names Description - 1 "chi2" "chi-squared tests and p-values" - 2 "inertia" "Categories variables inertia" - 3 "eta2" "Correlation ratio" - 4 "cos2" "cosines of the categories variables" - 5 "contrib" "contributions of the categories variables" - """ - if self.model_ != "mca": - raise ValueError("Error : 'self' must be an object of class MCA.") - - if choice == "ind": - return get_mca_ind(self) - elif choice == "mod": - return get_mca_mod(self) - elif choice == "var": - return get_mca_var(self) - else: - raise ValueError("Error : Allowed values for the argument 'choice' are : 'ind','var' and 'mod'.") - -################## MDS - -def get_mds(self) -> dict: - - """ - self : an object of class MDS - - Returns - ------- - Multidimensional Scaling - Results - =============================================================== - Names Description - 1 "coord" "coordinates" - 2 "res.dist" "Restitues distances" - """ - if self.model_ not in ["mds","cmds"]: - raise ValueError("Error : 'res' must be an object of class MDS or CMDS.") - - # Store informations - df = dict({ - "coord" : pd.DataFrame(self.coord_,index=self.labels_,columns=self.dim_index_), - "res.dist" : pd.DataFrame(self.res_dist_,index=self.labels_,columns=self.labels_) - }) - return df - -############ Principal Components Analysis - -def get_pca_ind(self) -> dict: - - """ - self : an instance of class PCA - - Returns - ------- - Principal Component Analysis - Results for individuals - =============================================================== - Names Description - 1 "coord" "coordinates for the individuals" - 2 "cos2" "cos2 for the individuals" - 3 "contrib" "contributions of the individuals" - 4 "infos" "additionnal informations for the individuals :" - - distance between individuals and inertia - - weight for the individuals - - inertia for the individuals - """ - if self.model_ != "pca": - raise ValueError("Error : 'self' must be an object of class PCA.") - - # Store informations - df = dict({ - "coord" : pd.DataFrame(self.row_coord_,index=self.row_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.row_cos2_,index=self.row_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.row_contrib_,index=self.row_labels_,columns=self.dim_index_), - "infos" : pd.DataFrame(self.row_infos_,columns= ["d(i,G)","p(i)","I(i,G)"],index=self.row_labels_) - }) - if self.row_sup_labels_ is not None: - df["ind_sup"] = dict({ - "coord" : pd.DataFrame(self.row_sup_coord_,index=self.row_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.row_sup_cos2_,index=self.row_sup_labels_,columns=self.dim_index_) - }) - return df - -def get_pca_var(self) -> dict: - - """ - self : an instance of class PCA - - Returns - ------- - Principal Component Analysis - Results for variables - ============================================================== - Names Description - 1 "corr" "Pearson correlation between continuous variables" - 2 "pcorr" "Partial correlation between continuous variables" - 3 "coord" "Coordinates for the continuous variables" - 4 "cos2" "Cos2 for the continuous variables" - 5 "contrib" "Contributions of the continuous variables" - 6 "ftest" "Fisher test of the continuous 
variables" - 7 "cor" "correlations between variables and dimensions" - """ - if self.model_ != "pca": - raise ValueError("Error : 'self' must be an object of class PCA") - - # Store informations - df = dict({ - "corr" : pd.DataFrame(self.col_corr_,index=self.col_labels_,columns=self.col_labels_), - "pcorr" : pd.DataFrame(self.col_pcorr_,index=self.col_labels_,columns=self.col_labels_), - "coord" : pd.DataFrame(self.col_coord_,index = self.col_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.col_cos2_,index = self.col_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.col_contrib_,index = self.col_labels_,columns=self.dim_index_), - "ftest" : pd.DataFrame(self.col_ftest_,index = self.col_labels_,columns=self.dim_index_), - "cor" : pd.DataFrame(self.col_cor_,index=self.col_labels_,columns=self.dim_index_) - }) - - if ((self.quanti_sup_labels_ is not None) and (self.quali_sup_labels_ is not None)): - # Add supplementary quantitatives informations - df["quanti_sup"] = dict({ - "corr" : pd.DataFrame(self.col_sup_corr_,index=self.quanti_sup_labels_,columns=self.col_labels_), - "coord" : pd.DataFrame(self.col_sup_coord_,index=self.col_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.col_sup_cos2_,index=self.col_sup_labels_,columns=self.dim_index_), - "ftest" : pd.DataFrame(self.col_sup_ftest_,index=self.col_sup_labels_,columns=self.dim_index_) - }) - # Add supplementary categories informations - df["quali_sup"] = dict({ - "stats" : pd.DataFrame(self.mod_sup_stats_,columns=["n(k)","p(k)"],index=self.mod_sup_labels_), - "coord" : pd.DataFrame(self.mod_sup_coord_,index=self.mod_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.mod_sup_cos2_,index=self.mod_sup_labels_,columns=self.dim_index_), - "dist" : pd.DataFrame(self.mod_sup_disto_,index=self.mod_sup_labels_,columns=["dist"]), - "eta2" : pd.DataFrame(self.quali_sup_eta2_,index=self.quali_sup_labels_,columns=self.dim_index_), - "vtest" : pd.DataFrame(self.mod_sup_vtest_,index=self.mod_sup_labels_,columns=self.dim_index_) - }) - elif self.quanti_sup_labels_ is not None: - # Add supplementary quantitatives informations - df["quanti_sup"] = dict({ - "corr" : pd.DataFrame(self.col_sup_corr_,index=self.quanti_sup_labels_,columns=self.col_labels_), - "coord" : pd.DataFrame(self.col_sup_coord_,index=self.col_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.col_sup_cos2_,index=self.col_sup_labels_,columns=self.dim_index_), - "ftest" : pd.DataFrame(self.col_sup_ftest_,index=self.col_sup_labels_,columns=self.dim_index_) - }) - elif self.quali_sup_labels_ is not None: - # Add supplementary categories informations - df["quali_sup"] = dict({ - "stats" : pd.DataFrame(self.mod_sup_stats_,columns=["n(k)","p(k)"],index=self.mod_sup_labels_), - "coord" : pd.DataFrame(self.mod_sup_coord_,index=self.mod_sup_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.mod_sup_cos2_,index=self.mod_sup_labels_,columns=self.dim_index_), - "dist" : pd.DataFrame(self.mod_sup_disto_,index=self.mod_sup_labels_,columns=["dist"]), - "eta2" : pd.DataFrame(self.quali_sup_eta2_,index=self.quali_sup_labels_,columns=self.dim_index_), - "vtest" : pd.DataFrame(self.mod_sup_vtest_,index=self.mod_sup_labels_,columns=self.dim_index_) - }) - return df - -def get_pca(self,choice = "row")-> dict: - - """ - self : an instance of class PCA - - choice : {"row", "var"}, default= "row" - - Returns - ------- - if choice == "row": - Principal Component Analysis - Results for individuals - 
=================================================== - Names Description - 1 "coord" "coordinates for the individuals" - 2 "cos2" "cos2 for the individuals" - 3 "contrib" "contributions of the individuals" - 4 "infos" "additionnal informations for the individuals :" - - distance between individuals and inertia - - weight for the individuals - - inertia for the individuals - elif choice == "var": - Principal Component Analysis - Results for variables - =================================================== - Names Description - 1 "corr" "Pearson correlation between continuous variables" - 2 "pcorr" "Partial correlation between continuous variables" - 3 "coord" "Coordinates for the continuous variables" - 4 "cos2" "Cos2 for the continuous variables" - 5 "contrib" "Contributions of the continuous variables" - 6 "ftest" "Fisher test of the continuous variables" - 7 "cor" "correlations between variables and dimensions" - """ - if self.model_ != "pca": - raise ValueError("Error : 'self' must be an object of class PCA.") - if choice == "row": - return get_pca_ind(self) - elif choice == "var": - return get_pca_var(self) - else: - raise ValueError("Allowed values for the argument choice are : 'row' or 'var'.") - -########## Partial Principal Components Analysis -def get_ppca_ind(self) -> dict: - - """ - self : an instance of class PPCA - - Returns - ------- - Partial Principal Component Analysis - Results for individuals - =============================================================== - Names Description - 1 "coord" "coordinates for the individuals" - 2 "cos2" "cos2 for the individuals" - 3 "contrib" "contributions of the individuals" - 4 "infos" "additionnal informations for the individuals :" - - distance between individuals and inertia - - weight for the individuals - - inertia for the individuals - """ - if self.model_ != "ppca": - raise ValueError("Error : 'self' must be an object of class PPCA.") - - # Store informations - df = dict({ - "coord" : pd.DataFrame(self.row_coord_,index=self.row_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.row_cos2_,index=self.row_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.row_contrib_,index=self.row_labels_,columns=self.dim_index_), - "infos" : pd.DataFrame(self.row_infos_,columns= ["d(i,G)","p(i)","I(i,G)"],index=self.row_labels_) - }) - return df - -def get_ppca_var(self) -> dict: - - """ - self : an instance of class PPCA - - Returns - ------- - Partial Principal Component Analysis - Results for variables - ============================================================== - Names Description - 1 "coord" "coordinates for the variables" - 2 "cos2" "cos2 for the variables" - 3 "contrib" "contributions of the variables" - 4 "cor" "correlations between variables and dimensions" - """ - if self.model_ != "ppca": - raise ValueError("Error : 'self' must be an object of class PPCA") - - # Store informations - df = dict({ - "coord" : pd.DataFrame(self.col_coord_,index = self.col_labels_,columns=self.dim_index_), - "cos2" : pd.DataFrame(self.col_cos2_,index = self.col_labels_,columns=self.dim_index_), - "contrib" : pd.DataFrame(self.col_contrib_,index = self.col_labels_,columns=self.dim_index_), - "cor" : pd.DataFrame(self.col_cor_,index=self.col_labels_,columns=self.dim_index_) - }) - return df - -def get_ppca(self,choice = "row")-> dict: - - """ - self : an instance of class PPCA - - choice : {"row", "var"}, default= "row" - - Returns - ------- - if choice == "row": - Partial Principal Component Analysis - Results for individuals - 
=================================================== - Names Description - 1 "coord" "coordinates for the individuals" - 2 "cos2" "cos2 for the individuals" - 3 "contrib" "contributions of the individuals" - 4 "infos" "additionnal informations for the individuals :" - - distance between individuals and inertia - - weight for the individuals - - inertia for the individuals - - if choice == "var": - Partial rincipal Component Analysis - Results for variables - =================================================== - Names Description - 1 "coord" "coordinates for the variables" - 2 "cos2" "cos2 for the variables" - 3 "contrib" "contributions of the variables" - 4 "cor" "correlations between variables and dimensions" - """ - if self.model_ != "ppca": - raise ValueError("Error : 'self' must be an object of class PPCA.") - if choice == "row": - return get_ppca_ind(self) - elif choice == "var": - return get_ppca_var(self) - else: - raise ValueError("Allowed values for the argument choice are : 'row' or 'var'.") - - -################## Summarize functions - -def summaryCA(self, - digits=3, - nb_element=10, - ncp=3, - to_markdown=False, - tablefmt="pipe", - **kwargs): - """Printing summaries of correspondence analysis model - - Parameters - ---------- - self : an obect of class CA. - digits : int, default=3. Number of decimal printed - nb_element : int, default = 10. Number of element - ncp : int, default = 3. Number of componennts - to_markdown : Print DataFrame in Markdown-friendly format. - tablefmt : Table format. For more about tablefmt, see : https://pypi.org/project/tabulate/ - **kwargs : These parameters will be passed to tabulate. - """ - - row = get_ca(self,choice="row") - col = get_ca(self,choice="col") - - ncp = min(ncp,self.n_components_) - nb_element = min(nb_element,len(self.row_labels_)) - - # Principal Components Analysis Results - print(" Correspondence Analysis - Results \n") - - # Add eigenvalues informations - print("Importance of components") - eig = pd.DataFrame(self.eig_,columns=self.dim_index_, - index=["Variance","Difference","% of var.","Cumulative of % of var."]).round(decimals=digits) - if to_markdown: - print(eig.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(eig) - - # Add individuals informations - print(f"\nRows\n") - row_infos = row["infos"] - for i in np.arange(0,ncp,1): - row_coord = row["coord"].iloc[:,i] - row_cos2 = row["cos2"].iloc[:,i] - row_cos2.name = "cos2" - row_ctr = row["contrib"].iloc[:,i] - row_ctr.name = "ctr" - row_infos = pd.concat([row_infos,row_coord,row_ctr,row_cos2],axis=1) - row_infos = row_infos.iloc[:nb_element,:].round(decimals=digits) - if to_markdown: - print(row_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(row_infos) - - # Add supplementary individuals - if self.row_sup_labels_ is not None: - print(f"\nSupplementary rows\n") - # Save all informations - row_sup_coord = row["row_sup"]["coord"].iloc[:,:ncp].round(decimals=digits) - if to_markdown: - print(row_sup_coord.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(row_sup_coord) - - # Add variables informations - print(f"\nColumns\n") - col_infos = col["infos"] - for i in np.arange(0,ncp,1): - col_coord = col["coord"].iloc[:,i] - col_cos2 = col["cos2"].iloc[:,i] - col_cos2.name = "cos2" - col_ctr = col["contrib"].iloc[:,i] - col_ctr.name = "ctr" - col_infos = pd.concat([col_infos,col_coord,col_ctr,col_cos2],axis=1) - col_infos = col_infos.iloc[:nb_element,:].round(decimals=digits) - if to_markdown: - 
print(col_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(col_infos) - - # Add supplementary columns informations - if self.col_sup_labels_ is not None: - print(f"\nSupplementary columns\n") - col_sup_coord = col["col_sup"]["coord"].iloc[:,:ncp].round(decimals=digits) - if to_markdown: - print(col_sup_coord.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(col_sup_coord) - - -def summaryEFA(self, - digits=3, - nb_element=10, - ncp=3, - to_markdown=False, - tablefmt = "pipe", - **kwargs): - """Printing summaries of exploratory factor analysis model - - Parameters - ---------- - self : an obect of class EFA. - digits : int, default=3. Number of decimal printed - nb_element : int, default = 10. Number of element - ncp : int, default = 3. Number of componennts - to_markdown : Print DataFrame in Markdown-friendly format. - tablefmt : Table format. For more about tablefmt, see : https://pypi.org/project/tabulate/ - **kwargs : These parameters will be passed to tabulate. - """ - - row = get_efa(self,choice="row") - col = get_efa(self,choice="var") - - - ncp = min(ncp,self.n_components_) - nb_element = min(nb_element,len(self.row_labels_),len(self.col_labels_)) - - # Exploratory Factor Analysis Results - print(" Exploratory Factor Analysis - Results \n") - - # Add eigenvalues informations - print("Importance of components") - eig = pd.DataFrame(self.eig_,columns=self.dim_index_, - index=["Variance","Difference","% of var.","Cumulative of % of var."]).round(decimals=digits) - if to_markdown: - print(eig.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(eig) - - # Add individuals informations - print(f"\nIndividuals (the {nb_element} first) \n") - row_coord = row["coord"].iloc[:nb_element,:ncp].round(decimals=digits) - if to_markdown: - print(row_coord.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(row_coord) - - # Add supplementary individuals - if self.row_sup_labels_ is not None: - nb_elt = min(nb_element,len(self.row_sup_labels_)) - print(f"\nSupplementary Individuals\n") - # Save all informations - row_sup_infos = pd.DataFrame(index=self.row_sup_labels_).astype("float") - row_sup = row["ind_sup"] - for i in np.arange(0,ncp,1): - row_sup_coord = row_sup["coord"].iloc[:,i] - row_sup_cos2 = row_sup["cos2"].iloc[:,i] - row_sup_cos2.name = "cos2" - row_sup_infos = pd.concat([row_sup_infos,row_sup_coord,row_sup_cos2],axis=1) - row_sup_infos = row_sup_infos.iloc[:nb_elt,:].round(decimals=digits) - if to_markdown: - print(row_sup_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(row_sup_infos) - - # Add variables informations - print(f"\nContinues Variables\n") - col_infos = pd.DataFrame(index=self.col_labels_).astype("float") - for i in np.arange(0,ncp,1): - col_coord = col["coord"].iloc[:,i] - col_ctr = col["contrib"].iloc[:,i] - col_ctr.name = "ctr" - col_infos = pd.concat([col_infos,col_coord,col_ctr],axis=1) - col_infos = col_infos.iloc[:nb_element,:].round(decimals=digits) - if to_markdown: - print(col_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(col_infos) - - # Add supplementary continuous variables informations - if self.quanti_sup_labels_ is not None: - print(f"\nSupplementary continuous variable\n") - col_sup_infos = pd.DataFrame(index=self.quanti_sup_labels_).astype("float") - col_sup = col["quanti_sup"] - for i in np.arange(0,ncp,1): - col_sup_coord = col_sup["coord"].iloc[:,i] - col_sup_cos2 = col_sup["cos2"].iloc[:,i] - col_sup_cos2.name = "cos2" - col_sup_infos 
=pd.concat([col_sup_infos,col_sup_coord,col_sup_cos2],axis=1) - col_sup_infos = col_sup_infos.round(decimals=digits) - - if to_markdown: - print(col_sup_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(col_sup_infos) - - # Add supplementary categories (illustrative qualitative variables) - if self.quali_sup_labels_ is not None: - print("\nSupplementary categories\n") - mod_sup = col["quali_sup"] - mod_sup_infos = np.sqrt(mod_sup["dist"]) - for i in np.arange(0,ncp,1): - mod_sup_coord = mod_sup["coord"].iloc[:,i] - mod_sup_cos2 = mod_sup["cos2"].iloc[:,i] - mod_sup_cos2.name = "cos2" - mod_sup_vtest = mod_sup["vtest"].iloc[:,i] - mod_sup_vtest.name = "v.test" - mod_sup_infos = pd.concat([mod_sup_infos,mod_sup_coord,mod_sup_cos2,mod_sup_vtest],axis=1) - mod_sup_infos = mod_sup_infos.round(decimals=digits) - - if to_markdown: - print(mod_sup_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(mod_sup_infos) - - # Add supplementary qualitative variables - correlation ratio - print("\nSupplementary categorical variables\n") - corr_ratio = mod_sup["eta2"].iloc[:,:ncp].round(decimals=digits) - if to_markdown: - print(corr_ratio.to_markdown(tablefmt=tablefmt)) - else: - print(corr_ratio) - -###### FAMD - -def summaryFAMD(self, - digits=3, - nb_element=10, - ncp=3, - to_markdown=False, - tablefmt = "pipe", - **kwargs): - """Printing summaries of factor analysis of mixed data model - - Parameters - ---------- - self : an object of class FAMD. - digits : int, default=3. Number of decimals printed - nb_element : int, default = 10. Number of elements shown - ncp : int, default = 3. Number of components - to_markdown : Print DataFrame in Markdown-friendly format. - tablefmt : Table format. For more about tablefmt, see : https://pypi.org/project/tabulate/ - **kwargs : These parameters will be passed to tabulate. 
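- - Examples - -------- - A hypothetical call, assuming `res` is a FAMD model already fitted with this package (`res` and the chosen settings are illustrative, not part of the original docstring): - >>> summaryFAMD(res, digits=2, nb_element=5, ncp=2, to_markdown=True)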
- """ - - row = get_famd_ind(self) - mod = get_famd_mod(self) - var = get_famd_var(self) - col = get_famd_col(self) - - ncp = min(ncp,self.n_components_) - nb_element = min(nb_element,len(self.row_labels_)) - - # Principal Components Analysis Results - print(" Factor Analysis of Mixed Data - Results \n") - - # Add eigenvalues informations - print("Importance of components") - eig = pd.DataFrame(self.eig_,columns=self.dim_index_, - index=["Variance","Difference","% of var.","Cumulative of % of var."]).round(decimals=digits) - if to_markdown: - print(eig.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(eig) - - # Add individuals informations - print(f"\nIndividuals (the {nb_element} first)\n") - row_infos = row["infos"] - for i in np.arange(0,ncp,1): - row_coord = row["coord"].iloc[:,i] - row_cos2 = row["cos2"].iloc[:,i] - row_cos2.name = "cos2" - row_ctr = row["contrib"].iloc[:,i] - row_ctr.name = "ctr" - row_infos = pd.concat([row_infos,row_coord,row_ctr,row_cos2],axis=1) - row_infos = row_infos.iloc[:nb_element,:].round(decimals=digits) - if to_markdown: - print(row_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(row_infos) - - # Add supplementary individuals - if self.row_sup_labels_ is not None: - print(f"\nSupplementary individuals\n") - row_sup = row["ind_sup"] - row_sup_infos = row_sup["dist"] - for i in np.arange(0,ncp,1): - row_sup_coord = row_sup["coord"].iloc[:,i] - row_sup_cos2 = row_sup["cos2"].iloc[:,i] - row_sup_cos2.name = "cos2" - row_sup_infos = pd.concat([row_sup_infos,row_sup_coord,row_sup_cos2],axis=1) - row_sup_infos = row_sup_infos.round(decimals=digits) - if to_markdown: - print(row_sup_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(row_sup_infos) - - # Add variables informations - print(f"\nContinuous variables\n") - col_infos = pd.DataFrame(index=self.col_labels_).astype("float") - for i in np.arange(0,ncp,1): - col_coord = col["coord"].iloc[:,i] - col_cos2 = col["cos2"].iloc[:,i] - col_cos2.name = "cos2" - col_ctr = col["contrib"].iloc[:,i] - col_ctr.name = "ctr" - col_infos = pd.concat([col_infos,col_coord,col_ctr,col_cos2],axis=1) - col_infos = col_infos.round(decimals=digits) - if to_markdown: - print(col_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(col_infos) - - # Add supplementary continuous variables informations - if self.quanti_sup_labels_ is not None: - print(f"\nSupplementary continuous variable\n") - col_sup_infos = pd.DataFrame(index=self.quanti_sup_labels_).astype("float") - col_sup = col["quanti_sup"] - for i in np.arange(0,ncp,1): - col_sup_coord = col_sup["coord"].iloc[:,i] - col_sup_cos2 = col_sup["cos2"].iloc[:,i] - col_sup_cos2.name = "cos2" - col_sup_infos =pd.concat([col_sup_infos,col_sup_coord,col_sup_cos2],axis=1) - col_sup_infos = col_sup_infos.round(decimals=digits) - - if to_markdown: - print(col_sup_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(col_sup_infos) - - # Add variables informations - print(f"\nCategories\n") - mod_infos = mod["infos"] - for i in np.arange(0,ncp,1): - mod_coord = mod["coord"].iloc[:,i] - mod_cos2 = mod["cos2"].iloc[:,i] - mod_cos2.name = "cos2" - mod_ctr = mod["contrib"].iloc[:,i] - mod_ctr.name = "ctr" - mod_vtest = mod["vtest"].iloc[:,i] - mod_vtest.name = "vtest" - mod_infos = pd.concat([mod_infos,mod_coord,mod_ctr,mod_cos2,mod_vtest],axis=1) - mod_infos = mod_infos.round(decimals=digits) - if to_markdown: - print(mod_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(mod_infos) - - # Add variables - print("\nCategorical 
variables\n") - var_infos = pd.DataFrame(index=self.quali_sup_labels_).astype("float") - for i in np.arange(0,ncp,1): - var_eta2 = var["eta2"].iloc[:,i] - var_eta2.name = "eta2."+str(i+1) - var_cos2 = var["cos2"].iloc[:,i] - var_cos2.name = "cos2." +str(i+1) - var_infos = pd.concat([var_infos,var_eta2,var_cos2],axis=1) - var_infos = var_infos.round(decimals=digits) - if to_markdown: - print(var_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(var_infos) - - # Add Supplementary categories – Variable illustrative qualitative - if self.quali_sup_labels_ is not None: - print("\nSupplementary categories\n") - mod_sup = col["quali_sup"] - mod_sup_infos = np.sqrt(mod_sup["dist"]) - for i in np.arange(0,ncp,1): - mod_sup_coord = mod_sup["coord"].iloc[:,i] - mod_sup_cos2 = mod_sup["cos2"].iloc[:,i] - mod_sup_cos2.name = "cos2" - mod_sup_vtest = mod_sup["vtest"].iloc[:,i] - mod_sup_vtest.name = "v.test" - mod_sup_infos = pd.concat([mod_sup_infos,mod_sup_coord,mod_sup_cos2,mod_sup_vtest],axis=1) - mod_sup_infos = mod_sup_infos.round(decimals=digits) - - if to_markdown: - print(mod_sup_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(mod_sup_infos) - - # Add supplementary qualitatives - correlation ration - print("\nSupplementatry categorical variable\n") - corr_ratio = mod_sup["eta2"].iloc[:,:ncp].round(decimals=digits) - if to_markdown: - print(corr_ratio.to_markdown(tablefmt=tablefmt)) - else: - print(corr_ratio) - -########" MCA" - -def summaryMCA(self,digits=3,nb_element=10,ncp=3,to_markdown=False,tablefmt = "pipe",**kwargs): - """Printing summaries of multiple correspondence analysis model - Parameters - ---------- - self : an obect of class MCA. - digits : int, default=3. Number of decimal printed - nb_element : int, default = 10. Number of element - ncp : int, default = 3. Number of componennts - to_markdown : Print DataFrame in Markdown-friendly format. - tablefmt : Table format. For more about tablefmt, see : https://pypi.org/project/tabulate/ - **kwargs : These parameters will be passed to tabulate. 
- """ - - row = get_mca(self,choice="ind") - mod = get_mca(self,choice="mod") - var = get_mca(self,choice="var") - - - ncp = min(ncp,self.n_components_) - nb_element = min(nb_element,len(self.row_labels_),len(self.mod_labels_)) - - # Multiple correspondance Analysis - Results - print(" Multiple Correspondance Analysis - Results \n") - - # Add eigenvalues informations - print("Importance of components") - eig = pd.DataFrame(self.eig_,columns=self.dim_index_, - index=["Variance","Difference","% of var.","Cumulative of % of var."]).round(decimals=digits) - if to_markdown: - print(eig.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(eig) - - # Add individuals informations - print(f"\nIndividuals (the {nb_element} first)\n") - row_infos = row["infos"] - for i in np.arange(0,ncp,1): - row_coord = row["coord"].iloc[:,i] - row_cos2 = row["cos2"].iloc[:,i] - row_cos2.name = "cos2" - row_ctr = row["contrib"].iloc[:,i] - row_ctr.name = "ctr" - row_infos = pd.concat([row_infos,row_coord,row_ctr,row_cos2],axis=1) - row_infos = row_infos.iloc[:nb_element,:].round(decimals=digits) - if to_markdown: - print(row_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(row_infos) - - # Add supplementary individuals - if self.row_sup_labels_ is not None: - nb_elt = min(nb_element,len(self.row_sup_labels_)) - print(f"\nSupplementary Individuals\n") - # Save all informations - row_sup_infos = pd.DataFrame(index=self.row_sup_labels_).astype("float") - row_sup = row["ind_sup"] - for i in np.arange(0,ncp,1): - row_sup_coord = row_sup["coord"].iloc[:,i] - row_sup_cos2 = row_sup["cos2"].iloc[:,i] - row_sup_cos2.name = "cos2" - row_sup_infos = pd.concat([row_sup_infos,row_sup_coord,row_sup_cos2],axis=1) - row_sup_infos = row_sup_infos.iloc[:nb_elt,:].round(decimals=digits) - if to_markdown: - print(row_sup_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(row_sup_infos) - - # Add variables informations - print(f"\nCategories\n") - mod_infos = mod["infos"] - for i in np.arange(0,ncp,1): - mod_coord = mod["coord"].iloc[:,i] - mod_cos2 = mod["cos2"].iloc[:,i] - mod_cos2.name = "cos2" - mod_ctr = mod["contrib"].iloc[:,i] - mod_ctr.name = "ctr" - mod_vtest = mod["vtest"].iloc[:,i] - mod_vtest.name = "vtest" - mod_infos = pd.concat([mod_infos,mod_coord,mod_ctr,mod_cos2,mod_vtest],axis=1) - mod_infos = mod_infos.iloc[:nb_element,:].round(decimals=digits) - if to_markdown: - print(mod_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(mod_infos) - - # Add variables - print("\nCategorical variables\n") - var_infos = var["inertia"] - for i in np.arange(0,ncp,1): - var_eta2 = var["eta2"].iloc[:,i] - var_eta2.name = "eta2."+str(i+1) - var_cos2 = var["cos2"].iloc[:,i] - var_cos2.name = "cos2." 
+str(i+1) - var_ctr = var["contrib"].iloc[:,i] - var_ctr.name = "ctr."+str(i+1) - var_infos = pd.concat([var_infos,var_eta2,var_ctr,var_cos2],axis=1) - var_infos = var_infos.iloc[:nb_element,:].round(decimals=digits) - if to_markdown: - print(var_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(var_infos) - - # Add supplementary continuous variables informations - if self.quanti_sup_labels_ is not None: - print(f"\nSupplementary continuous variable\n") - col_sup_coord = var["quanti_sup"]["coord"].iloc[:,:ncp] - if to_markdown: - print(col_sup_coord.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(col_sup_coord) - - # Add Supplementary categories – Variable illustrative qualitative - if self.quali_sup_labels_ is not None: - print("\nSupplementary categories\n") - mod_sup = mod["quali_sup"] - mod_sup_infos = np.sqrt(mod_sup["dist"]) - for i in np.arange(0,ncp,1): - mod_sup_coord = mod_sup["coord"].iloc[:,i] - mod_sup_cos2 = mod_sup["cos2"].iloc[:,i] - mod_sup_cos2.name = "cos2" - mod_sup_vtest = mod_sup["vtest"].iloc[:,i] - mod_sup_vtest.name = "v.test" - mod_sup_infos = pd.concat([mod_sup_infos,mod_sup_coord,mod_sup_cos2,mod_sup_vtest],axis=1) - mod_sup_infos = mod_sup_infos.round(decimals=digits) - - if to_markdown: - print(mod_sup_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(mod_sup_infos) - - # Add supplementary qualitatives - correlation ration - print("\nSupplementatry qualitative variable - Correlation ratio\n") - corr_ratio = mod_sup["eta2"].iloc[:,:ncp].round(decimals=digits) - if to_markdown: - print(corr_ratio.to_markdown(tablefmt=tablefmt)) - else: - print(corr_ratio) - -###### PCA - -def summaryPCA(self, - digits=3, - nb_element=10, - ncp=3, - to_markdown=False, - tablefmt = "pipe", - **kwargs): - """Printing summaries of principal component analysis model - - Parameters - ---------- - self : an obect of class PCA. - digits : int, default=3. Number of decimal printed - nb_element : int, default = 10. Number of element - ncp : int, default = 3. Number of componennts - to_markdown : Print DataFrame in Markdown-friendly format. - tablefmt : Table format. For more about tablefmt, see : https://pypi.org/project/tabulate/ - **kwargs : These parameters will be passed to tabulate. 
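- - Examples - -------- - A hypothetical call, assuming `res` is a PCA model already fitted with this package (`res` and the chosen settings are illustrative, not part of the original docstring): - >>> summaryPCA(res, digits=2, nb_element=5, ncp=2, to_markdown=True)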
- """ - - row = get_pca(self,choice="row") - col = get_pca(self,choice="var") - - - ncp = min(ncp,self.n_components_) - nb_element = min(nb_element,len(self.row_labels_)) - - # Principal Components Analysis Results - print(" Principal Component Analysis - Results \n") - - # Add eigenvalues informations - print("Importance of components") - eig = pd.DataFrame(self.eig_,columns=self.dim_index_, - index=["Variance","Difference","% of var.","Cumulative of % of var."]).round(decimals=digits) - if to_markdown: - print(eig.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(eig) - - # Add individuals informations - print(f"\nIndividuals (the {nb_element} first)\n") - row_infos = row["infos"] - for i in np.arange(0,ncp,1): - row_coord = row["coord"].iloc[:,i] - row_cos2 = row["cos2"].iloc[:,i] - row_cos2.name = "cos2" - row_ctr = row["contrib"].iloc[:,i] - row_ctr.name = "ctr" - row_infos = pd.concat([row_infos,row_coord,row_ctr,row_cos2],axis=1) - row_infos = row_infos.iloc[:nb_element,:].round(decimals=digits) - if to_markdown: - print(row_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(row_infos) - - # Add supplementary individuals - if self.row_sup_labels_ is not None: - print(f"\nSupplementary Individuals\n") - # Save all informations - row_sup_infos = pd.DataFrame(index=self.row_sup_labels_).astype("float") - row_sup = row["ind_sup"] - for i in np.arange(0,ncp,1): - row_sup_coord = row_sup["coord"].iloc[:,i] - row_sup_cos2 = row_sup["cos2"].iloc[:,i] - row_sup_cos2.name = "cos2" - row_sup_infos = pd.concat([row_sup_infos,row_sup_coord,row_sup_cos2],axis=1) - row_sup_infos = row_sup_infos.round(decimals=digits) - if to_markdown: - print(row_sup_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(row_sup_infos) - - # Add variables informations - print(f"\nContinues variables\n") - col_infos = pd.DataFrame(index=self.col_labels_).astype("float") - for i in np.arange(0,ncp,1): - col_coord = col["coord"].iloc[:,i] - col_cos2 = col["cos2"].iloc[:,i] - col_cos2.name = "cos2" - col_ctr = col["contrib"].iloc[:,i] - col_ctr.name = "ctr" - col_infos = pd.concat([col_infos,col_coord,col_ctr,col_cos2],axis=1) - col_infos = col_infos.round(decimals=digits) - if to_markdown: - print(col_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(col_infos) - - # Add supplementary continuous variables informations - if self.quanti_sup_labels_ is not None: - print(f"\nSupplementary continuous variable\n") - col_sup_infos = pd.DataFrame(index=self.quanti_sup_labels_).astype("float") - col_sup = col["quanti_sup"] - for i in np.arange(0,ncp,1): - col_sup_coord = col_sup["coord"].iloc[:,i] - col_sup_cos2 = col_sup["cos2"].iloc[:,i] - col_sup_cos2.name = "cos2" - col_sup_infos =pd.concat([col_sup_infos,col_sup_coord,col_sup_cos2],axis=1) - col_sup_infos = col_sup_infos.round(decimals=digits) - - if to_markdown: - print(col_sup_infos.to_markdown(tablefmt=tablefmt,**kwargs)) - else: - print(col_sup_infos) - - # Add Supplementary categories – Variable illustrative qualitative - if self.quali_sup_labels_ is not None: - print("\nSupplementary categories\n") - mod_sup = col["quali_sup"] - mod_sup_infos = np.sqrt(mod_sup["dist"]) - for i in np.arange(0,ncp,1): - mod_sup_coord = mod_sup["coord"].iloc[:,i] - mod_sup_cos2 = mod_sup["cos2"].iloc[:,i] - mod_sup_cos2.name = "cos2" - mod_sup_vtest = mod_sup["vtest"].iloc[:,i] - mod_sup_vtest.name = "v.test" - mod_sup_infos = pd.concat([mod_sup_infos,mod_sup_coord,mod_sup_cos2,mod_sup_vtest],axis=1) - mod_sup_infos = 
diff --git a/build/lib/scientisttools/ggcorrplot.py b/build/lib/scientisttools/ggcorrplot.py
deleted file mode 100644
index d5eeb2c..0000000
--- a/build/lib/scientisttools/ggcorrplot.py
+++ /dev/null
@@ -1,238 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import numpy as np
-import pandas as pd
-import plotnine as pn
-import scipy.stats as st
-import plydata as ply
-from scipy.cluster import hierarchy
-from scipy.spatial.distance import squareform
-from scientisttools.utils import get_melt
-
-def hc_cormat_order(cormat, method='complete'):
-    if not isinstance(cormat,pd.DataFrame):
-        raise ValueError("Error : 'cormat' must be a DataFrame.")
-    X = (1-cormat)/2
-    Z = hierarchy.linkage(squareform(X),method=method, metric="euclidean")
-    order = hierarchy.leaves_list(Z)
-    return dict({"order":order,"height":Z[:,2],"method":method,
-                 "merge":Z[:,:2],"n_obs":Z[:,3],"data":cormat})
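hc_cormat_order turns the correlation matrix into the dissimilarity (1 - r)/2, clusters it hierarchically, and returns the leaf order that ggcorrplot uses to reorder rows and columns. A small sketch under invented data:

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(50, 4)), columns=["a", "b", "c", "d"])
cormat = df.corr()

res = hc_cormat_order(cormat, method="complete")
# Same reordering ggcorrplot applies when hc_order=True
ordered = cormat.iloc[res["order"], res["order"]]
print(ordered.columns.tolist())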
dict({"order":order,"height":Z[:,2],"method":method, - "merge":Z[:,:2],"n_obs":Z[:,3],"data":cormat}) - -def match_arg(x, lst): - return [el for el in lst if x in el][0] - -def no_panel(): - return pn.theme( - axis_title_x=pn.element_blank(), - axis_title_y=pn.element_blank() - ) - -def remove_diag(cormat): - if cormat is None: - return cormat - if not isinstance(cormat,pd.DataFrame): - raise ValueError("Error : 'cormat' must be a DataFrame.") - np.fill_diagonal(cormat.values, np.nan) - return cormat - -def get_upper_tri(cormat,show_diag=False): - if cormat is None: - return cormat - if not isinstance(cormat,pd.DataFrame): - raise ValueError("Error : 'cormat' must be a DataFrame.") - cormat = pd.DataFrame(np.triu(cormat),index=cormat.index,columns=cormat.columns) - cormat.values[np.tril_indices(cormat.shape[0], -1)] = np.nan - if not show_diag: - np.fill_diagonal(cormat.values,np.nan) - return cormat - -def get_lower_tri(cormat,show_diag=False): - if cormat is None: - return cormat - if not isinstance(cormat,pd.DataFrame): - raise ValueError("Error : 'cormat' must be a DataFrame.") - cormat = pd.DataFrame(np.tril(cormat),index=cormat.index,columns=cormat.columns) - cormat.values[np.triu_indices(cormat.shape[0], 1)] = np.nan - if not show_diag: - np.fill_diagonal(cormat.values,np.nan) - return cormat - -def cor_pmat(x,**kwargs): - if not isinstance(x,pd.DataFrame): - raise ValueError("Error : 'x' must be a DataFrame.") - y = np.array(x) - n = y.shape[1] - p_mat = np.zeros((n,n)) - np.fill_diagonal(p_mat,0) - for i in np.arange(0,n-1): - for j in np.arange(i+1,n): - tmps = st.pearsonr(y[:,i],y[:,j],**kwargs) - p_mat[i,j] = p_mat[j,i] = tmps[1] - p_mat = pd.DataFrame(p_mat,index=x.columns,columns=x.columns) - return p_mat - -def ggcorrplot(x, - method = "square", - type = "full", - ggtheme = pn.theme_minimal(), - title = None, - show_legend = True, - legend_title = "Corr", - show_diag = None, - colors = ["blue","white","red"], - outline_color = "gray", - hc_order = False, - hc_method = "complete", - lab = False, - lab_col = "black", - lab_size = 11, - p_mat = None, - sig_level=0.05, - insig = "pch", - pch = 4, - pch_col = "black", - pch_cex = 5, - tl_cex = 12, - tl_col = "black", - tl_srt = 45, - digits = 2): - - if not isinstance(x,pd.DataFrame): - raise ValueError("Error : 'x' must be a DataFrame.") - - if p_mat is not None: - if not isinstance(p_mat,pd.DataFrame): - raise ValueError("Error : 'p_mat' must be a DataFrame.") - - type = match_arg(type, ["full","lower","upper"]) - method = match_arg(method,["square",'circle']) - insig = match_arg(insig,["pch","blank"]) - - if show_diag is None: - if type == "full": - show_diag = True - else: - show_diag = False - - corr = x.corr().round(decimals=digits) - - if hc_order: - ord = hc_cormat_order(corr,method=hc_method)["order"] - corr = corr.iloc[ord,ord] - if p_mat is not None: - p_mat = p_mat.iloc[ord,ord] - p_mat = p_mat.round(decimals=digits) - - if not show_diag: - corr = remove_diag(corr) - if p_mat is not None: - p_mat = remove_diag(p_mat) - - # Get lower or upper triangle - if type == "lower": - corr = get_lower_tri(corr,show_diag) - if p_mat is not None: - p_mat = get_lower_tri(p_mat,show_diag) - elif type == "upper": - corr = get_upper_tri(corr,show_diag) - if p_mat is not None: - p_mat = get_upper_tri(corr,show_diag) - - # Melt corr and p_mat - corr.columns = pd.Categorical(corr.columns,categories=corr.columns) - corr.index = pd.Categorical(corr.columns,categories=corr.columns) - corr = get_melt(corr) - - corr = corr >> 
ply.define(pvalue=np.nan) - corr = corr >> ply.define(signif=np.nan) - - if p_mat is not None: - p_mat = get_melt(p_mat) - corr = corr >> ply.define(coef="value") - corr = corr >> ply.mutate(pvalue=p_mat.value) - corr["signif"] = np.where(p_mat.value <= sig_level,1,0) - p_mat = p_mat.query(f'value > {sig_level}') - if insig == "blank": - corr = corr >> ply.mutate(value="value*signif") - - corr = corr >> ply.define(abs_corr="abs(value)*10") - - p = pn.ggplot(corr,pn.aes(x="Var1",y="Var2",fill="value")) - - # Modification based on method - if method == "square": - p = p + pn.geom_tile(color=outline_color) - elif method == "circle": - p = p+pn.geom_point(pn.aes(size="abs_corr"), - color=outline_color, - shape="o")+pn.scale_size_continuous(range=(4,10))+pn.guides(size=None) - - # Adding colors - p =p + pn.scale_fill_gradient2( - low = colors[0], - high = colors[2], - mid = colors[1], - midpoint = 0, - limits = [-1,1], - name = legend_title - ) - - # depending on the class of the object, add the specified theme - p = p + ggtheme - - p =p+pn.theme( - axis_text_x=pn.element_text(angle=tl_srt, - va="center", - size=tl_cex, - ha="center", - color=tl_col), - axis_text_y=pn.element_text(size=tl_cex) - ) + pn.coord_fixed() - - label = corr["value"].round(digits) - - if p_mat is not None and insig == "blank": - ns = corr["pvalue"] > sig_level - if sum(ns) > 0: - label[ns] = " " - - # matrix cell labels - if lab: - p = p + pn.geom_text(mapping=pn.aes(x="Var1",y="Var2"), - label = label, - color=lab_col, - size=lab_size) - - # matrix cell - if p_mat is not None and insig == "pch": - p = p + pn.geom_point(data = p_mat, - mapping = pn.aes(x = "Var1",y = "Var2"), - shape = pch, - size=pch_cex, - color= pch_col) - - if title is not None: - p = p + pn.ggtitle(title=title) - - # Removing legend - if not show_legend: - p =p+pn.theme(legend_position=None) - - # Removing panel - p = p + no_panel() - - return p - - - - - - - - - - - - - - diff --git a/build/lib/scientisttools/ggplot.py b/build/lib/scientisttools/ggplot.py deleted file mode 100644 index 43399f7..0000000 --- a/build/lib/scientisttools/ggplot.py +++ /dev/null @@ -1,2434 +0,0 @@ -# -*- coding: utf-8 -*- - -import plotnine as pn -import numpy as np -import pandas as pd -from scientisttools.extractfactor import get_eigenvalue -import matplotlib.pyplot as plt - -def text_label(texttype,**kwargs): - """Function to choose between ``geom_text`` and ``geom_label`` - - Parameters - ---------- - text_type : {"text", "label"}, default = "text" - - **kwargs : geom parameters - - return - ------ - - - """ - if texttype == "text": - return pn.geom_text(**kwargs) - elif texttype == "label": - return pn.geom_label(**kwargs) - -def gg_circle(r, xc, yc, color="black",fill=None,**kwargs): - seq1 = np.linspace(0,np.pi,num=100) - seq2 = np.linspace(0,-np.pi,num=100) - x = xc + r*np.cos(seq1) - ymax = yc + r*np.sin(seq1) - ymin = yc + r*np.sin(seq2) - return pn.annotate("ribbon", x=x, ymin=ymin, ymax=ymax, color=color, fill=fill,**kwargs) - - -def fviz_screeplot(self, - choice="proportion", - geom_type=["bar","line"], - bar_fill = "steelblue", - bar_color="steelblue", - line_color="black", - line_type="solid", - bar_width=None, - add_kaiser=False, - add_kss = False, - add_broken_stick = False, - n_components=10, - add_labels=False, - h_align= "center", - v_align = "bottom", - title=None, - x_label=None, - y_label=None, - ggtheme=pn.theme_gray())-> plt: - """ - - - """ - - if self.model_ not in ["pca","ca","mca","famd","mfa","cmds"]: - raise ValueError("'res' must be an 
object of class PCA, CA, MCA, FAMD, MFA, CMDS")
-
-    eig = get_eigenvalue(self)
-    eig = eig.iloc[:min(n_components,self.n_components_),:]
-
-    if choice == "eigenvalue":
-        eig = eig["eigenvalue"]
-        text_labels = list([str(np.around(x,3)) for x in eig.values])
-        if self.model_ != "cmds":
-            kaiser = self.kaiser_threshold_
-        if y_label is None:
-            y_label = "Eigenvalue"
-    elif choice == "proportion":
-        eig = eig["proportion"]
-        text_labels = list([str(np.around(x,1))+"%" for x in eig.values])
-        if self.model_ != "pca":
-            kaiser = self.kaiser_proportion_threshold_
-    else:
-        raise ValueError("Allowed values for the argument choice are : 'proportion' or 'eigenvalue'")
-
-    df_eig = pd.DataFrame({"dim" : pd.Categorical(np.arange(1,len(eig)+1)),"eig" : eig.values})
-
-    p = pn.ggplot(df_eig,pn.aes(x = "dim",y="eig",group = 1))
-    if "bar" in geom_type:
-        p = p + pn.geom_bar(stat="identity",fill=bar_fill,color=bar_color,width=bar_width)
-    if "line" in geom_type:
-        p = (p + pn.geom_line(color=line_color,linetype=line_type)+
-             pn.geom_point(shape="o",color=line_color))
-    if add_labels:
-        p = p + pn.geom_text(label=text_labels,ha = h_align,va = v_align)
-    if add_kaiser:
-        p = (p + pn.geom_hline(yintercept=kaiser,linetype="--", color="red")+
-             pn.annotate("text", x=int(np.median(np.arange(1,len(eig)+1))), y=kaiser,
-                         label="Kaiser threshold"))
-
-    if add_kss:
-        if self.model_ in ["pca","ppca"]:
-            if choice == "eigenvalue":
-                p = (p + pn.geom_hline(yintercept=self.kss_threshold_,linetype="--", color="yellow")+
-                     pn.annotate("text", x=int(np.mean(np.arange(1,len(eig)+1))), y=self.kss_threshold_,
-                                 label="Karlis - Saporta - Spinaki threshold",colour = "yellow"))
-            else:
-                raise ValueError("'add_kss' is only available with 'choice=eigenvalue'.")
-        else:
-            raise ValueError("'add_kss' is only available for classes PCA and PPCA.")
-    if add_broken_stick:
-        if choice == "eigenvalue":
-            if self.model_ in ["pca","ppca"]:
-                bst = self.broken_stick_threshold_[:min(n_components,self.n_components_)]
-                p = (p + pn.geom_line(pn.aes(x="dim",y=bst),color="green",linetype="--")+
-                     pn.geom_point(pn.aes(x="dim",y=bst),colour="green")+
-                     pn.annotate("text", x=int(np.mean(np.arange(1,len(eig)+1))), y=np.median(bst),
-                                 label="Broken stick threshold",colour = "green"))
-            else:
-                raise ValueError("'add_broken_stick' is only available for classes PCA and PPCA.")
-
-    if title is None:
-        title = "Scree plot"
-    if x_label is None:
-        x_label = "Dimensions"
-    if y_label is None:
-        y_label = "Percentage of explained variance"
-
-    p = p + pn.labs(title = title, x = x_label, y = y_label)
-    p = p + ggtheme
-    return p
-
-def fviz_eigenvalue(self,**kwargs):
-    return fviz_screeplot(self,**kwargs)
-
-def fviz_eig(self,**kwargs):
-    return fviz_screeplot(self,**kwargs)
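A usage sketch for the scree plot helper, continuing the pca fit assumed in the earlier examples; all arguments shown come from the fviz_screeplot signature above.

# Proportion of explained variance, bars plus a connected line
p = fviz_screeplot(pca, choice="proportion", geom_type=["bar", "line"],
                   n_components=5, add_labels=True)
print(p)

# Raw eigenvalues with the Kaiser cut-off; fviz_eig is an alias
p = fviz_eig(pca, choice="eigenvalue", add_kaiser=True)
print(p)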
-
-####################################################################################
-#           Principal Components Analysis (PCA)
-####################################################################################
-
-def fviz_pca_ind(self,
-                 axis=[0,1],
-                 xlim=None,
-                 ylim=None,
-                 title =None,
-                 color ="blue",
-                 gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"),
-                 point_size = 1.5,
-                 text_size = 8,
-                 text_type = "text",
-                 marker = "o",
-                 add_grid =True,
-                 ind_sup=False,
-                 color_sup = "red",
-                 marker_sup = "^",
-                 add_ellipse=False,
-                 ellipse_type = "t",
-                 confint_level = 0.95,
-                 geom_ellipse = "polygon",
-                 habillage = None,
-                 short_labels=True,
-                 add_hline = True,
-                 add_vline=True,
-                 ha="center",
-                 va="center",
-                 hline_color="black",
-                 hline_style="dashed",
-                 vline_color="black",
-                 vline_style ="dashed",
-                 repel=False,
-                 ggtheme=pn.theme_gray()) -> plt:
-    """Draw the individuals factor map for a PCA
-
-    """
-
-    if self.model_ != "pca":
-        raise ValueError("Error : 'self' must be an instance of class PCA.")
-
-    if ((len(axis) != 2) or
-        (axis[0] < 0) or
-        (axis[1] > self.n_components_-1) or
-        (axis[0] > axis[1])):
-        raise ValueError("Error : You must pass a valid 'axis'.")
-
-    coord = pd.DataFrame(self.row_coord_,index = self.row_labels_,columns=self.dim_index_)
-
-    # Add categorical supplementary variables
-    if self.quali_sup_labels_ is not None:
-        coord[self.quali_sup_labels_] = self.data_[self.quali_sup_labels_]
-
-    # Initialize
-    p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_labels_))
-
-    if color == "cos2":
-        limits = [0,1]
-        legend_title = "cos2"
-        midpoint = 0.5
-        c = np.sum(self.row_cos2_[:,axis],axis=1)
-    elif color == "contrib":
-        midpoint = 50
-        limits = [0,100]
-        legend_title = "Contrib"
-        c = np.sum(self.row_contrib_[:,axis],axis=1)
-
-    if habillage is None:
-        # Using cosines and contributions
-        if color in ["cos2","contrib"]:
-            # Add gradient colors
-            p = p + pn.geom_point(pn.aes(colour=c),shape=marker,size=point_size,show_legend=False)
-            p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits,
-                                             name = legend_title)
-            if repel:
-                p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha,
-                                   adjust_text={'arrowprops': {'arrowstyle': '->','color': "black",'lw':1.0}})
-            else:
-                p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha)
-        else:
-            p = p + pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False)
-            if repel:
-                p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha,
-                                   adjust_text={'arrowprops': {'arrowstyle': '->','color': color,'lw':1.0}})
-            else:
-                p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha)
-    else:
-        if self.quali_sup_labels_ is not None:
-            p = p + pn.geom_point(pn.aes(color = habillage,linetype = habillage),size=point_size,shape=marker)
-            if repel:
-                p = p + text_label(text_type,mapping=pn.aes(color=habillage),size=text_size,va=va,ha=ha,
-                                   adjust_text={'arrowprops': {'arrowstyle': '->','lw':1.0}})
-            else:
-                p = p + text_label(text_type,mapping=pn.aes(color=habillage),size=text_size,va=va,ha=ha)
-
-        if add_ellipse:
-            p = p + pn.geom_point(pn.aes(color = habillage))
-            p = p + pn.stat_ellipse(geom=geom_ellipse,mapping=pn.aes(fill=habillage),type = ellipse_type,alpha = 0.25,level=confint_level)
-
-    if ind_sup:
-        if self.row_sup_labels_ is not None:
-            sup_coord = pd.DataFrame(self.row_sup_coord_,index=self.row_sup_labels_,columns=self.dim_index_)
-
-            p = p + pn.geom_point(sup_coord,pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_sup_labels_),
-                                  color = color_sup,shape = marker_sup,size=point_size)
-            if repel:
-                p = p + text_label(text_type,data=sup_coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_sup_labels_),
-                                   color=color_sup,size=text_size,va=va,ha=ha,
-                                   adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,'lw':1.0}})
-            else:
-                p = p + text_label(text_type,data=sup_coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_sup_labels_),
-                                   color = color_sup,size=text_size,va=va,ha=ha)
-    if self.quali_sup_labels_ is not None:
-        if habillage is None:
-            if short_labels:
-                mod_sup_labels = self.short_sup_labels_
-            else:
-                mod_sup_labels = self.mod_sup_labels_
-
-            mod_sup_coord = pd.DataFrame(self.mod_sup_coord_,columns=self.dim_index_)
-
-            p = p + pn.geom_point(mod_sup_coord,pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels),color="red",size=point_size)
-
-            if repel:
-                p = p + text_label(text_type,data=mod_sup_coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels),
-                                   color="red",size=text_size,va=va,ha=ha,
-                                   adjust_text={'arrowprops': {'arrowstyle': '->','color': "black",'lw':1.0}})
-            else:
-                p = p + text_label(text_type,data=mod_sup_coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels),
-                                   color ="red",size=text_size,va=va,ha=ha)
-
-    # Add additional information
-    proportion = self.eig_[2]
-    xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)"
-    ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)"
-
-    if title is None:
-        title = "Individuals factor map - PCA"
-
-    if ((xlim is not None) and isinstance(xlim,(list,tuple))):
-        p = p + pn.xlim(xlim)
-    if ((ylim is not None) and isinstance(ylim,(list,tuple))):
-        p = p + pn.ylim(ylim)
-
-    p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel)
-
-    if add_hline:
-        p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style)
-
-    if add_vline:
-        p = p + pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style)
-
-    if add_grid:
-        p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed"))
-
-    # Add theme
-    p = p + ggtheme
-
-    return p
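A sketch of the individuals map, continuing the assumed pca fit; coloring by squared cosines on the first two axes uses only arguments from the signature above.

p = fviz_pca_ind(pca, axis=[0, 1], color="cos2", repel=True)
print(p)

# Grouping with per-group ellipses requires supplementary qualitative data;
# the "group" column name is hypothetical
# p = fviz_pca_ind(pca, habillage="group", add_ellipse=True)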
-
-def fviz_pca_var(self,
-                 axis=[0,1],
-                 title =None,
-                 color ="blue",
-                 gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"),
-                 text_type = "text",
-                 text_size = 8,
-                 add_grid =True,
-                 quanti_sup=True,
-                 color_sup = "red",
-                 add_hline = True,
-                 add_vline=True,
-                 ha="center",
-                 va="center",
-                 limits = None,
-                 hline_color="black",
-                 hline_style="dashed",
-                 vline_color="black",
-                 vline_style ="dashed",
-                 repel=False,
-                 add_circle = True,
-                 ggtheme=pn.theme_gray()) -> plt:
-    """Draw the variables factor map (correlation circle) for a PCA
-
-    """
-
-    if self.model_ != "pca":
-        raise ValueError("Error : 'self' must be an instance of class PCA.")
-
-    if ((len(axis) != 2) or
-        (axis[0] < 0) or
-        (axis[1] > self.n_components_-1) or
-        (axis[0] > axis[1])):
-        raise ValueError("Error : You must pass a valid 'axis'.")
-
-    coord = pd.DataFrame(self.col_coord_,index = self.col_labels_,columns=self.dim_index_)
-
-    # Initialize
-    p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.col_labels_))
-
-    if color == "cos2":
-        legend_title = "cos2"
-        midpoint = 0.5
-        c = np.sum(self.col_cos2_[:,axis],axis=1)
-        if limits is None:
-            limits = list([np.min(c),np.max(c)])
-    elif color == "contrib":
-        midpoint = 50
-        legend_title = "Contrib"
-        c = np.sum(self.col_contrib_[:,axis],axis=1)
-        if limits is None:
-            limits = list([np.min(c),np.max(c)])
-
-    if color in ["cos2","contrib"]:
-        # Add gradient colors
-        p = p + pn.geom_segment(pn.aes(x=0,y=0,xend=f"Dim.{axis[0]+1}",yend=f"Dim.{axis[1]+1}",colour=c), arrow = pn.arrow())
-        p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits,name = legend_title)
-        if repel:
-            p = p + text_label(text_type,mapping=pn.aes(colour=c),size=text_size,va=va,ha=ha,
-                               adjust_text={'arrowprops': {'arrowstyle': '->','color': 'black','lw':1.0}})
-        else:
-            p = p + text_label(text_type,mapping=pn.aes(colour=c),size=text_size,va=va,ha=ha)
-    else:
-        p = p + pn.geom_segment(pn.aes(x=0,y=0,xend=f"Dim.{axis[0]+1}",yend=f"Dim.{axis[1]+1}"), arrow = 
pn.arrow(),color=color) - if repel: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,'lw':1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - if quanti_sup: - if self.quanti_sup_labels_ is not None: - sup_coord = pd.DataFrame(self.col_sup_coord_,columns=self.dim_index_,index=self.col_sup_labels_) - p = p + pn.geom_segment(pn.aes(x=0,y=0,xend=sup_coord.iloc[:,axis[0]],yend=sup_coord.iloc[:,axis[1]]),arrow = pn.arrow(),color=color_sup) - if repel: - p = p + text_label(text_type,mapping=pn.aes(x=sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.col_sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,'lw':1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(x=sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.col_sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha) - - # Create circle - if add_circle: - p = p + gg_circle(r=1.0, xc=0.0, yc=0.0, color="black", fill=None) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Variables factor map - PCA" - - p = p + pn.xlim((-1,1))+ pn.ylim((-1,1))+ pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p+ pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -def fviz_pca(self,choice="ind",**kwargs)->plt: - """ - - """ - - if choice == "ind": - return fviz_pca_ind(self,**kwargs) - elif choice == "var": - return fviz_pca_var(self,**kwargs) - else: - raise ValueError("Error : Allowed values are 'ind' or 'var'.") - - -###################################################################################################### -## Multiple Correspondence Analysis (MCA) -###################################################################################################### - -def fviz_mca_ind(self, - axis=[0,1], - xlim=None, - ylim=None, - title =None, - color ="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - point_size = 1.5, - text_size = 8, - text_type = "text", - marker = "o", - add_grid =True, - ind_sup=True, - color_sup = "red", - marker_sup = "^", - add_ellipse=False, - ellipse_type = "t", - confint_level = 0.95, - geom_ellipse = "polygon", - habillage = None, - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - - """ - - - """ - - if self.model_ != "mca": - raise ValueError("Error : 'self' must be an instance of class MCA.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - # Initialize - coord = pd.DataFrame(self.row_coord_,index = self.row_labels_,columns=self.dim_index_) - - # Add categorical supplementary variables - if self.quali_sup_labels_ is not None: - coord[self.quali_sup_labels] = self.data_[self.quali_sup_labels_] - - if 
color == "cos2": - limits = [0,1] - legend_title = "cos2" - midpoint = 0.5 - c = np.sum(self.row_cos2_[:,axis],axis=1) - elif color == "contrib": - midpoint = 50 - limits = [0,100] - legend_title = "Contrib" - c = np.sum(self.row_contrib_[:,axis],axis=1) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_labels_)) - - if habillage is None : - # Using cosine and contributions - if color in ["cos2","contrib"]: - # Add gradients colors - p = p + pn.geom_point(pn.aes(colour=c),shape=marker,size=point_size,show_legend=False) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits, - name = legend_title) - if repel : - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': "black",'lw':1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,'lw':1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - else: - if self.quali_sup_labels_ is not None: - p = p + pn.geom_point(pn.aes(color = habillage,linetype = habillage),size=point_size,shape=marker) - if repel: - p = p + text_label(text_type,mapping=pn.aes(color=habillage),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color':"black","lw":1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(color=habillage),size=text_size,va=va,ha=ha) - - if add_ellipse: - p = p + pn.geom_point(pn.aes(color = habillage)) - p = p + pn.stat_ellipse(geom=geom_ellipse,mapping=pn.aes(fill=habillage),type = ellipse_type,alpha = 0.25,level=confint_level) - - if ind_sup: - if self.row_sup_labels_ is not None: - sup_coord = pd.DataFrame(self.row_sup_coord_,index=self.row_sup_labels_,columns=self.dim_index_) - p = p + pn.geom_point(mapping=pn.aes(x = sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.row_sup_labels_), - color = color_sup,shape = marker_sup,size=point_size) - if repel: - p = p + text_label(text_type,mapping=pn.aes(x = sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.row_sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,'lw':1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(x = sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.row_sup_labels_), - color = color_sup,size=text_size,va=va,ha=ha) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Individuals factor map - MCA" - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p+ pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + 
pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -# Graph for categories -def fviz_mca_mod(self, - axis=[0,1], - xlim=None, - ylim=None, - title =None, - color ="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - point_size = 1.5, - text_size = 8, - text_type = "text", - marker = "o", - add_grid =True, - quali_sup = True, - color_sup = "red", - marker_sup = "^", - short_labels=True, - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - - """ - - - """ - - if self.model_ != "mca": - raise ValueError("Error : 'self' must be an instance of class MCA.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - # Initialize - coord = pd.DataFrame(self.mod_coord_,index = self.mod_labels_,columns=self.dim_index_) - - # Categories labels - if short_labels: - labels = self.short_labels_ - else: - labels = self.mod_labels_ - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=labels)) - - if color == "cos2": - limits = [0,1] - legend_title = "cos2" - midpoint = 0.5 - c = np.sum(self.mod_cos2_[:,axis],axis=1) - elif color == "contrib": - midpoint = 50 - limits = [0,100] - legend_title = "Contrib" - c = np.sum(self.mod_contrib_[:,axis],axis=1) - - # Using cosine and contributions - if color in ["cos2","contrib"]: - # Add gradients colors - p = p + pn.geom_point(pn.aes(colour=c),shape=marker,size=point_size,show_legend=False) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits, - name = legend_title) - if repel : - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': "black",'lw':1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - # Add supplementary categories - if quali_sup: - if self.quali_sup_labels_ is not None: - if short_labels: - mod_sup_labels = self.short_sup_labels_ - else: - mod_sup_labels = self.mod_sup_labels_ - - mod_sup_coord = pd.DataFrame(self.mod_sup_coord_,columns=self.dim_index_,index=mod_sup_labels) - - p = p + pn.geom_point(mod_sup_coord,pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels), - color=color_sup,size=point_size,shape=marker_sup) - - if repel: - p = p + text_label(text_type,data=mod_sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels), - color=color_sup,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,'lw':1.0}}) - else: - p = p + text_label(text_type,data=mod_sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels), - color=color_sup,size=text_size,va=va,ha=ha) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" 
("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Qualitatives variables categories - MCA" - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p + pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - - -def fviz_mca_var(self, - axis=[0,1], - xlim=(0,1), - ylim=(0,1), - title=None, - color="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - point_size = 1.5, - text_size = 8, - marker="o", - text_type="text", - add_grid =True, - add_hline = True, - add_vline =True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - - if self.model_ != "mca": - raise ValueError("Error : 'self' must be an instance of class MCA.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - # Initialize - coord = pd.DataFrame(self.var_eta2_,index = self.var_labels_,columns=self.dim_index_) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.var_labels_)) - - if color == "cos2": - limits = [0,1] - legend_title = "cos2" - midpoint = 0.5 - c = np.sum(self.var_cos2_[:,axis],axis=1) - elif color == "contrib": - midpoint = 50 - limits = [0,100] - legend_title = "Contrib" - c = np.sum(self.var_contrib_[:,axis],axis=1) - - # Using cosine and contributions - if color in ["cos2","contrib"]: - # Add gradients colors - p = p + pn.geom_point(pn.aes(colour=c),shape=marker,size=point_size,show_legend=False) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits, - name = legend_title) - if repel : - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': "black",'lw':1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Graphe of variables - MCA" - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ 
pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p + pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -def fviz_mca(self,choice="ind",**kwargs)->plt: - """ - - - """ - if choice == "ind": - return fviz_mca_ind(self,**kwargs) - elif choice == "mod": - return fviz_mca_mod(self,**kwargs) - elif choice == "var": - return fviz_mca_var(self,**kwargs) - else: - raise ValueError("Error : Allowed values are 'ind', 'mod' or 'var'.") - - -################################################################################################################# -# Correspondence Analysis (CA) -################################################################################################################# - -def fviz_ca_row(self, - axis=[0,1], - xlim=None, - ylim=None, - title =None, - color ="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - point_size = 1.5, - text_size = 8, - text_type = "text", - marker = "o", - add_grid =True, - row_sup=True, - color_sup = "red", - marker_sup = "^", - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - - """ - - - """ - - if self.model_ != "ca": - raise ValueError("Error : 'self' must be an instance of class CA.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - # Initialize - coord = pd.DataFrame(self.row_coord_,index = self.row_labels_,columns=self.dim_index_) - - if color == "cos2": - limits = [0,1] - legend_title = "cos2" - midpoint = 0.5 - c = np.sum(self.row_cos2_[:,axis],axis=1) - elif color == "contrib": - midpoint = 50 - limits = [0,100] - legend_title = "Contrib" - c = np.sum(self.row_contrib_[:,axis],axis=1) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_labels_)) - - # Using cosine and contributions - if color in ["cos2","contrib"]: - # Add gradients colors - p = p + pn.geom_point(pn.aes(colour=c),shape=marker,size=point_size,show_legend=False) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits, - name = legend_title) - if repel : - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': "black","lw":1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - if row_sup: - if self.row_sup_labels_ is not None: - sup_coord = pd.DataFrame(self.row_sup_coord_,index=self.row_sup_labels_,columns=self.dim_index_) - p = p + pn.geom_point(pn.aes(x = sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.row_sup_labels_), - color = color_sup,shape = 
marker_sup,size=point_size) - if repel: - p = p + text_label(text_type,mapping=pn.aes(x = sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.row_sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,"lw":1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(x = sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.row_sup_labels_), - color = color_sup,size=text_size,va=va,ha=ha) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Row points - CA" - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p+ pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -def fviz_ca_col(self, - axis=[0,1], - xlim= None, - ylim=None, - title =None, - color ="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - text_type = "text", - marker = "o", - point_size = 1.5, - text_size = 8, - add_grid =True, - col_sup=True, - color_sup = "red", - marker_sup = "^", - add_hline = True, - add_vline=True, - ha="center", - va="center", - limits = None, - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - - """ - - - """ - - if self.model_ != "ca": - raise ValueError("Error : 'self' must be an instance of class CA.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - coord = pd.DataFrame(self.col_coord_,index = self.col_labels_,columns=self.dim_index_) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.col_labels_)) - - if color == "cos2": - legend_title = "cos2" - midpoint = 0.5 - c = np.sum(self.col_cos2_[:,axis],axis=1) - if limits is None: - limits = list([np.min(c),np.max(c)]) - elif color == "contrib": - midpoint = 50 - legend_title = "Contrib" - c = np.sum(self.col_contrib_[:,axis],axis=1) - if limits is None: - limits = list([np.min(c),np.max(c)]) - - if color in ["cos2","contrib"]: - # Add gradients colors - p = p + pn.geom_point(pn.aes(colour=c),shape=marker,size=point_size,show_legend=False) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits,name = legend_title) - if repel: - p = p + text_label(text_type,mapping=pn.aes(colour=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': 'black','lw':1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(colour=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_point(pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}"),color=color,shape=marker,size=point_size) - if repel: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, 
- adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - if col_sup: - if self.col_sup_labels_ is not None: - sup_coord = pd.DataFrame(self.col_sup_coord_,columns=self.dim_index_,index=self.col_sup_labels_) - p = p+pn.geom_point(sup_coord,pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.col_sup_labels_), - color=color_sup,shape=marker_sup,size=point_size) - if repel: - p = p + text_label(text_type,mapping=pn.aes(x=sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.col_sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha,adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,"lw":1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(x=sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.col_sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Columns points - CA" - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p+ pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -def fviz_ca(self,choice,**kwargs)->plt: - """ - - - """ - if self.model_ != "ca": - raise ValueError("Error : 'self' must be an instance of class CA.") - - - if choice == "row": - return fviz_ca_row(self,**kwargs) - elif choice == "col": - return fviz_ca_col(self,**kwargs) - else: - raise ValueError("Error : Allowed values for choice are :'row' or 'col'.") - - -#################################################################################################################### -# Factor Analyis of Mixed Data (FAMD) -#################################################################################################################### - -def fviz_famd_ind(self, - axis=[0,1], - xlim=None, - ylim=None, - title =None, - color ="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - point_size = 1.5, - text_size = 8, - text_type = "text", - marker = "o", - add_grid =True, - ind_sup=True, - color_sup = "red", - marker_sup = "^", - add_ellipse=False, - ellipse_type = "t", - confint_level = 0.95, - geom_ellipse = "polygon", - habillage = None, - short_labels=True, - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - - """ - - """ - - if self.model_ != "famd": - raise ValueError("Error : 'self' must be an instance of class FAMD.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - coord = pd.DataFrame(self.row_coord_,index = self.row_labels_,columns=self.dim_index_) - - # Add categorical supplementary variables - if 
self.quali_sup_labels_ is not None: - coord[self.quali_sup_labels] = self.data_[self.quali_sup_labels_] - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_labels_)) - - if color == "cos2": - limits = [0,1] - legend_title = "cos2" - midpoint = 0.5 - c = np.sum(self.row_cos2_[:,axis],axis=1) - elif color == "contrib": - midpoint = 50 - limits = [0,100] - legend_title = "Contrib" - c = np.sum(self.row_contrib_[:,axis],axis=1) - - if habillage is None : - # Using cosine and contributions - if color in ["cos2","contrib"]: - # Add gradients colors - p = p + pn.geom_point(pn.aes(colour=c),shape=marker,size=point_size,show_legend=False) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits, - name = legend_title) - if repel : - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': "black","lw":1.0}}) - else: - p = p + text_label(text_type,pn.aes(color=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - else: - if self.quali_sup_labels_ is not None: - p = p + pn.geom_point(pn.aes(color = habillage,linetype = habillage),size=point_size,shape=marker) - if repel: - p = p + text_label(text_type,pn.aes(color=habillage),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->',"color":"black","lw":1.0}}) - else: - p = p + text_label(text_type,pn.aes(color=habillage),size=text_size,va=va,ha=ha) - - if add_ellipse: - p = p + pn.geom_point(pn.aes(color = habillage)) - p = p + pn.stat_ellipse(geom=geom_ellipse,mapping=pn.aes(fill=habillage),type = ellipse_type,alpha = 0.25,level=confint_level) - - if ind_sup: - if self.row_sup_labels_ is not None: - sup_coord = pd.DataFrame(self.row_sup_coord_,index=self.row_sup_labels_,columns=self.dim_index_) - - p = p + pn.geom_point(sup_coord,pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_sup_labels_), - color = color_sup,shape = marker_sup,size=point_size) - if repel: - p = p + text_label(text_type,data=sup_coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,"lw":1.0}}) - else: - p = p + text_label(text_type,data=sup_coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_sup_labels_), - color = color_sup,size=text_size,va=va,ha=ha) - if self.quali_sup_labels_ is not None: - if habillage is None: - if short_labels: - mod_sup_labels = self.short_sup_labels_ - else: - mod_sup_labels = self.mod_sup_labels_ - - mod_sup_coord = pd.DataFrame(self.mod_sup_coord_,columns=self.dim_index_) - - p = p + pn.geom_point(mod_sup_coord,pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels),color="red",size=point_size) - - if repel: - p = p + text_label(text_type,data=mod_sup_coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels), - color="red",size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': "black","lw":1.0}}) - else: - p = p + 
text_label(text_type,data=mod_sup_coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels), - color ="red",size=text_size,va=va,ha=ha) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Individuals factor map - FAMD" - - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - - if add_vline: - p = p+ pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -def fviz_famd_col(self, - axis=[0,1], - title =None, - color ="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - text_type = "text", - text_size = 8, - add_grid =True, - quanti_sup=True, - color_sup = "red", - add_hline = True, - add_vline=True, - ha="center", - va="center", - limits = None, - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - add_circle = True, - ggtheme=pn.theme_gray()) -> plt: - - """ - - - """ - - if self.model_ != "famd": - raise ValueError("Error : 'self' must be an instance of class FAMD.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - coord = pd.DataFrame(self.col_coord_,index = self.col_labels_,columns=self.dim_index_) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.col_labels_)) - - if color == "cos2": - legend_title = "cos2" - midpoint = 0.5 - c = np.sum(self.col_cos2_[:,axis],axis=1) - if limits is None: - limits = list([np.min(c),np.max(c)]) - elif color == "contrib": - midpoint = 50 - legend_title = "Contrib" - c = np.sum(self.col_contrib_[:,axis],axis=1) - if limits is None: - limits = list([np.min(c),np.max(c)]) - - if color in ["cos2","contrib"]: - # Add gradients colors - p = p + pn.geom_segment(pn.aes(x=0,y=0,xend=f"Dim.{axis[0]+1}",yend=f"Dim.{axis[1]+1}",colour=c), arrow = pn.arrow()) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits,name = legend_title) - if repel: - p = p + text_label(text_type,mappping=pn.aes(colour=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': 'black',"lw":1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(colour=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_segment(pn.aes(x=0,y=0,xend=f"Dim.{axis[0]+1}",yend=f"Dim.{axis[1]+1}"), arrow = pn.arrow(),color=color) - if repel: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha,adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - if quanti_sup: - if self.quanti_sup_labels_ is not None: - sup_coord = pd.DataFrame(self.col_sup_coord_,columns=self.dim_index_,index=self.col_sup_labels_) - 
p = p + pn.geom_segment(pn.aes(x=0,y=0,xend=sup_coord.iloc[:,axis[0]],yend=sup_coord.iloc[:,axis[1]]),arrow = pn.arrow(),color=color_sup) - if repel: - p = p + text_label(text_type,mapping=pn.aes(x=sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.col_sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha,adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,"lw":1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(x=sup_coord.iloc[:,axis[0]],y=sup_coord.iloc[:,axis[1]],label=self.col_sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha) - - # Create circle - if add_circle: - p = p + gg_circle(r=1.0, xc=0.0, yc=0.0, color="black", fill=None) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Continuous variables factor map - FAMD" - - p = p + pn.xlim((-1,1))+ pn.ylim((-1,1))+ pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p+ pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -# Graph for categories -def fviz_famd_mod(self, - axis=[0,1], - xlim=None, - ylim=None, - title =None, - color ="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - point_size = 1.5, - text_size = 8, - text_type = "text", - marker = "o", - add_grid =True, - quali_sup = True, - color_sup = "red", - marker_sup = "^", - short_labels=True, - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - - """ - - - """ - - if self.model_ != "famd": - raise ValueError("Error : 'self' must be an instance of class FAMD.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - # Initialize - coord = pd.DataFrame(self.mod_coord_,index = self.mod_labels_,columns=self.dim_index_) - - # Categories labels - if short_labels: - labels = self.short_labels_ - else: - labels = self.mod_labels_ - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=labels)) - - if color == "cos2": - limits = [0,1] - legend_title = "cos2" - midpoint = 0.5 - c = np.sum(self.mod_cos2_[:,axis],axis=1) - elif color == "contrib": - midpoint = 50 - limits = [0,100] - legend_title = "Contrib" - c = np.sum(self.mod_contrib_[:,axis],axis=1) - - # Using cosine and contributions - if color in ["cos2","contrib"]: - # Add gradients colors - p = p + pn.geom_point(pn.aes(colour=c),shape=marker,size=point_size,show_legend=False) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits, - name = legend_title) - if repel : - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': "black","lw":1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha) - else: - p = p + 
pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - # Add supplementary categories - if quali_sup: - if self.quali_sup_labels_ is not None: - if short_labels: - mod_sup_labels = self.short_sup_labels_ - else: - mod_sup_labels = self.mod_sup_labels_ - - mod_sup_coord = pd.DataFrame(self.mod_sup_coord_,columns=self.dim_index_,index=mod_sup_labels) - - p = p + pn.geom_point(data=mod_sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels), - color=color_sup,size=point_size,shape=marker_sup) - - if repel: - p = p + text_label(text_type,data=mod_sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels), - color=color_sup,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,"lw":1.0}}) - else: - p = p + text_label(text_type,data=mod_sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=mod_sup_labels), - color=color_sup,size=text_size,va=va,ha=ha) - - # Add additional elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Qualitative variable categories - FAMD" - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p + pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - - -def fviz_famd_var(self, - axis=[0,1], - xlim=(0,1), - ylim=(0,1), - title=None, - color="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - point_size = 1.5, - text_size = 8, - marker="o", - text_type="text", - add_grid =True, - add_hline = True, - add_vline =True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - - """ - Draw the variables map (continuous and categorical variables) for a Factor Analysis of Mixed Data (FAMD). - """ - - if self.model_ != "famd": - raise ValueError("Error : 'self' must be an instance of class FAMD.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - # Initialize - col_cos2 = pd.DataFrame(self.col_cos2_[:,axis],index = self.col_labels_,columns=[self.dim_index_[i] for i in axis]) - var_eta2 = pd.DataFrame(self.var_eta2_[:,axis],index = self.quali_labels_,columns=[self.dim_index_[i] for i in axis]) - coord = pd.concat([col_cos2,var_eta2],axis=0) - labels = self.col_labels_ + self.quali_labels_ - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=labels)) - - if color == "contrib": - midpoint = 50 - limits = [0,100] - legend_title = "Contrib" - contrib = np.append(self.col_contrib_[:,axis],self.var_contrib_[:,axis],axis=0) - c = np.sum(contrib,axis=1) - - # Using contributions
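# Note (editorial): on this variables map, continuous variables are placed by their squared cosines (cos2) and categorical variables by their squared correlation ratios (eta2), so every point lies in the unit square [0,1] x [0,1].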
if color == "contrib": - # Add gradients colors - p = p + pn.geom_point(pn.aes(colour=c),shape=marker,size=point_size,show_legend=False) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits, - name = legend_title) - if repel : - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': "black","lw":1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - # Add additional elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Graph of variables - FAMD" - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p + pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - - -def fviz_famd(self,choice="ind",**kwargs)->plt: - """ - Draw the Factor Analysis of Mixed Data (FAMD) graphs : 'ind' (individuals), 'col' (continuous variables), 'mod' (categories) or 'var' (variables). - """ - if choice == "ind": - return fviz_famd_ind(self,**kwargs) - elif choice == "col": - return fviz_famd_col(self,**kwargs) - elif choice == "mod": - return fviz_famd_mod(self,**kwargs) - elif choice == "var": - return fviz_famd_var(self,**kwargs) - else: - raise ValueError("Error : Allowed values are 'ind', 'col', 'mod' or 'var'.") - - -###################################################################################################################### -# Classical Multidimensional Scaling (CMDSCALE) -###################################################################################################################### - -def fviz_cmds(self, - axis=[0,1], - text_type = "text", - point_size = 1.5, - text_size = 8, - xlim=None, - ylim=None, - title =None, - xlabel=None, - ylabel=None, - color="blue", - color_sup ="red", - marker="o", - marker_sup = "^", - add_sup = True, - add_grid =True, - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - """ - Draw the Classical Multidimensional Scaling (CMDSCALE) map of individuals. - """ - - if self.model_ != "cmds": - raise ValueError("Error : 'self' must be an instance of class CMDSCALE.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - coord = pd.DataFrame(self.coord_,index = self.labels_,columns=self.dim_index_) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.labels_)) - - # Add point - p = p + 
pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - if add_sup: - if self.sup_labels_ is not None: - sup_coord = pd.DataFrame(self.sup_coord_, index= self.sup_labels_,columns=self.dim_index_) - p = p + pn.geom_point(data=sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.sup_labels_), - color=color_sup,size=point_size,shape=marker_sup) - - if repel: - p = p + text_label(text_type,data=sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,"lw":1.0}}) - else: - p = p + text_label(text_type,data=sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha) - - if title is None: - title = "Classical multidimensional scaling (PCoA, Principal Coordinates Analysis)" - - # Add elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p + pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - p = p + ggtheme - return p - -###################################################################################################################### -# Metric and Non-Metric Multidimensional Scaling (MDS) -###################################################################################################################### - -def fviz_mds(self, - axis=[0,1], - text_type = "text", - point_size = 1.5, - text_size = 8, - xlim=None, - ylim=None, - title =None, - xlabel=None, - ylabel=None, - color="blue", - color_sup ="red", - marker="o", - marker_sup = "^", - add_sup = True, - add_grid =True, - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - """ - Draw the metric or non-metric Multidimensional Scaling (MDS) map of individuals. - """ - - if self.model_ != "mds": - raise ValueError("Error : 'self' must be an instance of class MDS.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - coord = pd.DataFrame(self.coord_,index = self.labels_,columns=self.dim_index_) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.labels_)) - - # Add point - p = p + pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': 
color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - if add_sup: - if self.sup_labels_ is not None: - sup_coord = pd.DataFrame(self.sup_coord_, index= self.sup_labels_,columns=self.dim_index_) - p = p + pn.geom_point(data=sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.sup_labels_), - color=color_sup,size=point_size,shape=marker_sup) - if repel: - p = p + text_label(text_type,data=sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color_sup,"lw":1.0}}) - else: - p = p + text_label(text_type,data=sup_coord,mapping=pn.aes(x=f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.sup_labels_), - color=color_sup,size=text_size,va=va,ha=ha) - - if title is None: - title = self.title_ - - p = p + pn.ggtitle(title) - if xlabel is not None: - p = p + pn.xlab(xlab=xlabel) - if ylabel is not None: - p = p + pn.ylab(ylab=ylabel) - - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p + pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - p = p + ggtheme - return p - -# Shepard Diagram -def fviz_shepard(self, - xlim=None, - ylim=None, - color="blue", - title=None, - xlabel=None, - ylabel=None, - add_grid=True, - ggtheme=pn.theme_gray())-> plt: - """Computes the Shepard plot - - - """ - - if self.model_ not in ["cmds","mds"]: - raise ValueError("Error : 'Method' is allowed only for multidimensional scaling.") - - coord = pd.DataFrame({"InDist": self.dist_[np.triu_indices(self.nobs_, k = 1)], - "OutDist": self.res_dist_[np.triu_indices(self.nobs_, k = 1)]}) - - p = pn.ggplot(coord,pn.aes(x = "InDist",y = "OutDist"))+pn.geom_point(color=color) - - if xlabel is None: - xlabel = "Input Distances" - if ylabel is None: - ylabel = "Output Distances" - if title is None: - title = "Shepard Diagram" - - p = p + pn.ggtitle(title)+pn.xlab(xlabel)+pn.ylab(ylabel) - - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - return p+ ggtheme - -########################################################################################################## -###### Principal Components Analysis with partial correlation matrix (PartialPCA) -########################################################################################################### - -def fviz_ppca_ind(self, - axis=[0,1], - xlim=None, - ylim=None, - title =None, - color ="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - point_size = 1.5, - text_size = 8, - text_type = "text", - marker = "o", - add_grid =True, - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - - """ - - """ - - 
if self.model_ != "ppca": - raise ValueError("Error : 'self' must be an instance of class PartialPCA.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - coord = pd.DataFrame(self.row_coord_,index = self.row_labels_,columns=self.dim_index_) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_labels_)) - - if color == "cos2": - limits = [0,1] - legend_title = "cos2" - midpoint = 0.5 - c = np.sum(self.row_cos2_[:,axis],axis=1) - elif color == "contrib": - midpoint = 50 - limits = [0,100] - legend_title = "Contrib" - c = np.sum(self.row_contrib_[:,axis],axis=1) - - - # Using cosine and contributions - if color in ["cos2","contrib"]: - # Add gradients colors - p = p + pn.geom_point(pn.aes(colour=c),shape=marker,size=point_size,show_legend=False) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits, - name = legend_title) - if repel : - p = p + text_label(text_type,mapping=pn.aes(color=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': "black","lw":1.0}}) - else: - p = p + text_label(text_type,pn.aes(color=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Individuals factor map - Partial PCA" - - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - - if add_vline: - p = p+ pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -def fviz_ppca_var(self, - axis=[0,1], - title =None, - color ="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - text_type = "text", - text_size = 8, - add_grid =True, - quanti_sup=True, - color_sup = "red", - add_hline = True, - add_vline=True, - ha="center", - va="center", - limits = None, - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - add_circle = True, - ggtheme=pn.theme_gray()) -> plt: - - """ - - - """ - - if self.model_ != "ppca": - raise ValueError("Error : 'self' must be an instance of class PartialPCA.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - coord = pd.DataFrame(self.col_coord_,index = self.col_labels_,columns=self.dim_index_) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = 
f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.col_labels_)) - - if color == "cos2": - legend_title = "cos2" - midpoint = 0.5 - c = np.sum(self.col_cos2_[:,axis],axis=1) - if limits is None: - limits = list([np.min(c),np.max(c)]) - elif color == "contrib": - midpoint = 50 - legend_title = "Contrib" - c = np.sum(self.col_contrib_[:,axis],axis=1) - if limits is None: - limits = list([np.min(c),np.max(c)]) - - if color in ["cos2","contrib"]: - # Add gradients colors - p = p + pn.geom_segment(pn.aes(x=0,y=0,xend=f"Dim.{axis[0]+1}",yend=f"Dim.{axis[1]+1}",colour=c), arrow = pn.arrow()) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits,name = legend_title) - if repel: - p = p + text_label(text_type,mappping=pn.aes(colour=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': 'black','lw':1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(colour=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_segment(pn.aes(x=0,y=0,xend=f"Dim.{axis[0]+1}",yend=f"Dim.{axis[1]+1}"), arrow = pn.arrow(),color=color) - if repel: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha,adjust_text={'arrowprops': {'arrowstyle': '->','color': color,'lw':1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - # Create circle - if add_circle: - p = p + gg_circle(r=1.0, xc=0.0, yc=0.0, color="black", fill=None) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Variables factor map - Partial PCA" - - p = p + pn.xlim((-1,1))+ pn.ylim((-1,1))+ pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p+ pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -def fviz_ppca(self,choice="ind",**kwargs)->plt: - """ - - """ - - if choice == "ind": - return fviz_ppca_ind(self,**kwargs) - elif choice == "var": - return fviz_ppca_var(self,**kwargs) - else: - raise ValueError("Error : Allowed values are 'ind' or 'var'.") - -################################################################################################################################### -# Exploratory Factor Analysis (EFA) -################################################################################################################################# - -def fviz_efa_ind(self, - axis=[0,1], - xlim=None, - ylim=None, - title =None, - color ="blue", - point_size = 1.5, - text_size = 8, - text_type = "text", - marker = "o", - add_grid =True, - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ggtheme=pn.theme_gray()) -> plt: - - """ - - """ - - if self.model_ != "efa": - raise ValueError("Error : 'self' must be an instance of class EFA.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - coord = pd.DataFrame(self.row_coord_,index = 
self.row_labels_,columns=self.dim_index_) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.row_labels_)) - - p = p + pn.geom_point(color=color,shape=marker,size=point_size,show_legend=False) - if repel : - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,'lw':1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - # Add additionnal - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Individuals factor map - EFA" - - if ((xlim is not None) and ((isinstance(xlim,list) or (isinstance(xlim,tuple))))): - p = p + pn.xlim(xlim) - if ((ylim is not None) and ((isinstance(ylim,list) or (isinstance(ylim,tuple))))): - p = p + pn.ylim(ylim) - - p = p + pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - - if add_vline: - p = p+ pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -def fviz_efa_var(self, - axis=[0,1], - title =None, - color ="blue", - gradient_cols = ("#00AFBB", "#E7B800", "#FC4E07"), - text_type = "text", - text_size = 8, - add_grid =True, - add_hline = True, - add_vline=True, - ha="center", - va="center", - limits = None, - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - add_circle = True, - ggtheme=pn.theme_gray()) -> plt: - - """ - - - """ - - if self.model_ != "efa": - raise ValueError("Error : 'self' must be an instance of class EFA.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - coord = pd.DataFrame(self.col_coord_,index = self.col_labels_,columns=self.dim_index_) - - # Initialize - p = pn.ggplot(data=coord,mapping=pn.aes(x = f"Dim.{axis[0]+1}",y=f"Dim.{axis[1]+1}",label=self.col_labels_)) - - if color == "contrib": - midpoint = 50 - legend_title = "Contrib" - c = np.sum(self.col_contrib_[:,axis],axis=1) - if limits is None: - limits = list([np.min(c),np.max(c)]) - - if color == "contrib": - # Add gradients colors - p = p + pn.geom_segment(pn.aes(x=0,y=0,xend=f"Dim.{axis[0]+1}",yend=f"Dim.{axis[1]+1}",colour=c), arrow = pn.arrow()) - p = p + pn.scale_color_gradient2(low = gradient_cols[0],high = gradient_cols[2],mid = gradient_cols[1],midpoint=midpoint,limits = limits,name = legend_title) - if repel: - p = p + text_label(text_type,mapping=pn.aes(colour=c),size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': 'black','lw':1.0}}) - else: - p = p + text_label(text_type,mapping=pn.aes(colour=c),size=text_size,va=va,ha=ha) - else: - p = p + pn.geom_segment(pn.aes(x=0,y=0,xend=f"Dim.{axis[0]+1}",yend=f"Dim.{axis[1]+1}"), arrow = pn.arrow(),color=color) - if repel: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha, - adjust_text={'arrowprops': {'arrowstyle': '->','color': color,"lw":1.0}}) - else: - p = p + text_label(text_type,color=color,size=text_size,va=va,ha=ha) - - # Create circle - if add_circle: - p = p + 
gg_circle(r=1.0, xc=0.0, yc=0.0, color="black", fill=None) - - # Add additional elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - - if title is None: - title = "Variables factor map - EFA" - - p = p + pn.xlim((-1,1))+ pn.ylim((-1,1))+ pn.ggtitle(title)+ pn.xlab(xlab=xlabel)+pn.ylab(ylab=ylabel) - - if add_hline: - p = p + pn.geom_hline(yintercept=0, colour=hline_color, linetype =hline_style) - if add_vline: - p = p+ pn.geom_vline(xintercept=0, colour=vline_color, linetype =vline_style) - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed")) - - # Add theme - p = p + ggtheme - - return p - -def fviz_efa(self,choice="ind",**kwargs)->plt: - """ - Draw the Exploratory Factor Analysis (EFA) graphs : 'ind' (individuals) or 'var' (variables). - """ - - if choice == "ind": - return fviz_efa_ind(self,**kwargs) - elif choice == "var": - return fviz_efa_var(self,**kwargs) - else: - raise ValueError("Error : Allowed values are 'ind' or 'var'.") - - -################################################################################################## -# Visualize the contributions of row/column elements -################################################################################################### - -def fviz_contrib(self, - choice="ind", - axis=None, - xlabel=None, - top_contrib=10, - bar_width=None, - add_grid=True, - color="steelblue", - short_labels=False, - ggtheme=pn.theme_gray()) -> plt: - - """ Plot the row and column contributions graph - - For the selected axis, the graph represents the row or column - contributions sorted in descending order. - - Parameters - ---------- - choice : {'ind','var','mod'}. - 'ind' : individuals - 'var' : continuous/categorical variables - 'mod' : categories - - axis : None or int. - Select the axis for which the row/col contributions are plotted. If None, axis = 0. - - xlabel : None or str (default). - The label text. - - top_contrib : None or int. - Set the maximum number of values to plot. - If top_contrib is None, the 10 largest contributions are plotted. - - bar_width : None, float or array-like. - The width(s) of the bars. - - add_grid : bool or None, default = True. - Whether to show the grid lines. - - color : color or list of color, default = "steelblue". - The colors of the bar faces. 
- - short_labels : bool, default = False - - Returns - ------- - figure : plotnine.ggplot - The contributions bar plot. - """ - - if choice not in ["ind","var","mod"]: - raise ValueError("Error : 'choice' not allowed.") - - if axis is None: - axis = 0 - elif not isinstance(axis,int): - raise ValueError("Error : 'axis' must be an integer.") - elif axis < 0 or axis > self.n_components_ - 1: - raise ValueError(f"Error : 'axis' must be an integer between 0 and {self.n_components_ - 1}.") - - if xlabel is None: - xlabel = "Contributions (%)" - - if bar_width is None: - bar_width = 0.5 - if top_contrib is None: - top_contrib = 10 - elif not isinstance(top_contrib,int): - raise ValueError("Error : 'top_contrib' must be an integer.") - - if choice == "ind": - name = "individuals" - contrib = self.row_contrib_[:,axis] - labels = self.row_labels_ - elif choice == "var" and self.model_ != "mca": - name = "continuous variables" - contrib = self.col_contrib_[:,axis] - labels = self.col_labels_ - if self.model_ == "famd": - contrib = np.append(contrib,self.var_contrib_[:,axis],axis=0) - labels = labels + self.quali_labels_ - elif choice == "mod" and self.model_ in ["mca","famd"]: - name = "categories" - contrib = self.mod_contrib_[:,axis] - if short_labels: - labels = self.short_labels_ - else: - labels = self.mod_labels_ - - n = len(labels) - n_labels = len(labels) - - if (top_contrib is not None) and (top_contrib < n_labels): - n_labels = top_contrib - - limit = n - n_labels - contrib_sorted = np.sort(contrib)[limit:n] - labels_sort = pd.Series(labels)[np.argsort(contrib)][limit:n] - - df = pd.DataFrame({"labels" : labels_sort, "contrib" : contrib_sorted}) - - p = pn.ggplot(df,pn.aes(x = "reorder(labels,contrib)", y = "contrib"))+pn.geom_bar(stat="identity",fill=color,width=bar_width) - - title = f"Contribution of {name} to Dim-{axis+1}" - p = p + pn.ggtitle(title)+pn.xlab(name)+pn.ylab(xlabel) - p = p + pn.coord_flip() - - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed"), - axis_text_x = pn.element_text(angle = 60, ha = "center", va = "center")) - - return p+ggtheme - -################################################################################################## -# Visualize the cosines of row/column elements -################################################################################################### - -def fviz_cosines(self, - choice="ind", - axis=None, - xlabel=None, - top_cos2=10, - bar_width=None, - add_grid=True, - color="steelblue", - short_labels=False, - ggtheme=pn.theme_gray()) -> plt: - - """ Plot the row and column cosines graph - - For the selected axis, the graph represents the row or column - cosines sorted in descending order. - - Parameters - ---------- - choice : {'ind','var','mod','quanti_sup','quali_sup','ind_sup'} - 'ind' : individuals - 'var' : continuous variables - 'mod' : categories - 'quanti_sup' : supplementary continuous variables - 'quali_sup' : supplementary categories variables - 'ind_sup' : supplementary individuals - - axis : None or int - Select the axis for which the row/col cosines are plotted. If None, axis = 0. - - xlabel : None or str (default) - The label text. - - top_cos2 : int - Set the maximum number of values to plot. - If top_cos2 is None, the 10 largest values are plotted. - - bar_width : None, float or array-like. - The width(s) of the bars. - - add_grid : bool or None, default = True. - Whether to show the grid lines - - color : color or list of color, default = "steelblue". - The colors of the bar faces. 
- - short_labels : bool, default = False - - Returns - ------- - figure : plotnine.ggplot - The cos2 bar plot. - """ - - if choice not in ["ind","var","mod","quanti_sup","quali_sup","ind_sup"]: - raise ValueError("Error : 'choice' not allowed.") - - if axis is None: - axis = 0 - elif not isinstance(axis,int): - raise ValueError("Error : 'axis' must be an integer.") - elif axis < 0 or axis > self.n_components_ - 1: - raise ValueError(f"Error : 'axis' must be an integer between 0 and {self.n_components_ - 1}") - - if xlabel is None: - xlabel = "Cos2 - Quality of representation" - if bar_width is None: - bar_width = 0.5 - if top_cos2 is None: - top_cos2 = 10 - - if choice == "ind": - name = "individuals" - cos2 = self.row_cos2_[:,axis] - labels = self.row_labels_ - elif choice == "var" and self.model_ != "mca": - name = "continuous variables" - cos2 = self.col_cos2_[:,axis] - labels = self.col_labels_ - elif choice == "mod" and self.model_ in ["mca","famd"]: - name = "categories" - cos2 = self.mod_cos2_[:,axis] - if short_labels: - labels = self.short_labels_ - else: - labels = self.mod_labels_ - elif choice == "quanti_sup" and self.model_ != "ca": - if ((self.quanti_sup_labels_ is not None) and (len(self.col_sup_labels_) >= 2)): - name = "supplementary continuous variables" - cos2 = self.col_sup_cos2_[:,axis] - labels = self.col_sup_labels_ - else: - raise ValueError("Error : 'quanti_sup' requires at least two supplementary continuous variables.") - elif choice == "quali_sup" and self.model_ !="ca": - if self.quali_sup_labels_ is not None: - name = "supplementary categories" - cos2 = self.mod_sup_cos2_[:,axis] - if short_labels: - labels = self.short_sup_labels_ - else: - labels = self.mod_sup_labels_ - - # Start - n = len(labels) - n_labels = len(labels) - if (top_cos2 is not None) and (top_cos2 < n_labels): - n_labels = top_cos2 - - limit = n - n_labels - cos2_sorted = np.sort(cos2)[limit:n] - labels_sort = pd.Series(labels)[np.argsort(cos2)][limit:n] - - df = pd.DataFrame({"labels" : labels_sort, "cos2" : cos2_sorted}) - - p = pn.ggplot(df,pn.aes(x = "reorder(labels,cos2)", y = "cos2"))+pn.geom_bar(stat="identity",fill=color,width=bar_width) - - title = f"Cos2 of {name} to Dim-{axis+1}" - p = p + pn.ggtitle(title)+pn.xlab(name)+pn.ylab(xlabel) - p = p + pn.coord_flip() - - if add_grid: - p = p + pn.theme(panel_grid_major = pn.element_line(color = "black",size = 0.5,linetype = "dashed"), - axis_text_x = pn.element_text(angle = 60, ha = "center", va = "center")) - - return p+ggtheme - diff --git a/build/lib/scientisttools/manifold.py b/build/lib/scientisttools/manifold.py deleted file mode 100644 index 4d111e1..0000000 --- a/build/lib/scientisttools/manifold.py +++ /dev/null @@ -1,745 +0,0 @@ -# -*- coding: utf-8 -*- - -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from scipy.spatial.distance import pdist,squareform -import warnings -from sklearn.utils import check_symmetric -from scientisttools.utils import sim_dist -from sklearn.base import BaseEstimator, TransformerMixin -from scientisttools.pyplot import plotMDS, plotCMDS -from sklearn import manifold -from mapply.mapply import mapply -from scipy.spatial.distance import euclidean - -#################################################################################" -# SMACOF ALGORITHM -################################################################################### - -def SMACOF( - X, - metric=True, - n_components=2, - proximity ="euclidean", - init=None, - n_init=8, - n_jobs=None, - max_iter=300, - verbose=0, - eps=0.001, - random_state=None, - return_n_iter=False, -): - """Compute multidimensional scaling 
using the SMACOF algorithm. - - The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a - multidimensional scaling algorithm which minimizes an objective function - (the *stress*) using a majorization technique. Stress majorization, also - known as the Guttman Transform, guarantees a monotone convergence of - stress, and is more powerful than traditional techniques such as gradient - descent. - - The SMACOF algorithm for metric MDS can be summarized by the following - steps: - 1. Set an initial start configuration, randomly or not. - 2. Compute the stress - 3. Compute the Guttman Transform - 4. Iterate 2 and 3 until convergence. - The nonmetric algorithm adds a monotonic regression step before computing - the stress. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) or \ - (n_samples, n_samples) - Input data. If ``dissimilarity=='precomputed'``, the input should - be the dissimilarity matrix. - - metric : bool, default=True - Compute metric or nonmetric SMACOF algorithm. - When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as - missing values. - - n_components : int, default=2 - Number of dimensions in which to immerse the dissimilarities. If an - ``init`` array is provided, this option is overridden and the shape of - ``init`` is used to determine the dimensionality of the embedding - space. - - init : ndarray of shape (n_samples, n_components), default=None - Starting configuration of the embedding to initialize the algorithm. By - default, the algorithm is initialized with a randomly chosen array. - - n_init : int, default=8 - Number of times the SMACOF algorithm will be run with different - initializations. The final results will be the best output of the runs, - determined by the run with the smallest final stress. If ``init`` is - provided, this option is overridden and a single run is performed. - - n_jobs : int, default=None - The number of jobs to use for the computation. If multiple - initializations are used (``n_init``), each run of the algorithm is - computed in parallel. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. - - max_iter : int, default=300 - Maximum number of iterations of the SMACOF algorithm for a single run. - - verbose : int, default=0 - Level of verbosity. - - eps : float, default=1e-3 - Relative tolerance with respect to stress at which to declare - convergence. The value of `eps` should be tuned separately depending - on whether or not `normalized_stress` is being used. - - random_state : int, RandomState instance or None, default=None - Determines the random number generator used to initialize the centers. - Pass an int for reproducible results across multiple function calls. - See :term:`Glossary `. - - return_n_iter : bool, default=False - Whether or not to return the number of iterations. - - Returns - ------- - coord : ndarray of shape (n_samples, n_components) - Coordinates of the points in a ``n_components``-space. - - stress : float - The final value of the stress (sum of squared distance of the - disparities and the distances for all constrained points). - If `normalized_stress=True`, and `metric=False` returns Stress-1. - A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good, - 0.1 fair, and 0.2 poor [1]_. - - n_iter : int - The number of iterations corresponding to the best stress. Returned - only if ``return_n_iter`` is set to ``True``. 
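- - Notes - ----- - Editorial sketch (illustrative names, not from the original source) : with a - numeric DataFrame ``df``, - - >>> coord, stress = SMACOF(df, n_components=2, random_state=123) - - returns the two-dimensional embedding and its final stress value.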
- - References - ---------- - .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J. - Psychometrika, 29 (1964) - .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric - hypothesis" Kruskal, J. Psychometrika, 29, (1964) - .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.; - Groenen P. Springer Series in Statistics (1997) - """ - - - if proximity == "euclidean": - dissimilarities = squareform(pdist(X,metric="euclidean")) - elif proximity == "precomputed": - dissimilarities = check_symmetric(X.values, raise_exception=True) - elif proximity == "similarity": - dissimilarities = sim_dist(X) - - smacof_res = manifold.smacof( - dissimilarities = dissimilarities, - metric = metric, - n_components = n_components, - init = init, - n_init = n_init, - n_jobs = n_jobs, - max_iter = max_iter, - verbose = verbose, - eps = eps, - random_state = random_state, - return_n_iter = return_n_iter, - normalized_stress = "auto") - - if return_n_iter: - return smacof_res[0], smacof_res[1], smacof_res[2] - else: - return smacof_res[0], smacof_res[1] - - - - - - -################################################################################ -# CLASSICAL MULTIDIMENSIONAL SCALING (CMDSCALE) -############################################################################### - -class CMDSCALE(BaseEstimator,TransformerMixin): - """Classical Multidimensional Scaling (CMDSCALE) - - This is a classical multidimensional scaling also - known as Principal Coordinates Analysis (PCoA). - - Performs Classical Multidimensional Scaling (MDS) with supplementary - rows points. - - Parameters - ---------- - n_components : int, default=None - Number of dimensions in which to immerse the dissimilarities. - - labels : list of string, default : None - Labels for the rows. - - sup_labels : list of string, default = None - Labels of supplementary rows. - - proximity : {'euclidean','precomputed','similarity'}, default = 'euclidean' - Dissimilarity measure to use : - - 'euclidean': - Pairwise Euclidean distances between points in the dataset - - - 'precomputed': - Pre-computed dissimilarities are passed directly to ``fit`` and ``fit_transform``. - - - `similarity`: - The similarity matrix is transformed into a dissimilarity matrix before being passed to ``fit`` and ``fit_transform``. - - normalized_stress : bool, default = True - Whether to use and return the normed stress value (Stress-1) instead of the raw - stress calculated by default. - - graph : bool, default = True - if True a graph is displayed - - figsize : tuple of int, default = None - Width, height in inches. - - Attributes - ---------- - n_components_ : int - The estimated number of components. - - labels_ : array of strings - Labels for the rows. - - nobs_ : int - Number of rows. - - dist_ : ndarray of shape (n_rows, n_rows) - Euclidean distances matrix. - - eig_ : array of float - A 4 x n_components_ matrix containing all the eigenvalues - (1st row), difference (2nd row) the percentage of variance (3rd row) and the - cumulative percentage of variance (4th row). - - eigen_vector_ : array of float: - A matrix containing eigenvectors - - coord_ : ndarray of shape (n_rows,n_components_) - A n_rows x n_components_ matrix containing the row coordinates. - - res_dist_ : ndarray of shape (n_rows,n_rows) - A n_rows x n_rows matrix containing the distances based on coordinates. 
- - stress_ : float - The final stress value (normalized if normalized_stress=True). - - inertia_ : float - The total inertia. - - dim_index_ : list of strings - Labels of the retained dimensions. - - centering_matrix_ : ndarray of shape (n_rows, n_rows) - The centering matrix used to double-center the distances. - - model_ : string - The model fitted = 'cmds' - - """ - def __init__(self, - n_components=None, - labels = None, - sup_labels = None, - proximity="euclidean", - normalized_stress=True, - graph=True, - figsize=None): - self.n_components = n_components - self.labels = labels - self.sup_labels = sup_labels - self.proximity = proximity - self.normalized_stress = normalized_stress - self.graph = graph - self.figsize = figsize - - def fit(self,X,y=None): - """Fit the model to X - - Parameters - ---------- - X : DataFrame of float, shape (n_rows, n_columns) - - y : None - y is ignored - - Returns: - -------- - self : object - Returns the instance itself - """ - - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - # Extract supplementary data - self.sup_labels_ = self.sup_labels - if self.sup_labels_ is not None: - _X = X.drop(index = self.sup_labels_) - row_sup = X.loc[self.sup_labels_,:] - else: - _X = X - - self.data_ = X - self.active_data_ = _X - - # Initialize - self.sup_coord_ = None - - self.nobs_ = _X.shape[0] - self.centering_matrix_ = np.identity(self.nobs_) - (1/self.nobs_)*np.ones(shape=(self.nobs_,self.nobs_)) - - self._compute_stats(_X) - - if self.graph: - fig, axe = plt.subplots(figsize=self.figsize) - plotCMDS(self,repel=True,ax=axe) - - if self.sup_labels_ is not None: - self.sup_coord_ = self.transform(row_sup) - - return self - - def _is_euclidean(self,X): - """Compute eigenvalues and eigenvectors for a Euclidean configuration - - """ - self.dist_ = squareform(pdist(X,metric="euclidean")) - B = np.dot(np.dot(self.centering_matrix_,X),np.dot(self.centering_matrix_,X).T) - value, vector = np.linalg.eig(B) - return np.real(value), np.real(vector) - - def _is_precomputed(self,X): - """Return eigenvalues and eigenvectors for a precomputed dissimilarity matrix - - """ - self.dist_ = check_symmetric(X.values, raise_exception=True) - A = -0.5*np.multiply(self.dist_,self.dist_) - B = np.dot(self.centering_matrix_,np.dot(A,self.centering_matrix_)) - value, vector = np.linalg.eig(B) - return np.real(value), np.real(vector) - - def _is_similarity(self,X): - """Return eigenvalues and eigenvectors for a similarity matrix - - """ - D = sim_dist(X) - self.dist_ = check_symmetric(D, raise_exception=True) - A = -0.5*np.multiply(self.dist_,self.dist_) - B = np.dot(self.centering_matrix_,np.dot(A,self.centering_matrix_)) - value, vector = np.linalg.eig(B) - return np.real(value), np.real(vector) - - def _compute_stats(self,X): - """Compute statistics - - """ - if X.shape[0] == X.shape[1] and self.proximity != "precomputed": - warnings.warn( - "The ClassicMDS API has changed. ``fit`` now constructs a" - " dissimilarity matrix from data. To use a custom " - "dissimilarity matrix, set " - "``proximity='precomputed'``." 
- ) - - # Compute euclidean - if self.proximity == "euclidean": - eigen_value, eigen_vector = self._is_euclidean(X) - elif self.proximity == "precomputed": - eigen_value, eigen_vector = self._is_precomputed(X) - elif self.proximity == "similarity" : - eigen_value, eigen_vector = self._is_similarity(X) - else: - raise ValueError("Error : You must pass a valid 'proximity'.") - - proportion = 100*eigen_value/np.sum(eigen_value) - difference = np.insert(-np.diff(eigen_value),len(eigen_value)-1,np.nan) - cumulative = np.cumsum(proportion) - - # Set n_components - self.n_components_ = self.n_components - if self.n_components_ is None: - self.n_components_ = (eigen_value > 1e-16).sum() - elif not self.n_components_: - self.n_components_ = self.n_components_ - elif self.n_components_ > self.nobs_: - raise ValueError("Error : You must pass a valid 'n_components'.") - - self.eig_ = np.array([eigen_value[:self.n_components_], - difference[:self.n_components_], - proportion[:self.n_components_], - cumulative[:self.n_components_]]) - - self.eigen_vector_ = eigen_vector[:,:self.n_components_] - - self.coord_ = self.eigen_vector_*np.sqrt(eigen_value[:self.n_components_]) - - self.res_dist_ = squareform(pdist(self.coord_,metric="euclidean")) - - # Compute the stress - if self.normalized_stress: - self.stress_ = np.sqrt(np.sum((self.res_dist_-self.dist_)**2)/np.sum(self.dist_**2)) - else: - self.stress_ = np.sum((self.res_dist_-self.dist_)**2) - - # Inertia - inertia = np.sum(self.dist_**2)/(2*self.nobs_**2) - - self.inertia_ = inertia - self.dim_index_ = ["Dim."+str(x+1) for x in np.arange(0,self.n_components_)] - - # Set labels - self.labels_ = self.labels - if self.labels_ is None: - self.labels_ = ["label_" + str(i+1) for i in np.arange(0,self.nobs_)] - - self.model_ = "cmds" - - def transform(self,X,y=None): - """Apply the Multidimensional Scaling reduction on X - - X is projected on the first axes previously extracted from a training set. - - Parameters - ---------- - X : DataFrame of float, shape (n_rows_sup, n_columns) - New data, where n_row_sup is the number of supplementary - row points and n_columns is the number of columns - X rows correspond to supplementary row points that are - projected on the axes - X is a table containing numeric values - - y : None - y is ignored - - Returns - ------- - X_new : DataFrame of float, shape (n_rows_sup, n_components_) - X_new : coordinates of the projections of the supplementary - row points on the axes. - """ - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. 
For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - - d2 = np.sum(self.dist_**2,axis=1)/self.nobs_ - d3 = np.sum(self.dist_**2)/(self.nobs_**2) - - if self.proximity == "precomputed": - sup_coord = mapply(X,lambda x : -(1/2)*(x**2 - np.sum(x**2)-d2+d3),axis=1,progressbar=False).dot(self.coord_)/self.eig_[0] - elif self.proximity == "euclidean": - n_supp_obs = X.shape[0] - sup_dist = np.zeros((n_supp_obs,self.nobs_)) - for i in np.arange(0,n_supp_obs): - for j in np.arange(0,self.nobs_): - sup_dist[i,j] = euclidean(X.iloc[i,:],self.active_data_.iloc[j,:]) - sup_coord = np.apply_along_axis(arr=sup_dist,axis=1,func1d=lambda x : -(1/2)*(x**2 - np.sum(x**2)-d2+d3)).dot(self.coord_)/self.eig_[0] - elif self.proximity == "similarity": - raise NotImplementedError("Error : This method is not implemented yet.") - - return np.array(sup_coord) - - def fit_transform(self,X,y=None): - """Fit the model with X and apply the dimensionality reduction on X. - - Parameters - ---------- - X : pd.DataFrame, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. - - Returns - ------- - X_new : array-like, shape (n_samples, n_components) - """ - - self.fit(X) - - return self.coord_ - - -############################################################################## -# METRIC & NON METRIC MULTIDIMENSIONAL SCALING (MDS) -############################################################################## - -class MDS(BaseEstimator,TransformerMixin): - """Metric and Non - Metric Multidimensional Scaling (MDS) - - This is a metric and non - metric multidimensional scaling - - Performs metric and non - metric Multidimensional Scaling (MDS) - with supplementary rows points. - - - Parameters - ---------- - n_components : int, default=2 - Number of dimensions in which to immerse the dissimilarities. - - proximity : {'euclidean','precomputed','similarity'}, default = 'euclidean' - Dissmilarity measure to use : - - 'euclidean': - Pairwise Euclidean distances between points in the dataset - - - 'precomputed': - Pre-computed dissimilarities are passed disrectly to ``fit`` and ``fit_transform``. - - - `similarity`: - Similarity matrix is transform to dissimilarity matrix before passed to ``fit`` and ``fit_transform``. - - metric : bool, default=True - If ``True``, perform metric MDS; otherwise, perform nonmetric MDS. - When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as - missing values. - - n_init : int, default=4 - Number of times the SMACOF algorithm will be run with different - initializations. The final results will be the best output of the runs, - determined by the run with the smallest final stress. - - max_iter : int, default=300 - Maximum number of iterations of the SMACOF algorithm for a single run. - - verbose : int, default=0 - Level of verbosity. - - eps : float, default=1e-3 - Relative tolerance with respect to stress at which to declare - convergence. The value of `eps` should be tuned separately depending - on whether or not `normalized_stress` is being used. - - n_jobs : int, default=None - The number of jobs to use for the computation. If multiple - initializations are used (``n_init``), each run of the algorithm is - computed in parallel. - - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors. See :term:`Glossary ` - for more details. 
- - random_state : int, RandomState instance or None, default=None - Determines the random number generator used to initialize the centers. - Pass an int for reproducible results across multiple function calls. - See :term:`Glossary `. - - normalized_stress : bool, default=True - Whether use and return normed stress value (Stress-1) instead of raw - stress calculated by default. - - Attributes - ---------- - coord_ : ndarray of shape (n_samples, n_components) - Stores the position of the dataset in the embedding space. - - stress_ : float - The final value of the stress (sum of squared distance of the - disparities and the distances for all constrained points). - If `normalized_stress=True`, and `metric=False` returns Stress-1. - A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good, - 0.1 fair, and 0.2 poor [1]_. - - dist_ : ndarray of shape (n_samples, n_samples) - Pairwise dissimilarities between the points. Symmetric matrix that: - - - either uses a custom dissimilarity matrix by setting `proximity` - to 'precomputed'; - - or constructs a dissimilarity matrix from data using - Euclidean distances. - - res_dist_ : ndarray of shape (n_samples, n_samples) - Pairwise dissimilarities between the points based of coordinates - - model_ : string - The model fitted = 'mds' - - """ - def __init__(self, - n_components=2, - proximity ='euclidean', - metric=True, - n_init=4, - max_iter=300, - verbose=0, - eps=1e-3, - n_jobs=None, - random_state=None, - labels = None, - sup_labels = None, - normalized_stress=True, - graph =True, - figsize=(10,10)): - self.n_components = n_components - self.proximity = proximity - self.metric = metric - self.n_init = n_init - self.max_iter = max_iter - self.verbose = verbose - self.eps = eps - self.n_jobs = n_jobs - self.random_state = random_state - self.labels = labels - self.sup_labels = sup_labels - self.normalized_stress =normalized_stress - self.graph = graph - self.figsize = figsize - - def fit(self,X,y=None, init=None): - """Fit the model to X - - Parameters - ---------- - X : DataFrame of float, shape (n_rows, n_columns) - - y : None - y is ignored - - Returns: - -------- - self : object - Returns the instance itself - """ - - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. 
For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - # Extract supplementary data - self.sup_labels_ = self.sup_labels - if self.sup_labels_ is not None: - _X = X.drop(index = self.sup_labels_) - row_sup = X.loc[self.sup_labels_,:] - else: - _X = X - - self.active_data_ = _X - self.data_ = X - - self.nobs_ = _X.shape[0] - - - if self.proximity == "euclidean": - self.dist_ = squareform(pdist(_X,metric="euclidean")) - elif self.proximity == "precomputed": - self.dist_ = check_symmetric(_X.values, raise_exception=True) - elif self.proximity == "similarity": - self.dist_ = sim_dist(_X) - - # Set labels - self.labels_ = self.labels - if self.labels_ is None: - self.labels_ = ["label_"+str(i+1) for i in range(0,self.nobs_)] - - if self.metric: - self.title_ = "Metric multidimensional scaling (mMDS)" - else: - self.title_ = "Non-metric multidimensional scaling (NMDS)" - - self.fit_transform(_X,init=init) - - self.res_dist_ = squareform(pdist(self.coord_,metric="euclidean")) - - # Compute the stress - if self.normalized_stress: - self.stress_ = np.sqrt(np.sum((self.res_dist_-self.dist_)**2)/np.sum(self.dist_**2)) - else: - self.stress_ = np.sum((self.res_dist_-self.dist_)**2) - - self.model_ = "mds" - - if self.graph: - fig, axe = plt.subplots(figsize=self.figsize) - plotMDS(self,repel=True,ax=axe) - - return self - - def fit_transform(self,X, y=None, init=None): - """Fit the model with X and apply the dimensionality reduction on X. - - Parameters - ---------- - X : pd.DataFrame, shape (n_samples, n_features) - New data, where n_samples is the number of samples - and n_features is the number of features. - - Returns - ------- - X_new : array-like, shape (n_samples, n_components) - """ - - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - - self.coord_, self.stress_, self.n_iter_ = SMACOF( - X=X, - metric=self.metric, - n_components=self.n_components, - proximity = self.proximity, - init=init, - n_init=self.n_init, - n_jobs=self.n_jobs, - max_iter=self.max_iter, - verbose=self.verbose, - eps=self.eps, - random_state=self.random_state, - return_n_iter=True, - ) - # Set n_components - self.n_components_ = self.n_components - if self.n_components_ is None: - self.n_components_ = self.coord_.shape[1] - self.dim_index_ = ["Dim."+str(x+1) for x in np.arange(0,self.n_components_)] - - return self.coord_ - - def transform(self,X,y=None): - """Apply the Multidimensional Scaling reduction on X - - X is projected on the first axes previously extracted from a training set. - - Parameters - ---------- - X : DataFrame of float, shape (n_rows_sup, n_columns) - New data, where n_row_sup is the number of supplementary - row points and n_columns is the number of columns - X rows correspond to supplementary row points that are - projected on the axes - X is a table containing numeric values - - y : None - y is ignored - - Returns - ------- - X_new : DataFrame of float, shape (n_rows_sup, n_components_) - X_new : coordinates of the projections of the supplementary - row points on the axes. - """ - if not isinstance(X,pd.DataFrame): - raise TypeError( - f"{type(X)} is not supported. Please convert to a DataFrame with " - "pd.DataFrame. 
For more information see: " - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html") - - raise NotImplementedError("Error : This method is not implemented yet.") diff --git a/build/lib/scientisttools/pyplot.py b/build/lib/scientisttools/pyplot.py deleted file mode 100644 index 2d3c22d..0000000 --- a/build/lib/scientisttools/pyplot.py +++ /dev/null @@ -1,1997 +0,0 @@ -# -*- coding: utf-8 -*- - -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import matplotlib.cm as cm -import matplotlib.colors as mcolors -from adjustText import adjust_text -import scipy.stats as st -from matplotlib.patches import Ellipse -import math -import random - -################################################################### -# CORRESPONDENCE ANALYSIS (CA) -#################################################################### - -def plotCA(self, - choice ="row", - axis=(0,1), - xlim=None, - ylim=None, - title=None, - color="blue", - marker="o", - add_grid =True, - add_sup=False, - color_sup = "red", - marker_sup ="^", - color_map ="jet", - add_hline = True, - add_vline=True, - arrow = False, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel = False, - ax=None)->plt: - - """ Plot the factor map for rows and columns - - Parameters - ---------- - self : an instance of class CA - choice : str - axis : tuple or list of two elements - xlim : tuple or list of two elements - ylim : tuple of list of two elements - title : str - color : str - marker : str - The marker style for active points - add_grid : bool - add_sup : bool - color_sup : str - The marker colors for supplementary points - marker_sup : str - The marker style for supplementary points - color_map : str - add_hline : bool - add_vline : bool - ha : horizontalalignment : {'left','center','right'} - va : verticalalignment {"bottom","baseline","center","center_baseline","top"} - hline_color : - hline_style : - vline_color : - vline_style : - ax : - - Returns - ------- - None - """ - - if self.model_ != "ca": - raise ValueError("Error : 'self' must be an instance of class CA.") - - if choice not in ["row","col"]: - raise ValueError("Error : 'choice' must be one of 'row' or 'col'.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - if ax is None: - ax = plt.gca() - - if choice == "row": - coord = self.row_coord_[:,axis] - cos2 = self.row_cos2_[:,axis] - contrib = self.row_contrib_[:,axis] - labels = self.row_labels_ - if title is None: - title = "Row points - CA" - if add_sup: - if self.row_sup_labels_ is not None: - sup_labels = self.row_sup_labels_ - sup_coord = self.row_sup_coord_[:,axis] - else: - coord = self.col_coord_[:,axis] - cos2 = self.col_cos2_[:,axis] - contrib = self.col_contrib_[:,axis] - labels = self.col_labels_ - if title is None: - title = "Columns points - CA" - if add_sup: - if self.col_sup_labels_ is not None: - sup_labels = self.col_sup_labels_ - sup_coord = self.col_sup_coord_[:,axis] - - - # Extract coordinates (coord already holds only the two requested axes) - xs = coord[:,0] - ys = coord[:,1] - - if color == "cos2": - c = np.sum(cos2,axis=1) - elif color == "contrib": - c = np.sum(contrib,axis=1) - - if color in ["cos2","contrib"]: - cNorm = mcolors.Normalize(vmin=np.min(c), vmax=np.max(c)) - scalarMap = cm.ScalarMappable(norm=cNorm,cmap=plt.get_cmap(color_map)) - p = ax.scatter(xs,ys,c=c,s=len(c),marker=marker,cmap=plt.get_cmap(color_map)) - 
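# The ScalarMappable built above maps each point's cos2/contrib value onto the colormap; it is reused below to colour the labels, while the colorbar exposes the shared scale.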
plt.colorbar(p).ax.set_title(label=color,weight='bold') - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[i]) - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal)) - if arrow: - ax.arrow(0,0,xs[i],ys[i],length_includes_head=True,color=colorVal) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[i]) - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal) - if arrow: - ax.arrow(0,0,xs[i],ys[i],length_includes_head=True,color=colorVal) - else: - ax.scatter(xs,ys,c=color,marker=marker) - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color)) - if arrow: - ax.arrow(0,0,xs[i],ys[i],length_includes_head=True,color=color) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color) - if arrow: - ax.arrow(0,0,xs[i],ys[i],length_includes_head=True,color=color) - if add_sup: - xxs = sup_coord - # Reset xlim and ylim - xxs = sup_coord[:,axis[0]] - yys = sup_coord[:,axis[1]] - # Add supplementary row coordinates - ax.scatter(xxs,yys,c=color_sup,marker=marker_sup) - if repel: - texts = list() - for i,lab in enumerate(sup_labels): - texts.append(ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup)) - adjust_text(texts,x=xxs,y=yys,arrowprops=dict(arrowstyle="->",color=color_sup,lw=1.0),ax=ax) - else: - for i,lab in enumerate(sup_labels): - ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup) - - # Add elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - ax.grid(visible=add_grid) - ax.set(xlabel=xlabel,ylabel=ylabel,title=title,xlim=xlim,ylim=ylim) - # Add horizontal and vertical lines - if add_hline: - ax.axhline(y=0,color=hline_color,linestyle=hline_style) - if add_vline: - ax.axvline(x=0,color=vline_color,linestyle=vline_style) - - -##################################################################################### -# -#################################################################################### -# -*- coding: utf-8 -*- - -def plotCMDS(self, - axis=[0,1], - xlim=(None,None), - ylim=(None,None), - title =None, - color="blue", - marker="o", - add_grid =True, - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ax=None) -> plt: - - if self.model_ != "cmds": - raise ValueError("Error : 'self' must be an instance of class CMDSCALE.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - if ax is None: - ax = plt.gca() - - xs = self.coord_[:,axis[0]] - ys = self.coord_[:,axis[1]] - ax.scatter(xs,ys,color=color,marker=marker) - if repel: - texts =list() - for i,lab in enumerate(self.labels_): - texts.append(ax.text(xs[i],ys[i],lab,color=color,ha=ha,va=va)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i,lab in enumerate(self.labels_): - ax.text(xs[i],ys[i],lab,color=color,ha=ha,va=va) - - if title is None: - title = "Classical multidimensional scaling (PCoA, Principal 
Coordinates Analysis)" - - # Add elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - ax.grid(visible=add_grid) - ax.set(xlabel=xlabel,ylabel=ylabel,title=title,xlim=xlim,ylim=ylim) - if add_hline: - ax.axhline(y=0,color=hline_color,linestyle=hline_style) - if add_vline: - ax.axvline(x=0,color=vline_color,linestyle=vline_style) - -##################################################################################################### -# PLOT CONTRIBUTIONS -##################################################################################################### - -def plot_contrib(self, - choice="ind", - axis=None, - xlabel=None, - top_contrib=10, - bar_width=None, - add_grid=True, - color="steelblue", - short_labels=False, - ax=None) -> plt: - - """ Plot the row and column contributions graph - - For the selected axis, the graph represents the row or column - contributions sorted in descending order. - - Parameters - ---------- - choice : {'ind','var','mod'}. - 'ind' : individuals - 'var' : continuous/categorical variables - 'mod' : categories - - axis : None or int. - Select the axis for which the row/col contributions are plotted. If None, axis = 0. - - xlabel : None or str (default). - The label text. - - top_contrib : None or int. - Set the maximum number of values to plot. - If top_contrib is None : all the values are plotted. - - bar_width : None, float or array-like. - The width(s) of the bars. - - add_grid : bool or None, default = True. - Whether to show the grid lines. - - color : color or list of color, default = "steelblue". - The colors of the bar faces. - - short_labels : bool, default = False - If True, use the short category labels. - - ax : matplotlib Axes, optional - Axes in which to draw the plot, otherwise use the currently-active Axes. 
- - Returns - ------- - None - """ - - if choice not in ["ind","var","mod"]: - raise ValueError("Error : 'choice' must be one of 'ind', 'var' or 'mod'.") - - if axis is None: - axis = 0 - elif not isinstance(axis,int): - raise ValueError("Error : 'axis' must be an integer.") - elif axis < 0 or axis > self.n_components_ - 1: - raise ValueError(f"Error : 'axis' must be an integer between 0 and {self.n_components_ - 1}.") - - if ax is None: - ax = plt.gca() - if xlabel is None: - xlabel = "Contributions (%)" - - if bar_width is None: - bar_width = 0.5 - if top_contrib is None: - top_contrib = 10 - elif not isinstance(top_contrib,int): - raise ValueError("Error : 'top_contrib' must be an integer.") - - if choice == "ind": - name = "individuals" - contrib = self.row_contrib_[:,axis] - labels = self.row_labels_ - elif choice == "var" and self.model_ != "mca": - name = "continuous variables" - contrib = self.col_contrib_[:,axis] - labels = self.col_labels_ - if self.model_ == "famd": - contrib = np.append(contrib,self.var_contrib_[:,axis],axis=0) - labels = labels + self.quali_labels_ - elif choice == "mod" and self.model_ in ["mca","famd"]: - name = "categories" - contrib = self.mod_contrib_[:,axis] - if short_labels: - labels = self.short_labels_ - else: - labels = self.mod_labels_ - - n = len(labels) - n_labels = len(labels) - - if top_contrib < n_labels: - n_labels = top_contrib - - limit = n - n_labels - contrib_sorted = np.sort(contrib)[limit:n] - labels_sort = pd.Series(labels)[np.argsort(contrib)][limit:n] - r = np.arange(n_labels) - ax.barh(r,contrib_sorted,height=bar_width,color=color,align="edge") - ax.set_yticks([x + bar_width/2 for x in r], labels_sort) - ax.set(title=f"Contribution of {name} to Dim-{axis+1}",xlabel=xlabel,ylabel=name) - ax.grid(visible=add_grid) - - -################################################################################################ -# PLOT CORRELATION CIRCLE -################################################################################################ - -def plot_correlation_circle(self, - axis=[0,1], - title =None, - color="blue", - add_grid =True, - color_map ="jet", - add_hline = True, - add_vline=True, - ha="center", - va="center", - add_circle=True, - quanti_sup=True, - color_sup = "red", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - patch_color = "black", - repel=False, - ax=None) -> plt: - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - xs = self.col_coord_[:,axis[0]] - ys = self.col_coord_[:,axis[1]] - - if ax is None: - ax = plt.gca() - if color == "cos2": - c = np.sum(self.col_cos2_,axis=1) - elif color == "contrib": - c = np.sum(self.col_contrib_,axis=1) - - if color in ["cos2","contrib"]: - cNorm = mcolors.Normalize(vmin=np.min(c), vmax=np.max(c)) - scalarMap = cm.ScalarMappable(norm=cNorm,cmap=plt.get_cmap(color_map)) - - if color in ["cos2","contrib"]: - if repel: - texts = list() - for j, lab in enumerate(self.col_labels_): - colorVal = scalarMap.to_rgba(c[j]) - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=colorVal) - texts.append(ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=colorVal)) - #plt.colorbar(p).ax.set_title(label=color,weight='bold') - #cb=mpl.colorbar.ColorbarBase(ax,cmap=plt.get_cmap(color_map),norm=cNorm,orientation='vertical') - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: 
- for j, lab in enumerate(self.col_labels_): - colorVal = scalarMap.to_rgba(c[j]) - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=colorVal) - #plt.colorbar(p).ax.set_title(label=color,weight='bold') - #cb=mpl.colorbar.ColorbarBase(ax,cmap=plt.get_cmap(color_map),norm=cNorm,orientation='vertical') - ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=colorVal) - else: - if repel: - texts = list() - for j, lab in enumerate(self.col_labels_): - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=color) - texts.append(ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for j, lab in enumerate(self.col_labels_): - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=color) - ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=color) - - if quanti_sup: - if self.quanti_sup_labels_ is not None: - xxs = self.col_sup_coord_[:,axis[0]] - yys = self.col_sup_coord_[:,axis[1]] - # Add labels - if repel: - texts = list() - for j, lab in enumerate(self.quanti_sup_labels_): - ax.arrow(0,0,xxs[j],yys[j],head_width=0.02,length_includes_head=True,color=color_sup) - texts.append(ax.text(xxs[j],yys[j],lab,ha=ha,va=va,color=color_sup)) - adjust_text(texts,x=xxs,y=yys,arrowprops=dict(arrowstyle="->",color=color_sup,lw=1.0),ax=ax) - else: - for j, lab in enumerate(self.quanti_sup_labels_): - ax.arrow(0,0,xxs[j],yys[j],head_width=0.02,length_includes_head=True,color=color_sup) - ax.text(xxs[j],yys[j],lab,ha=ha,va=va,color=color_sup) - if add_circle: - ax.add_patch(plt.Circle((0,0),1, color=patch_color,fill=False)) - - if title is None : - title = "Correlation circle" - - # Add elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - ax.grid(visible=add_grid) - ax.set(xlabel=xlabel,ylabel=ylabel,title=title,xlim=(-1.1,1.1),ylim=(-1.1,1.1)) - if add_hline: - ax.axhline(y=0,color=hline_color,linestyle=hline_style) - if add_vline: - ax.axvline(x=0,color=vline_color,linestyle=vline_style) - - -###################################################################################### -# PLOT COSINES -##################################################################################### - -def plot_cosines(self, - choice="ind", - axis=None, - xlabel=None, - top_cos2=10, - bar_width=None, - add_grid=True, - color="steelblue", - short_labels=False, - ax=None) -> plt: - - """ Plot the row and columns cosines graph - - For the selected axis, the graph represents the row or column - cosines sorted in descending order. - - Parameters - ---------- - choice : {'ind','var','mod','quanti_sup','quali_sup','ind_sup'} - 'ind' : individuals - 'var' : continues variables - 'mod' : categories - 'quanti_sup' : supplementary continues variables - 'quali_sup' : supplementary categories variables - 'ind_sup ' : supplementary individuals - - axis : None or int - Select the axis for which the row/col cosines are plotted. If None, axis = 0. - - xlabel : None or str (default) - The label text. - - top_cos2 : int - Set the maximum number of values to plot. - If top_cos2 is None : all the values are plotted. - - bar_width : None, float or array-like. - The width(s) of the bars. - - add_grid : bool or None, default = True. - Whether to show the grid lines - - color : color or list of color, default = "steelblue". - The colors of the bar faces. 
- - short_labels : bool, default = False - If True, use the short category labels. - - ax : matplotlib Axes, optional - Axes in which to draw the plot, otherwise use the currently-active Axes. - - Returns - ------- - None - """ - - if choice not in ["ind","var","mod","quanti_sup","quali_sup","ind_sup"]: - raise ValueError("Error : 'choice' must be one of 'ind', 'var', 'mod', 'quanti_sup', 'quali_sup' or 'ind_sup'.") - - if axis is None: - axis = 0 - elif not isinstance(axis,int): - raise ValueError("Error : 'axis' must be an integer.") - elif axis < 0 or axis > self.n_components_ - 1: - raise ValueError(f"Error : 'axis' must be an integer between 0 and {self.n_components_ - 1}") - - if ax is None: - ax = plt.gca() - if xlabel is None: - xlabel = "Cos2 - Quality of representation" - if bar_width is None: - bar_width = 0.5 - if top_cos2 is None: - top_cos2 = 10 - - if choice == "ind": - name = "individuals" - cos2 = self.row_cos2_[:,axis] - labels = self.row_labels_ - elif choice == "var" and self.model_ != "mca": - name = "continuous variables" - cos2 = self.col_cos2_[:,axis] - labels = self.col_labels_ - elif choice == "mod" and self.model_ in ["mca","famd"]: - name = "categories" - cos2 = self.mod_cos2_[:,axis] - if short_labels: - labels = self.short_labels_ - else: - labels = self.mod_labels_ - elif choice == "quanti_sup" and self.model_ != "ca": - if ((self.quanti_sup_labels_ is not None) and (len(self.col_sup_labels_) >= 2)): - name = "supplementary continuous variables" - cos2 = self.col_sup_cos2_[:,axis] - labels = self.col_sup_labels_ - else: - raise ValueError("Error : 'quanti_sup' requires at least two supplementary continuous variables.") - elif choice == "quali_sup" and self.model_ !="ca": - if self.quali_sup_labels_ is not None: - name = "supplementary categories" - cos2 = self.mod_sup_cos2_[:,axis] - if short_labels: - labels = self.short_sup_labels_ - else: - labels = self.mod_sup_labels_ - - # Start - n = len(labels) - n_labels = len(labels) - if top_cos2 < n_labels: - n_labels = top_cos2 - - limit = n - n_labels - cos2_sorted = np.sort(cos2)[limit:n] - labels_sort = pd.Series(labels)[np.argsort(cos2)][limit:n] - r = np.arange(n_labels) - ax.barh(r,cos2_sorted,height=bar_width,color=color,align="edge") - ax.set_yticks([x + bar_width/2 for x in r], labels_sort) - ax.set(title=f"Cosines of {name} to Dim-{axis+1}",xlabel=xlabel,ylabel=name,xlim=(0,1)) - ax.grid(visible=add_grid) - - -############################################################################################## -# EXPLORATORY FACTOR ANALYSIS -############################################################################################### - -def plotEFA(self, - choice ="ind", - axis=[0,1], - xlim=(None,None), - ylim=(None,None), - title =None, - color="blue", - marker="o", - add_grid =True, - color_map ="jet", - add_hline = True, - add_vline=True, - ha="center", - va="center", - add_circle=True, - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - patch_color = "black", - repel=False, - ax=None) -> plt: - - """ Plot the Factor map for individuals and variables - - Parameters - ---------- - self : an instance of class EFA - choice : str - axis : tuple or list of two elements - xlim : tuple or list of two elements - ylim : tuple or list of two elements - title : str - color : str - marker : str - The marker style for active points - add_grid : bool - color_map : str - add_hline : bool - add_vline : bool - ha : horizontalalignment : {'left','center','right'} - va : verticalalignment {"bottom","baseline","center","center_baseline","top"} - hline_color : - hline_style : - vline_color : - vline_style : - ax : - 
**kwargs : Collection properties - - Returns - ------- - None - """ - - if self.model_ != "efa": - raise ValueError("Error : 'self' must be an instance of class EFA.") - - if choice not in ["ind","var"]: - raise ValueError("Error : 'choice' ") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - if ax is None: - ax = plt.gca() - - if choice == "ind": - coord = self.row_coord_[:,axis] - labels = self.row_labels_ - if title is None: - title = "Individuals factor map - EFA" - else: - coord = self.col_coord_[:,axis] - contrib = self.col_contrib_[:,axis] - labels = self.col_labels_ - if title is None: - title = "Variables factor map - EFA" - - # Extract coordinates - xs = coord[:,axis[0]] - ys = coord[:,axis[1]] - - if color == "contrib": - c = np.sum(contrib,axis=1) - - if color in ["contrib"]: - cNorm = mcolors.Normalize(vmin=np.min(c), vmax=np.max(c)) - scalarMap = cm.ScalarMappable(norm=cNorm,cmap=plt.get_cmap(color_map)) - - if choice == "ind": - if color in ["contrib"]: - raise NotImplementedError("Error : This method is not implemented yet.") - else: - ax.scatter(xs,ys,c=color,marker=marker) - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color) - else: - if color == "contrib": - if repel: - texts = list() - for j, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[j]) - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=colorVal) - #plt.colorbar(p).ax.set_title(label=color,weight='bold') - #cb=mpl.colorbar.ColorbarBase(ax,cmap=plt.get_cmap(color_map),norm=cNorm,orientation='vertical') - texts.append(ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for j, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[j]) - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=colorVal) - #plt.colorbar(p).ax.set_title(label=color,weight='bold') - #cb=mpl.colorbar.ColorbarBase(ax,cmap=plt.get_cmap(color_map),norm=cNorm,orientation='vertical') - ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=colorVal) - else: - if repel: - texts = list() - for j, lab in enumerate(labels): - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=color) - texts.append(ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for j, lab in enumerate(labels): - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=color) - ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=color) - - if add_circle: - ax.add_patch(plt.Circle((0,0),1,color=patch_color,fill=False)) - - if choice == "var": - xlim = ylim = (-1.1,1.1) - - # Add elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - ax.grid(visible=add_grid) - ax.set(xlabel=xlabel,ylabel=ylabel,title=title,xlim=xlim,ylim=ylim) - if add_hline: - ax.axhline(y=0,color=hline_color,linestyle=hline_style) - if add_vline: - ax.axvline(x=0,color=vline_color,linestyle=vline_style) - - 
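For orientation, the deleted factor-map helpers all share one calling convention: fit one of the package's estimators, then hand the fitted object to the plot function as `self`, optionally with a matplotlib Axes. A minimal usage sketch, assuming `efa` is an already-fitted EFA instance (a hypothetical variable; the data and fit step are not shown):

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
# Variables factor map, arrows shaded by total contribution over both axes
plotEFA(efa, choice="var", axis=[0, 1], color="contrib", repel=True, ax=ax)
plt.show()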
-############################################################################################### -# Plot Eigenvalues -############################################################################################### - -def plot_eigenvalues(self,choice ="proportion",n_components=10,title=None,xlabel=None,ylabel=None,bar_fill="steelblue", - bar_color = "steelblue",line_color="black",line_style="dashed",bar_width=None, - add_kaiser=False, add_kss = False, add_broken_stick = False,add_grid=True, - add_labels=False, ha = "center",va = "bottom",ax=None): - - """ Plot the eigenvalues graph - - Parameters - ---------- - choice : string - Select the graph to plot : - - If "eigenvalue" : plot the eigenvalues. - - If "proportion" : plot the percentage of variance. - n_components : int, maximum number of components to plot. - title : str, the plot title. - xlabel : str, the x-axis label. - ylabel : str, the y-axis label. - bar_fill : fill color of the bars. - bar_color : edge color of the bars. - line_color : color of the line joining the bar tops. - line_style : style of that line. - bar_width : float, width of the bars. - add_kaiser : bool, draw the Kaiser threshold. - add_kss : bool, draw the Karlis - Saporta - Spinaki threshold. - add_broken_stick : bool, draw the broken stick threshold. - add_grid : bool, whether to show the grid lines. - add_labels : bool, annotate each bar with its value. - ha, va : text alignment for the annotations. - ax : matplotlib Axes, optional. - - Returns - ------- - None - """ - - if self.model_ == "mds": - raise ValueError("Error : 'plot_eigenvalues' is not allowed for an instance of class MDS.") - - if choice not in ["eigenvalue","proportion"]: - raise ValueError("Error : Allowed values are 'eigenvalue' or 'proportion'.") - - # Default to the current Axes - if ax is None: - ax = plt.gca() - if add_kaiser: - add_kss = False - add_broken_stick = False - elif add_kss: - add_kaiser = False - add_broken_stick = False - elif add_broken_stick: - add_kaiser = False - add_kss = False - - ncp = min(n_components,self.n_components_) - if choice == "eigenvalue": - eig = self.eig_[0][:ncp] - text_labels = list([str(np.around(x,3)) for x in eig]) - if self.model_ != "cmds": - kaiser = self.kaiser_threshold_ - if self.model_ in ["pca","ppca","efa"]: - kss = self.kss_threshold_ - bst = self.broken_stick_threshold_[:ncp] - if ylabel is None: - ylabel = "Eigenvalue" - elif choice == "proportion": - eig = self.eig_[2][:ncp] - text_labels = list([str(np.around(x,1))+"%" for x in eig]) - if self.model_ != "cmds": - kaiser = self.kaiser_proportion_threshold_ - else: - raise ValueError("Error : 'choice' variable must be 'eigenvalue' or 'proportion'.") - - if bar_width is None: - bar_width = 0.5 - elif not isinstance(bar_width,float): - raise ValueError("Error : 'bar_width' variable must be a float.") - - xs = pd.Categorical(np.arange(1,ncp+1)) - ys = eig - - ax.bar(xs,ys,color=bar_fill,edgecolor=bar_color,width=bar_width) - ax.plot(xs,ys,marker="o",color=line_color,linestyle=line_style) - if add_labels: - for i, lab in enumerate(text_labels): - ax.text(xs[i],ys[i],lab,ha=ha,va=va) - - if add_kaiser: - ax.plot([1,ncp],[kaiser,kaiser],linestyle="dashed",color="red",label="Kaiser threshold") - ax.legend() - - if choice == "eigenvalue": - if add_kss : - if self.model_ in ["pca","ppca","efa"]: - ax.plot([1,ncp],[kss,kss],linestyle="dashed",color="red",label="Karlis - Saporta - Spinaki threshold") - ax.legend() - else: - raise ValueError(f"Error : 'add_kss' is not allowed for an instance of class {self.model_.upper()}.") - - if add_broken_stick: - if self.model_ in ["pca","ppca","efa"]: - ax.plot(xs,bst,marker="o",color="red",linestyle="dashed",label ="Broken stick threshold") - ax.legend() - else: - raise ValueError(f"Error : 'add_broken_stick' is not allowed for an instance of class {self.model_.upper()}.") - - if title is None: - title = "Scree plot" - if xlabel is None: - xlabel = "Dimensions" - if ylabel is None: - ylabel = "Percentage of explained variance" - - # Set - ax.set(xlabel=xlabel,ylabel=ylabel,title=title,xticks=xs) - ax.grid(visible=add_grid) - - 
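A usage sketch for the scree plot, again with a hypothetical fitted result `pca` (any estimator exposing the `eig_`, `n_components_` and, outside of CMDS, the Kaiser threshold attributes this function reads):

fig, ax = plt.subplots()
# Eigenvalues with the Kaiser threshold drawn as a reference line
plot_eigenvalues(pca, choice="eigenvalue", n_components=10, add_kaiser=True, add_labels=True, ax=ax)
plt.show()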
-###################################################################################################################################" -# PLOT FACTOR ANALYSIS OF MIXED DATA -####################################################################################################################"" - -def plotFAMD(self, - choice ="ind", - axis=[0,1], - xlim=None, - ylim=None, - title=None, - color="blue", - marker="o", - add_grid =True, - ind_sup=False, - color_sup = "red", - marker_sup ="^", - hotelling_ellipse=False, - habillage=None, - short_labels=False, - add_mod_sup=True, - color_map ="jet", - add_hline = True, - add_vline=True, - ha="center", - va="center", - add_circle=True, - quanti_sup=True, - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - patch_color = "black", - random_state=None, - repel=False, - ax=None, - **kwargs): - """ - - """ - - if self.model_ != "famd": - raise ValueError("Error : 'self' must be an instance of class FAMD.") - - if choice not in ["ind","var","col","mod"]: - raise ValueError("Error : 'choice' ") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - if ax is None: - ax = plt.gca() - - if choice == "ind": - coord = self.row_coord_[:,axis] - cos2 = self.row_cos2_[:,axis] - contrib = self.row_contrib_[:,axis] - labels = self.row_labels_ - if title is None: - title = "Individuals factor map - FAMD" - elif choice == "col": - coord = self.col_coord_[:,axis] - cos2 = self.col_cos2_[:,axis] - contrib = self.col_contrib_[:,axis] - labels = self.col_labels_ - if title is None: - title = "Graph of continuous variables - FAMD" - elif choice == "mod": - coord = self.mod_coord_[:,axis] - cos2 = self.mod_cos2_[:,axis] - contrib = self.mod_contrib_[:,axis] - if short_labels: - labels = self.short_labels_ - else: - labels = self.mod_labels_ - if title is None: - title = "Graph of the categories - FAMD" - elif choice == "var": - coord = self.col_cos2_[:,axis] - contrib = np.append(self.col_contrib_[:,axis],self.var_contrib_[:,axis],axis=0) - if title is None: - title = "Graphe of variables - FAMD" - else: - raise ValueError("Error : Allowed values are 'ind', 'col', 'mod' and 'var'.") - - # Extract coordinates - xs = coord[:,axis[0]] - ys = coord[:,axis[1]] - - if choice in ["ind","mod"]: - if xlim is None: - xlim = list([np.min(xs)-0.05,np.max(xs)+0.05]) - if ylim is None: - ylim = list([np.min(ys)-0.05,np.max(ys)+0.05]) - elif choice == "var": - xlim = (-0.1,1.1) - ylim = (-0.1,1.1) - else: - xlim = (-1.1,1.1) - ylim = (-1.1,1.1) - - #if choice in ["ind","mod","col"]: - if color == "cos2" and choice != "var": - c = np.sum(cos2,axis=1) - elif color == "contrib": - c = np.sum(contrib,axis=1) - if color in ["cos2","contrib"]: - cNorm = mcolors.Normalize(vmin=np.min(c), vmax=np.max(c)) - scalarMap = cm.ScalarMappable(norm=cNorm,cmap=plt.get_cmap(color_map)) - - if choice == "ind": - if habillage is None: - if color in ["cos2","contrib"]: - p = ax.scatter(xs,ys,c=c,s=len(c),marker=marker,cmap=plt.get_cmap(color_map),**kwargs) - plt.colorbar(p).ax.set_title(label=color,weight='bold') - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[i]) - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - colorVal = 
scalarMap.to_rgba(c[i]) - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal) - - else: - ax.scatter(xs,ys,c=color,marker=marker,**kwargs) - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color) - - else: - # Add Categorical variable - if self.quali_sup_labels_ is not None: - color_list=list([x[4:] for x in list(mcolors.TABLEAU_COLORS.keys())]) - marker_list = list(['.', 'o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']) - vsQual = self.data_[habillage] - modality_list = list(np.unique(vsQual)) - random.seed(random_state) - color_dict = dict(zip(modality_list,random.sample(color_list,len(modality_list)))) - marker_dict = dict(zip(modality_list,random.sample(marker_list,len(modality_list)))) - for group in modality_list: - idx = np.where(vsQual==group) - ax.scatter(xs[idx[0]],ys[idx[0]],label=group,c= color_dict[group],marker = marker_dict[group]) - if repel: - texts=list() - for i in idx[0]: - texts.append(ax.text(xs[i],ys[i],labels[i],c=color_dict[group],ha=ha,va=va)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color_dict[group],lw=1.0),ax=ax) - else: - for i in idx[0]: - ax.text(xs[i],ys[i],labels[i],c=color_dict[group],ha=ha,va=va) - box = ax.get_position() - ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) - # Put a legend to the right of the current axis - ax.legend(loc="center right",title=habillage, bbox_to_anchor=(1, 0.5),fancybox=True, shadow=True) - - if ind_sup: - if self.row_sup_labels_ is not None: - # Reset xlim and ylim - xxs = self.row_sup_coord_[:,axis[0]] - yys = self.row_sup_coord_[:,axis[1]] - # Add supplementary row coordinates - ax.scatter(xxs,yys,c=color_sup,marker=marker_sup) - if repel: - texts = list() - for i,lab in enumerate(self.row_sup_labels_): - texts.append(ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color_sup,lw=1.0),ax=ax) - else: - for i,lab in enumerate(self.row_sup_labels_): - ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup) - # Add Hotelling Ellipse - if hotelling_ellipse: - num = len(axis)*(len(xs)**2-1)*st.f.ppf(0.95,len(axis),len(xs)-len(axis)) - denum = len(xs)*(len(xs)-len(axis)) - c = num/denum - e1 = 2*math.sqrt(self.eig_[0][axis[0]]*c) - e2 = 2*math.sqrt(self.eig_[0][axis[1]]*c) - # Add Epplipse - ellipse = Ellipse((0,0),width=e1,height=e2,facecolor="none",edgecolor="tomato",linestyle="--") - ax.add_patch(ellipse) - elif choice == "col": - if color in ["cos2","contrib"]: - if repel: - texts = list() - for j, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[j]) - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=colorVal) - #plt.colorbar(p).ax.set_title(label=color,weight='bold') - #cb=mpl.colorbar.ColorbarBase(ax,cmap=plt.get_cmap(color_map),norm=cNorm,orientation='vertical') - texts.append(ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for j, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[j]) - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=colorVal) - #plt.colorbar(p).ax.set_title(label=color,weight='bold') - 
#cb=mpl.colorbar.ColorbarBase(ax,cmap=plt.get_cmap(color_map),norm=cNorm,orientation='vertical') - ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=colorVal) - else: - if repel: - texts = list() - for j, lab in enumerate(labels): - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=color) - texts.append(ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for j, lab in enumerate(labels): - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=color) - ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=color) - - if quanti_sup: - if self.quanti_sup_labels_ is not None: - xxs = self.col_sup_coord_[:,axis[0]] - yys = self.col_sup_coord_[:,axis[1]] - # Add labels - if repel: - texts=list() - for j, lab in enumerate(self.col_sup_labels_): - ax.arrow(0,0,xxs[j],yys[j],head_width=0.02,length_includes_head=True,color=color_sup) - texts.append(ax.text(xxs[j],yys[j],lab,ha=ha,va=va,color=color_sup)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color_sup,lw=1.0),ax=ax) - else: - for j, lab in enumerate(self.quanti_sup_labels_): - ax.arrow(0,0,xxs[j],yys[j],head_width=0.02,length_includes_head=True,color=color_sup) - ax.text(xxs[j],yys[j],lab,ha=ha,va=va,color=color_sup) - if add_circle: - ax.add_patch(plt.Circle((0,0),1,color=patch_color,fill=False)) - elif choice == "mod": - if color in ["cos2","contrib"]: - p = ax.scatter(xs,ys,c=c,s=len(c),marker=marker,cmap=plt.get_cmap(color_map),**kwargs) - plt.colorbar(p).ax.set_title(label=color,weight='bold') - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[i]) - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[i]) - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal) - - else: - ax.scatter(xs,ys,c=color,marker=marker,**kwargs) - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color) - - if add_mod_sup: - if self.quali_sup_labels_ is not None: - # Reset xlim and ylim - xxs = self.mod_sup_coord_[:,axis[0]] - yys = self.mod_sup_coord_[:,axis[1]] - # Add supplementary row coordinates - ax.scatter(xxs,yys,color=color_sup,marker=marker_sup) - if repel: - texts = list() - for i,lab in enumerate(self.mod_sup_labels_): - texts.append(ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color_sup,lw=1.0),ax=ax) - else: - for i,lab in enumerate(self.mod_sup_labels_): - ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup) - else: - # Add qualitative correlation ratio - xxs = self.var_eta2_[:,axis[0]] - yys = self.var_eta2_[:,axis[1]] - if color == "contrib": - # Append all informations - xs = np.append(xs,xxs,axis=0) - ys = np.append(ys,yys,axis=0) - # Labels - labels = self.quanti_labels_ + self.quali_labels_ - # Scatter plot - p = ax.scatter(xs,ys,c=c,s=len(c),marker=marker,cmap=plt.get_cmap(color_map),**kwargs) - plt.colorbar(p).ax.set_title(label=color,weight='bold') - if repel: - texts = list() - for i, lab in enumerate(labels): - colorVal = 
scalarMap.to_rgba(c[i]) - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[i]) - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal) - elif color == "cos2": - raise ValueError("Error : 'cos2' is not allowed.") - else: - ax.scatter(xs,ys, color="blue",marker=">") - ax.scatter(xxs,yys, color="red",marker = "^") - if repel: - texts1 = list() - for i, lab in enumerate(self.quanti_labels_): - texts1.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color="blue")) - adjust_text(texts1,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color="blue",lw=1.0),ax=ax) - texts2 = list() - for j,lab in enumerate(self.quali_labels_): - texts2.append(ax.text(xxs[j],yys[j],lab,ha=ha,va=va,color="red")) - adjust_text(texts2,x=xxs,y=yys,arrowprops=dict(arrowstyle="->",color="red",lw=1.0),ax=ax) - else: - for i, lab in enumerate(self.quanti_labels_): - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color="blue") - for j, lab in enumerate(self.quali_labels_): - ax.text(xxs[j],yys[j],lab,ha=ha,va=va,color="red") - - # Add elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - ax.grid(visible=add_grid) - ax.set(xlabel=xlabel,ylabel=ylabel,title=title,xlim=xlim,ylim=ylim) - if add_hline: - ax.axhline(y=0,color=hline_color,linestyle=hline_style) - if add_vline: - ax.axvline(x=0,color=vline_color,linestyle=vline_style) - - -#####################################################################################" -# PLOT MULTIPLE CORRESPONDANCE ANALYSIS (MCA) -####################################################################################### - -def plotMCA(self, - choice ="ind", - axis=[0,1], - xlim=(None,None), - ylim=(None,None), - title=None, - color="blue", - marker="o", - add_grid =True, - ind_sup=False, - color_sup = "red", - marker_sup ="^", - hotelling_ellipse=False, - habillage=None, - short_labels=False, - add_mod_sup=True, - color_map ="jet", - add_hline = True, - add_vline =True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - random_state=None, - repel=False, - ax=None, - **kwargs): - - if self.model_ != "mca": - raise ValueError("Error : 'self' must be an instance of class MCA.") - - if choice not in ["ind","mod","var"]: - raise ValueError("Error : 'choice' ") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - if ax is None: - ax = plt.gca() - - if choice == "ind": - coord = self.row_coord_[:,axis] - cos2 = self.row_cos2_[:,axis] - contrib = self.row_contrib_[:,axis] - labels = self.row_labels_ - if title is None: - title = "Individuals - MCA" - elif choice == "mod": - coord = self.mod_coord_[:,axis] - cos2 = self.mod_cos2_[:,axis] - contrib = self.mod_contrib_[:,axis] - if short_labels: - labels = self.short_labels_ - else: - labels = self.mod_labels_ - if title is None: - title = "Qualitatives variables categories - MCA" - elif choice == "var": - coord = self.var_eta2_[:,axis] - cos2 = self.var_cos2_[:,axis] - contrib = self.var_contrib_[:,axis] - labels = self.var_labels_ - if title is None: - title = "Graphe of variables - MCA" - else: - raise ValueError("Error : 'choice'") - - # Extract coordinates - xs = 
coord[:,axis[0]] - ys = coord[:,axis[1]] - - if color == "cos2": - gradient = np.sum(cos2,axis=1) - elif color == "contrib": - gradient = np.sum(contrib,axis=1) - - # Set colors - if color in ["cos2","contrib"]: - cNorm = mcolors.Normalize(vmin=np.min(gradient), vmax=np.max(gradient)) - scalarMap = cm.ScalarMappable(norm=cNorm,cmap=plt.get_cmap(color_map)) - - if choice == "ind": - if habillage is None: - if color in ["cos2","contrib"]: - p = ax.scatter(xs,ys,c=gradient,s=len(gradient),marker=marker,cmap=plt.get_cmap(color_map),**kwargs) - plt.colorbar(p).ax.set_title(label=color,weight='bold') - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(gradient[i]) - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(gradient[i]) - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal) - - else: - ax.scatter(xs,ys,c=color,marker=marker,**kwargs) - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color) - - else: - # Add Categorical variable - if self.quali_sup_labels_ is not None: - color_list=list([x[4:] for x in list(mcolors.TABLEAU_COLORS.keys())]) - marker_list = list(['.', 'o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']) - vsQual = self.data_[habillage] - modality_list = list(np.unique(vsQual)) - random.seed(random_state) - color_dict = dict(zip(modality_list,random.sample(color_list,len(modality_list)))) - marker_dict = dict(zip(modality_list,random.sample(marker_list,len(modality_list)))) - for group in modality_list: - idx = np.where(vsQual==group) - ax.scatter(xs[idx[0]],ys[idx[0]],label=group,c= color_dict[group],marker = marker_dict[group]) - if repel: - texts=list() - for i in idx[0]: - texts.append(ax.text(xs[i],ys[i],labels[i],c=color_dict[group],ha=ha,va=va)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color_dict[group],lw=1.0),ax=ax) - else: - for i in idx[0]: - ax.text(xs[i],ys[i],labels[i],c=color_dict[group],ha=ha,va=va) - box = ax.get_position() - ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) - # Put a legend to the right of the current axis - ax.legend(loc="center right",title=habillage, bbox_to_anchor=(1, 0.5),fancybox=True, shadow=True) - - if ind_sup: - if self.row_sup_labels_ is not None: - # Reset xlim and ylim - xxs = self.row_sup_coord_[:,axis[0]] - yys = self.row_sup_coord_[:,axis[1]] - # Add supplementary row coordinates - ax.scatter(xxs,yys,c=color_sup,marker=marker_sup) - if repel: - texts = list() - for i,lab in enumerate(self.row_sup_labels_): - texts.append(ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color_sup,lw=1.0),ax=ax) - else: - for i,lab in enumerate(self.row_sup_labels_): - ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup) - # Add Hotelling Ellipse - if hotelling_ellipse: - num = len(axis)*(len(xs)**2-1)*st.f.ppf(0.95,len(axis),len(xs)-len(axis)) - denum = len(xs)*(len(xs)-len(axis)) - c = num/denum - e1 = 2*math.sqrt(self.eig_[0][axis[0]]*c) - e2 = 2*math.sqrt(self.eig_[0][axis[1]]*c) - # Add Epplipse - 
ellipse = Ellipse((0,0),width=e1,height=e2,facecolor="none",edgecolor="tomato",linestyle="--") - ax.add_patch(ellipse) - elif choice == "mod": - if color in ["cos2","contrib"]: - p = ax.scatter(xs,ys,c=gradient,s=len(gradient),marker=marker,cmap=plt.get_cmap(color_map),**kwargs) - plt.colorbar(p).ax.set_title(label=color,weight='bold') - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(gradient[i]) - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(gradient[i]) - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal) - - else: - ax.scatter(xs,ys,c=color,marker=marker,**kwargs) - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color) - - if add_mod_sup: - if self.quali_sup_labels_ is not None: - # Reset xlim and ylim - xxs = self.mod_sup_coord_[:,axis[0]] - yys = self.mod_sup_coord_[:,axis[1]] - # Add supplementary row coordinates - ax.scatter(xxs,yys,color=color_sup,marker=marker_sup) - # - if short_labels: - mod_sup_labels = self.short_sup_labels_ - else: - mod_sup_labels = self.mod_sup_labels_ - if repel: - texts = list() - for i,lab in enumerate(mod_sup_labels): - texts.append(ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color_sup,lw=1.0),ax=ax) - else: - for i,lab in enumerate(mod_sup_labels): - ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup) - else: - if color not in ["cos2","contrib"]: - ax.scatter(xs,ys, color=color,marker=marker) - if repel: - texts = list() - for i, lab in enumerate(labels): - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color) - else: - p = ax.scatter(xs,ys,c=gradient,s=len(gradient),marker=marker,cmap=plt.get_cmap(color_map),**kwargs) - plt.colorbar(p).ax.set_title(label=color,weight='bold') - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(gradient[i]) - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(gradient[i]) - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal) - # Add elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - ax.grid(visible=add_grid) - ax.set(xlabel=xlabel,ylabel=ylabel,title=title,xlim=xlim,ylim=ylim) - if add_hline: - ax.axhline(y=0,color=hline_color,linestyle=hline_style) - if add_vline: - ax.axvline(x=0,color=vline_color,linestyle=vline_style) - - -################################################################################ -# PLOT MULTIDIMENSIONAL SCALING (MDS) -############################################################################### - -def plotMDS(self, - axis=[0,1], - 
xlim=(None,None), - ylim=(None,None), - title =None, - xlabel=None, - ylabel=None, - color="blue", - marker="o", - add_grid =True, - add_hline = True, - add_vline=True, - ha="center", - va="center", - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - repel=False, - ax=None) -> plt: - - if self.model_ != "mds": - raise ValueError("Error : 'self' must be an instance of class MDS.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - if ax is None: - ax = plt.gca() - - xs = self.coord_[:,axis[0]] - ys = self.coord_[:,axis[1]] - ax.scatter(xs,ys,color=color,marker=marker) - if repel: - texts = list() - for i,lab in enumerate(self.labels_): - texts.append(ax.text(xs[i],ys[i],lab,color=color,ha=ha,va=va)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i,lab in enumerate(self.labels_): - ax.text(xs[i],ys[i],lab,color=color,ha=ha,va=va) - - if title is None: - title = "Multidimensional scaling" - - # Add elements - ax.grid(visible=add_grid) - ax.set(xlabel=xlabel,ylabel=ylabel,title=title,xlim=xlim,ylim=ylim) - if add_hline: - ax.axhline(y=0,color=hline_color,linestyle=hline_style) - if add_vline: - ax.axvline(x=0,color=vline_color,linestyle=vline_style) - - -#################################################################################### -# PRINCIPAL COMPONENTS ANALYSIS -#################################################################################### - -def plotPCA(self,choice ="ind",axis=[0,1],xlim=(None,None),ylim=(None,None),title =None,color="blue",marker="o", - add_grid =True,ind_sup=False,color_sup = "red",marker_sup ="^",hotelling_ellipse=False, - habillage = None,short_labels=True,color_map ="jet",add_hline = True,add_vline=True,ha="center",va="center", - add_circle=True,quanti_sup=True,hline_color="black",hline_style="dashed",vline_color="black", - vline_style ="dashed",patch_color = "black", - random_state=None,repel=False,ax=None,**kwargs) -> plt: - - """ Plot the Factor map for individuals and variables - - Parameters - ---------- - self : an instance of class PCA - choice : str - axis : tuple or list of two elements - xlim : tuple or list of two elements - ylim : tuple or list of two elements - title : str - color : str - marker : str - The marker style for active points - add_grid : bool - ind_sup : bool - color_sup : str - The marker color for supplementary points - marker_sup : str - The marker style for supplementary points - color_map : str - add_hline : bool - add_vline : bool - ha : horizontalalignment : {'left','center','right'} - va : verticalalignment {"bottom","baseline","center","center_baseline","top"} - hline_color : - hline_style : - vline_color : - vline_style : - ax : - **kwargs : Collection properties - - Returns - ------- - None - """ - - if self.model_ != "pca": - raise ValueError("Error : 'self' must be an instance of class PCA.") - - if choice not in ["ind","var"]: - raise ValueError("Error : 'choice' must be one of 'ind' or 'var'.") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - if ax is None: - ax = plt.gca() - - if choice == "ind": - coord = self.row_coord_[:,axis] - cos2 = self.row_cos2_[:,axis] - contrib = self.row_contrib_[:,axis] - labels = self.row_labels_ - if title is None: - title = "Individuals factor map - PCA" - else: - coord = 
self.col_coord_[:,axis] - cos2 = self.col_cos2_[:,axis] - contrib = self.col_contrib_[:,axis] - labels = self.col_labels_ - if title is None: - title = "Variables factor map - PCA" - - # Extract coordinates - xs = coord[:,axis[0]] - ys = coord[:,axis[1]] - - if color == "cos2": - c = np.sum(cos2,axis=1) - elif color == "contrib": - c = np.sum(contrib,axis=1) - - if color in ["cos2","contrib"]: - cNorm = mcolors.Normalize(vmin=np.min(c), vmax=np.max(c)) - scalarMap = cm.ScalarMappable(norm=cNorm,cmap=plt.get_cmap(color_map)) - - if choice == "ind": - if habillage is None: - if color in ["cos2","contrib"]: - p = ax.scatter(xs,ys,c=c,s=len(c),marker=marker,cmap=plt.get_cmap(color_map),**kwargs) - plt.colorbar(p).ax.set_title(label=color,weight='bold') - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[i]) - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[i]) - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal) - - else: - ax.scatter(xs,ys,c=color,marker=marker,**kwargs) - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color) - else: - # Add Categorical variable - if self.quali_sup_labels_ is not None: - color_list=list([x[4:] for x in list(mcolors.TABLEAU_COLORS.keys())]) - marker_list = list(['.', 'o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']) - vsQual = self.data_[habillage] - modality_list = list(np.unique(vsQual)) - random.seed(random_state) - color_dict = dict(zip(modality_list,random.sample(color_list,len(modality_list)))) - marker_dict = dict(zip(modality_list,random.sample(marker_list,len(modality_list)))) - for group in modality_list: - idx = np.where(vsQual==group) - ax.scatter(xs[idx[0]],ys[idx[0]],label=group,c= color_dict[group],marker = marker_dict[group]) - if repel: - texts=list() - for i in idx[0]: - texts.append(ax.text(xs[i],ys[i],labels[i],c=color_dict[group],ha=ha,va=va)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color_dict[group],lw=1.0),ax=ax) - else: - for i in idx[0]: - ax.text(xs[i],ys[i],labels[i],c=color_dict[group],ha=ha,va=va) - box = ax.get_position() - ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) - # Put a legend to the right of the current axis - ax.legend(title=habillage, bbox_to_anchor=(1, 0.5),fancybox=True, shadow=True) - - if ind_sup: - if self.row_sup_labels_ is not None: - # Reset xlim and ylim - xxs = self.row_sup_coord_[:,axis[0]] - yys = self.row_sup_coord_[:,axis[1]] - # Add supplementary row coordinates - ax.scatter(xxs,yys,c=color_sup,marker=marker_sup) - if repel: - texts = list() - for i,lab in enumerate(self.row_sup_labels_): - texts.append(ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color_sup,lw=1.0),ax=ax) - else: - for i,lab in enumerate(self.row_sup_labels_): - ax.text(xxs[i],yys[i],lab,ha=ha,va=va,color=color_sup) - # Add Hotelling Ellipse - if hotelling_ellipse: - num = len(axis)*(len(xs)**2-1)*st.f.ppf(0.95,len(axis),len(xs)-len(axis)) - denum = len(xs)*(len(xs)-len(axis)) - c = 
num/denum - e1 = 2*math.sqrt(self.eig_[0][axis[0]]*c) - e2 = 2*math.sqrt(self.eig_[0][axis[1]]*c) - # Add Epplipse - ellipse = Ellipse((0,0),width=e1,height=e2,facecolor="none",edgecolor="tomato",linestyle="--") - ax.add_patch(ellipse) - if self.quali_sup_labels_ is not None: - if habillage is None: - xxs = np.array(self.mod_sup_coord_[:,axis[0]]) - yys = np.array(self.mod_sup_coord_[:,axis[1]]) - ax.scatter(xxs,yys,color="red") - if short_labels: - mod_sup_labels = self.short_sup_labels_ - else: - mod_sup_labels = self.mod_sup_labels_ - if repel: - texts =list() - for i,lab in enumerate(mod_sup_labels): - texts.append(ax.text(xxs[i],yys[i],lab,color="red")) - adjust_text(texts,x=xxs,y=yys,arrowprops=dict(arrowstyle="->",color="red",lw=1.0),ax=ax) - else: - for i,lab in enumerate(mod_sup_labels): - ax.text(xxs[i],yys[i],lab,color="red") - else: - if color in ["cos2","contrib"]: - if repel: - texts = list() - for j, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[j]) - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=colorVal) - #plt.colorbar(p).ax.set_title(label=color,weight='bold') - #cb=mpl.colorbar.ColorbarBase(ax,cmap=plt.get_cmap(color_map),norm=cNorm,orientation='vertical') - texts.append(ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for j, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[j]) - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=colorVal) - #plt.colorbar(p).ax.set_title(label=color,weight='bold') - #cb=mpl.colorbar.ColorbarBase(ax,cmap=plt.get_cmap(color_map),norm=cNorm,orientation='vertical') - ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=colorVal) - else: - if repel: - texts = list() - for j, lab in enumerate(labels): - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=color) - texts.append(ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for j, lab in enumerate(labels): - ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=color) - ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=color) - - if quanti_sup: - if self.quanti_sup_labels_ is not None: - xxs = self.col_sup_coord_[:,axis[0]] - yys = self.col_sup_coord_[:,axis[1]] - # Add labels - if repel: - texts=list() - for j, lab in enumerate(self.quanti_sup_labels_): - ax.arrow(0,0,xxs[j],yys[j],head_width=0.02,length_includes_head=True,color=color_sup) - texts.append(ax.text(xxs[j],yys[j],lab,ha=ha,va=va,color=color_sup)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color_sup,lw=1.0),ax=ax) - else: - for j, lab in enumerate(self.quanti_sup_labels_): - ax.arrow(0,0,xxs[j],yys[j],head_width=0.02,length_includes_head=True,color=color_sup) - ax.text(xxs[j],yys[j],lab,ha=ha,va=va,color=color_sup) - if add_circle: - ax.add_patch(plt.Circle((0,0),1,color=patch_color,fill=False)) - - if choice == "var": - xlim = ylim = (-1.1,1.1) - - # Add elements - proportion = self.eig_[2] - xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)" - ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)" - ax.grid(visible=add_grid) - ax.set(xlabel=xlabel,ylabel=ylabel,title=title,xlim=xlim,ylim=ylim) - if add_hline: - ax.axhline(y=0,color=hline_color,linestyle=hline_style) - if add_vline: - ax.axvline(x=0,color=vline_color,linestyle=vline_style) - - 
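The two plotPCA views are naturally drawn side by side: individuals shaded by squared cosines on the left, the correlation circle with supplementary variables on the right. A sketch under the same assumption of a hypothetical fitted instance `pca`:

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
plotPCA(pca, choice="ind", color="cos2", repel=True, ax=ax1)
plotPCA(pca, choice="var", quanti_sup=True, add_circle=True, ax=ax2)
plt.show()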
-######################################################################################################## -## PARTIAL PRINCIPAL COMPONENTS ANALYSIS -######################################################################################################## - -def plotPPCA(self, - choice ="ind", - axis=[0,1], - xlim=(None,None), - ylim=(None,None), - title =None, - color="blue", - marker="o", - add_grid =True, - color_map ="jet", - add_hline = True, - add_vline=True, - ha="center", - va="center", - add_circle=True, - hline_color="black", - hline_style="dashed", - vline_color="black", - vline_style ="dashed", - patch_color = "black", - repel=False, - ax=None,**kwargs) -> plt: - - """ Plot the Factor map for individuals and variables - - Parameters - ---------- - self : aninstance of class PCA - choice : {'ind', 'var'}, default = 'ind' - axis : tuple or list of two elements - xlim : tuple or list of two elements - ylim : tuple or list of two elements - title : str - color : str - marker : str - The marker style for active points - add_grid : bool - color_map : str - add_hline : bool - add_vline : bool - ha : horizontalalignment : {'left','center','right'} - va : verticalalignment {"bottom","baseline","center","center_baseline","top"} - hline_color : - hline_style : - vline_color : - vline_style : - ax : - **kwargs : Collection properties - - Returns - ------- - None - """ - - if self.model_ != "ppca": - raise ValueError("Error : 'self' must be an instance of class PPCA.") - - if choice not in ["ind","var"]: - raise ValueError("Error : 'choice' ") - - if ((len(axis) !=2) or - (axis[0] < 0) or - (axis[1] > self.n_components_-1) or - (axis[0] > axis[1])) : - raise ValueError("Error : You must pass a valid 'axis'.") - - if ax is None: - ax = plt.gca() - - if choice == "ind": - coord = self.row_coord_[:,axis] - cos2 = self.row_cos2_[:,axis] - contrib = self.row_contrib_[:,axis] - labels = self.row_labels_ - if title is None: - title = "Individuals factor map - Partial PCA" - else: - coord = self.col_coord_[:,axis] - cos2 = self.col_cos2_[:,axis] - contrib = self.col_contrib_[:,axis] - labels = self.col_labels_ - if title is None: - title = "Variables factor map - Partial PCA" - - # Extract coordinates - xs = coord[:,axis[0]] - ys = coord[:,axis[1]] - - if color == "cos2": - c = np.sum(cos2,axis=1) - elif color == "contrib": - c = np.sum(contrib,axis=1) - - if color in ["cos2","contrib"]: - cNorm = mcolors.Normalize(vmin=np.min(c), vmax=np.max(c)) - scalarMap = cm.ScalarMappable(norm=cNorm,cmap=plt.get_cmap(color_map)) - - if choice == "ind": - if color in ["cos2","contrib"]: - p = ax.scatter(xs,ys,c=c,s=len(c),marker=marker,cmap=plt.get_cmap(color_map),**kwargs) - plt.colorbar(p).ax.set_title(label=color,weight='bold') - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[i]) - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - colorVal = scalarMap.to_rgba(c[i]) - ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=colorVal) - - else: - ax.scatter(xs,ys,c=color,marker=marker,**kwargs) - # Add labels - if repel: - texts = list() - for i, lab in enumerate(labels): - texts.append(ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color)) - adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax) - else: - for i, lab in enumerate(labels): - 
- ax.text(xs[i],ys[i],lab,ha=ha,va=va,color=color)
- else:
- if color in ["cos2","contrib"]:
- if repel:
- texts = list()
- for j, lab in enumerate(labels):
- colorVal = scalarMap.to_rgba(c[j])
- ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=colorVal)
- texts.append(ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=colorVal))
- adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=colorVal,lw=1.0),ax=ax)
- else:
- for j, lab in enumerate(labels):
- colorVal = scalarMap.to_rgba(c[j])
- ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=colorVal)
- ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=colorVal)
- else:
- if repel:
- texts = list()
- for j, lab in enumerate(labels):
- ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=color)
- texts.append(ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=color))
- adjust_text(texts,x=xs,y=ys,arrowprops=dict(arrowstyle="->",color=color,lw=1.0),ax=ax)
- else:
- for j, lab in enumerate(labels):
- ax.arrow(0,0,xs[j],ys[j],head_width=0.02,length_includes_head=True,color=color)
- ax.text(xs[j],ys[j],lab,ha=ha,va=va,color=color)
-
- if add_circle:
- ax.add_patch(plt.Circle((0,0),1,color=patch_color,fill=False))
-
- if choice == "var":
- xlim = ylim = (-1.1,1.1)
-
- # Add elements
- proportion = self.eig_[2]
- xlabel = "Dim."+str(axis[0]+1)+" ("+str(round(proportion[axis[0]],2))+"%)"
- ylabel = "Dim."+str(axis[1]+1)+" ("+str(round(proportion[axis[1]],2))+"%)"
- ax.grid(visible=add_grid)
- ax.set(xlabel=xlabel,ylabel=ylabel,title=title,xlim=xlim,ylim=ylim)
- if add_hline:
- ax.axhline(y=0,color=hline_color,linestyle=hline_style)
- if add_vline:
- ax.axvline(x=0,color=vline_color,linestyle=vline_style)
-
-
-###########################################################################################
-# SHEPARD DIAGRAM
-###########################################################################################
-
-def plot_shepard(self,
- title=None,
- xlabel=None,
- ylabel=None,
- add_grid=True,
- ax=None):
- """Draws the Shepard diagram
-
- Parameters
- ----------
- self : an instance of class CMDS/MDS
- title : str, the plot title
- xlabel : str, the x-axis label
- ylabel : str, the y-axis label
- add_grid : boolean, default = True
- ax : matplotlib axis, default = None
-
- Returns
- -------
- None
-
- """
-
- if self.model_ not in ["cmds","mds"]:
- raise ValueError("Error : 'plot_shepard' is only allowed for multidimensional scaling (CMDS/MDS).")
- if ax is None:
- ax = plt.gca()
-
- # Input distances against themselves : the y = x reference
- ax.scatter(self.dist_,self.dist_,color="steelblue")
- # Fitted distances against input distances
- ax.scatter(self.dist_,self.res_dist_,color="steelblue")
-
- if title is None:
- title = "Shepard Diagram"
- if xlabel is None:
- xlabel = "input distance"
- if ylabel is None:
- ylabel = "output distance"
-
- ax.set(xlabel=xlabel,ylabel=ylabel,title=title)
- ax.grid(visible=add_grid)
-
diff --git a/build/lib/scientisttools/utils.py b/build/lib/scientisttools/utils.py deleted file mode 100644 index bb602a0..0000000 --- a/build/lib/scientisttools/utils.py +++ /dev/null @@ -1,550 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import matplotlib.pyplot as plt
-import networkx as nx
-from collections import defaultdict
-from typing import Hashable
-from pandas.core.frame import DataFrame
-import numpy as np
-import pandas as pd
-from sklearn.utils.validation import check_array
-from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
-import pingouin as pg
-from functools import partial
-import itertools
-
-def get_melt(X,level=-1, dropna=True):
- """Stack the prescribed level(s) from columns to index.
-
- Return a reshaped DataFrame or Series having a multi-level index with one or more
- new inner-most levels compared to the current DataFrame. The new inner-most levels
- are created by pivoting the columns of the current dataframe.
-
- Parameters
- ----------
- X : DataFrame
- level : int, str, list, default -1
- Level(s) to stack from the column axis onto the index axis,
- defined as one index or label, or a list of indices or labels.
- dropna : bool, default True
- Whether to drop rows in the resulting Frame/Series with missing values.
- Stacking a column level onto the index axis can create combinations of index
- and column values that are missing from the original dataframe.
-
- Returns
- -------
- Stacked DataFrame or Series.
-
- """
- if not isinstance(X,pd.DataFrame):
- raise TypeError(
- f"{type(X)} is not supported. Please convert to a DataFrame with "
- "pd.DataFrame. For more information see: "
- "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
-
- return X.stack(level=level, dropna=dropna).rename_axis(('Var1', 'Var2')).reset_index(name='value')
-
-# rough analogue of R's match.arg : keep the candidates of 'lst' that contain 'x'
-def match_arg(x, lst):
- return [elt for elt in lst if x in elt]
-
-
-def is_euclidean(D,plot=False,printf=False,tol=1e-07):
- """Confirmation of the Euclidean nature of a distance matrix by Gower's theorem.
-
- Parameters
- ----------
- D : Dissimilarities matrix
- plot : a logical value indicating whether the eigenvalues bar plot of the matrix of the term -(1/2)*d(i,j)^2 centred by rows and columns should be displayed.
- printf : a logical value indicating whether the eigenvalues of the matrix of the term -(1/2)*d(i,j)^2 centred by rows and columns should be printed.
- tol : a tolerance threshold : an eigenvalue is considered positive if it is larger than `-tol*lambda1` where `lambda1` is the largest eigenvalue.
-
- Returns
- -------
- bool
- True if the distance matrix is Euclidean, False otherwise.
-
- References
- ----------
- Gower, J.C. and Legendre, P. (1986) Metric and Euclidean properties of dissimilarity coefficients. Journal of Classification, 3, 5--48.
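-
- Examples
- --------
- A hypothetical check on three points of the real line : their pairwise
- distance matrix is Euclidean by construction, so by Gower's theorem the
- call should return True::
-
- import numpy as np
- from scipy.spatial.distance import pdist, squareform
- pts = np.array([[0.0], [1.0], [3.0]])
- D = squareform(pdist(pts))
- is_euclidean(D)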
-
- """
- D = np.array(D)
- n = D.shape[0]
- A = -0.5*np.multiply(D,D)
- H = np.identity(n) - (1/n)*np.ones(shape=(n,n))
- B = np.dot(H,np.dot(A,H))
- value, vector = np.linalg.eig(B)
- # np.linalg.eig returns the eigenvalues in no particular order : sort them decreasingly before applying Gower's criterion
- value = np.sort(np.real(value))[::-1]
- w0 = value[-1]/value[0]
- if plot:
- plt.bar(np.arange(len(value)),value)
- plt.show()
- if printf:
- print(value)
- return w0 > -tol
-
-def sim_dist(X, method="standard"):
- """Transforms a similarity matrix into a dissimilarity matrix
-
- Parameters
- ----------
- X : array of float, square matrix.
- method : {'standard','oneminus'}
-
- Returns
- -------
- D : Dissimilarities matrix
-
- """
- if X.shape[0] != X.shape[1]:
- raise ValueError("Error : 'X' must be a square matrix.")
- if method == "standard":
- D = np.zeros(shape=(X.shape[0],X.shape[0]))
- for i in np.arange(0,X.shape[0]):
- for j in np.arange(0,X.shape[0]):
- # d(i,j)^2 = s(i,i) + s(j,j) - 2*s(i,j)
- D[i,j] = np.sqrt(X[i,i] + X[j,j] - 2*X[i,j])
- elif method == "oneminus":
- D = 1 - X
- else:
- raise ValueError("Error : Allowed methods are 'standard' or 'oneminus'.")
- return D
-
-
-def bicenter_wt(X, row_wt=None,col_wt=None):
- """This function creates a doubly centred matrix.
-
- Parameters
- ----------
- X : a matrix with n rows and p columns
- row_wt : vector of positive weights for the rows, default = None (uniform weights)
- col_wt : vector of positive weights for the columns, default = None (uniform weights)
-
- Returns
- -------
- A doubly centred matrix
- """
- X = np.array(X)
- n, p = X.shape
- if row_wt is None:
- row_wt = np.repeat(a=1,repeats=n)
- if col_wt is None:
- col_wt = np.repeat(a=1,repeats=p)
- row_wt = np.array(row_wt)
- col_wt = np.array(col_wt)
- sr = sum(row_wt)
- row_wt = row_wt/sr
- st = sum(col_wt)
- col_wt = col_wt/st
- row_mean = np.apply_along_axis(func1d=np.sum,axis=0,arr=np.apply_along_axis(arr=X,func1d=lambda x : x*row_wt,axis=0))
- col_mean = np.apply_along_axis(func1d=np.sum,axis=0,arr=np.apply_along_axis(arr=np.transpose(X),func1d=lambda x : x*col_wt,axis=0))
- col_mean = col_mean - np.sum(row_mean * col_wt)
- X = np.apply_along_axis(func1d=lambda x : x - row_mean,axis=1,arr=X)
- X = np.transpose(np.apply_along_axis(func1d=lambda x : x - col_mean,axis=1,arr=np.transpose(X)))
- return X
-
-# Correlation ratio (eta squared)
-def eta2(categories, value,digits=4):
- K = len(np.unique(categories, return_inverse=True)[0])
- n = value.shape[0]
-
- cat = np.unique(categories, return_inverse=True)[1]
- values = np.array(value)
-
- scintra = 0
- scinter = 0
- for i in np.unique(cat):
- subgroup = values[np.argwhere(cat == i).flatten()]
- scintra += np.sum((subgroup-np.mean(subgroup))**2)
- scinter += len(subgroup)*(np.mean(subgroup)-np.mean(values))**2
-
- eta2 = scinter/(scinter+scintra)
- f_stat = (scinter/(K-1))/(scintra/(n-K))
- # compute the p-value from the F(K-1, n-K) distribution
- from scipy.stats import f
- pvalue = np.round(f.sf(f_stat, K-1, n-K),4)
- return dict({'Sum. Intra':round(scintra,digits),
- 'Sum. Inter':round(scinter,digits),
- 'correlation ratio':round(eta2,digits),
- 'F-stats': round(f_stat,digits),
- 'pvalue': pvalue})
-
-def RGBtoHex(vals, rgbtype=256):
- """Converts RGB values in a variety of formats to Hex values.
-
- @param vals An RGB/RGBA tuple
- @param rgbtype Valid values are:
- 1 - Inputs are in the range 0 to 1
- 256 - Inputs are in the range 0 to 255
-
- @return A hex string in the form '#RRGGBB' or '#RRGGBBAA'
-"""
-
- if len(vals)!=3 and len(vals)!=4:
- raise Exception("RGB or RGBA inputs to RGBtoHex must have three or four elements!")
- if rgbtype!=1 and rgbtype!=256:
- raise Exception("rgbtype must be 1 or 256!")
-
- # Convert from 0-1 RGB/RGBA to 0-255 RGB/RGBA
- if rgbtype==1:
- vals = [255*x for x in vals]
-
- # Ensure values are rounded integers, convert to hex, and concatenate
- return '#' + ''.join(['{:02X}'.format(int(round(x))) for x in vals])
-
-
-# https://stackoverflow.com/questions/34693991/repel-annotations-in-matplotlib
-
-def repel_labels(ax,x, y, labels, k=0.01):
- G = nx.DiGraph()
- data_nodes = []
- init_pos = {}
- for xi, yi, label in zip(x, y, labels):
- data_str = 'data_{0}'.format(label)
- G.add_node(data_str)
- G.add_node(label)
- G.add_edge(label, data_str)
- data_nodes.append(data_str)
- init_pos[data_str] = (xi, yi)
- init_pos[label] = (xi, yi)
-
- pos = nx.spring_layout(G, pos=init_pos, fixed=data_nodes, k=k)
-
- # undo spring_layout's rescaling, axis by axis
- pos_after = np.vstack([pos[d] for d in data_nodes])
- pos_before = np.vstack([init_pos[d] for d in data_nodes])
- scale_x, shift_x = np.polyfit(pos_after[:,0], pos_before[:,0], 1)
- scale_y, shift_y = np.polyfit(pos_after[:,1], pos_before[:,1], 1)
- scale = np.array([scale_x, scale_y])
- shift = np.array([shift_x, shift_y])
- for key, val in pos.items():
- pos[key] = (val*scale) + shift
-
- for label, data_str in G.edges():
- ax.annotate(label,
- xy=pos[data_str], xycoords='data',
- xytext=pos[label], textcoords='data',
- arrowprops=dict(arrowstyle="->",
- shrinkA=0, shrinkB=0,
- connectionstyle="arc3",
- color='red'), )
- # expand limits
- all_pos = np.vstack(list(pos.values()))
- x_span, y_span = np.ptp(all_pos, axis=0)
- mins = np.min(all_pos-x_span*0.15, 0)
- maxs = np.max(all_pos+y_span*0.15, 0)
- ax.set_xlim([mins[0], maxs[0]])
- ax.set_ylim([mins[1], maxs[1]])
-
-
-def from_dummies(
- data: DataFrame,
- sep: None | str = None,
- default_category: None | Hashable | dict[str, Hashable] = None,
-) -> DataFrame:
- """
- Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.
- Inverts the operation performed by :func:`~pandas.get_dummies`.
- .. versionadded:: 1.5.0
- Parameters
- ----------
- data : DataFrame
- Data which contains dummy-coded variables in form of integer columns of
- 1's and 0's.
- sep : str, default None
- Separator used in the column names of the dummy categories; it is the
- character separating the category names from the prefixes.
- For example, if your column names are 'prefix_A' and 'prefix_B',
- you can strip the underscore by specifying sep='_'.
- default_category : None, Hashable or dict of Hashables, default None
- The default category is the implied category when a value has none of the
- listed categories specified with a one, i.e. if all dummies in a row are
- zero. Can be a single value for all variables or a dict directly mapping
- the default categories to a prefix of a variable.
- Returns
- -------
- DataFrame
- Categorical data decoded from the dummy input-data.
- Raises
- ------
- ValueError
- * When the input ``DataFrame`` ``data`` contains NA values.
- * When the input ``DataFrame`` ``data`` contains column names with separators
- that do not match the separator specified with ``sep``.
- * When a ``dict`` passed to ``default_category`` does not include an implied - category for each prefix. - * When a value in ``data`` has more than one category assigned to it. - * When ``default_category=None`` and a value in ``data`` has no category - assigned to it. - TypeError - * When the input ``data`` is not of type ``DataFrame``. - * When the input ``DataFrame`` ``data`` contains non-dummy data. - * When the passed ``sep`` is of a wrong data type. - * When the passed ``default_category`` is of a wrong data type. - """ - from pandas.core.reshape.concat import concat - - if not isinstance(data, DataFrame): - raise TypeError( - "Expected 'data' to be a 'DataFrame'; " - f"Received 'data' of type: {type(data).__name__}" - ) - - if data.isna().any().any(): - raise ValueError( - "Dummy DataFrame contains NA value in column: " - f"'{data.isna().any().idxmax()}'" - ) - - # index data with a list of all columns that are dummies - try: - data_to_decode = data.astype("boolean", copy=False) - except TypeError: - raise TypeError("Passed DataFrame contains non-dummy data") - - # collect prefixes and get lists to slice data for each prefix - variables_slice = defaultdict(list) - if sep is None: - variables_slice[""] = list(data.columns) - elif isinstance(sep, str): - for col in data_to_decode.columns: - prefix = col.split(sep)[0] - if len(prefix) == len(col): - raise ValueError(f"Separator not specified for column: {col}") - variables_slice[prefix].append(col) - else: - raise TypeError( - "Expected 'sep' to be of type 'str' or 'None'; " - f"Received 'sep' of type: {type(sep).__name__}" - ) - - if default_category is not None: - if isinstance(default_category, dict): - if not len(default_category) == len(variables_slice): - len_msg = ( - f"Length of 'default_category' ({len(default_category)}) " - f"did not match the length of the columns being encoded " - f"({len(variables_slice)})" - ) - raise ValueError(len_msg) - elif isinstance(default_category, Hashable): - default_category = dict( - zip(variables_slice, [default_category] * len(variables_slice)) - ) - else: - raise TypeError( - "Expected 'default_category' to be of type " - "'None', 'Hashable', or 'dict'; " - "Received 'default_category' of type: " - f"{type(default_category).__name__}" - ) - - cat_data = {} - for prefix, prefix_slice in variables_slice.items(): - if sep is None: - cats = prefix_slice.copy() - else: - cats = [col[len(prefix + sep) :] for col in prefix_slice] - assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1) - if any(assigned > 1): - raise ValueError( - "Dummy DataFrame contains multi-assignment(s); " - f"First instance in row: {assigned.idxmax()}" - ) - if any(assigned == 0): - if isinstance(default_category, dict): - cats.append(default_category[prefix]) - else: - raise ValueError( - "Dummy DataFrame contains unassigned value(s); " - f"First instance in row: {assigned.idxmin()}" - ) - data_slice = concat( - (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1 - ) - else: - data_slice = data_to_decode.loc[:, prefix_slice] - cats_array = np.array(cats, dtype="object") - # get indices of True entries along axis=1 - cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]] - - return DataFrame(cat_data) - -def check_array_with_weights(X, weights, **kwargs): - """Utility to validate data and weights. - This calls check_array on X and weights, making sure results match. 
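-
- A hypothetical call : the zero weight marks the missing cell, so the
- corresponding NaN in X is zeroed out instead of raising::
-
- import numpy as np
- X = np.array([[1.0, np.nan], [2.0, 3.0]])
- W = np.array([[1.0, 0.0], [1.0, 1.0]])
- Xc, Wc = check_array_with_weights(X, W)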
- """ - if weights is None: - return check_array(X, **kwargs), weights - - # Always use copy=False for weights - kwargs_weights = dict(kwargs) - kwargs_weights.update(copy=False) - weights = check_array(weights, **kwargs_weights) - - # Always use force_all_finite=False for X - kwargs_X = dict(kwargs) - kwargs_X.update(force_all_finite=False) - X = check_array(X, **kwargs_X) - - # Make sure shapes match and missing data has weights=0 - if X.shape != weights.shape: - raise ValueError("Shape of `X` and `weights` should match") - - Wzero = (weights == 0) - X[Wzero] = 0 - - if not np.all(np.isfinite(X)): - raise ValueError("Input contains NaN or infinity without " - "a corresponding zero in `weights`.") - return X, weights - - -def orthonormalize(X, rows=True): - """Orthonormalize X using QR-decomposition - Parameters - ---------- - X : array-like, [N, M] - matrix to be orthonormalized - rows : boolean (default=True) - If True, orthonormalize rows of X. Otherwise orthonormalize columns. - Returns - ------- - Y : ndarray, [N, M] - Orthonormalized version of X - """ - orient = lambda X: X.T if rows else X - Q, R = np.linalg.qr(orient(X)) - return orient(Q) - - -def random_orthonormal(N, M, rows=True, random_state=None): - """Construct a random orthonormal matrix - Parameters - ---------- - N, M : integers - The size of the matrix to construct. - rows : boolean, default=True - If True, return matrix with orthonormal rows. - Otherwise return matrix with orthonormal columns. - random_state : int or None - Specify the random state used in construction of the matrix. - """ - assert N <= M if rows else N >= M - rand = np.random.RandomState(random_state) - return orthonormalize(rand.randn(N, M), rows=rows) - - -def solve_weighted(A, b, w): - """solve Ax = b with weights w - Parameters - ---------- - A : array-like [N, M] - b : array-like [N] - w : array-like [N] - Returns - ------- - x : ndarray, [M] - """ - A, b, w = map(np.asarray, (A, b, w)) - ATw2 = A.T * w ** 2 - return np.linalg.solve(np.dot(ATw2, A), - np.dot(ATw2, b)) - - -def weighted_mean(x, w=None, axis=None): - """Compute the weighted mean along the given axis - The result is equivalent to (x * w).sum(axis) / w.sum(axis), - but large temporary arrays are not created. - Parameters - ---------- - x : array_like - data for which mean is computed - w : array_like (optional) - weights corresponding to each data point. 
If supplied, it must be the
- same shape as x
- axis : int or None (optional)
- axis along which mean should be computed
- Returns
- -------
- mean : np.ndarray
- array representing the weighted mean along the given axis
- """
- if w is None:
- return np.mean(x, axis)
-
- x = np.asarray(x)
- w = np.asarray(w)
-
- if x.shape != w.shape:
- raise NotImplementedError("Broadcasting is not implemented: "
- "x and w must be the same shape.")
-
- if axis is None:
- wx_sum = np.einsum('i,i', np.ravel(x), np.ravel(w))
- else:
- try:
- axis = tuple(axis)
- except TypeError:
- axis = (axis,)
-
- if len(axis) != len(set(axis)):
- raise ValueError("duplicate value in 'axis'")
-
- trans = sorted(set(range(x.ndim)).difference(axis)) + list(axis)
- operand = "...{0},...{0}".format(''.join(chr(ord('i') + i)
- for i in range(len(axis))))
- wx_sum = np.einsum(operand,
- np.transpose(x, trans),
- np.transpose(w, trans))
-
- return wx_sum / np.sum(w, axis)
-
-# Bartlett's sphericity test
-def BartlettSphericityTest(x):
- test = calculate_bartlett_sphericity(x.values)
- return pd.DataFrame(test,index=["stats","pvalue"],columns=["Bartlett test"]).T
-
-# Global KMO index
-def global_kmo_index(x):
- # Correlation matrix
- corr = x.corr(method="pearson").values
- # Partial correlation matrix
- pcorr = x.pcorr().values
- # Global KMO index
- np.fill_diagonal(corr,0)
- np.fill_diagonal(pcorr,0)
- return np.sum(corr**2)/(np.sum(corr**2)+np.sum(pcorr**2))
-
-# Per-item KMO index
-def per_item_kmo_index(x):
- # Pearson correlation matrix
- corr = x.corr(method="pearson").values
- # Partial correlation matrix
- pcorr = x.pcorr().values
- # Per-item KMO index
- np.fill_diagonal(corr,0)
- np.fill_diagonal(pcorr,0)
- A = np.sum(corr**2, axis=0)
- B = np.sum(pcorr**2, axis=0)
- kmo_per_item = A /(A+B)
- return pd.DataFrame(kmo_per_item,index=x.columns,columns=["KMO"])
-
-
-def paste(*args, sep = " ", collapse = None):
- """
- Port of paste from R : combines the arguments element-wise, recycling the shorter ones
- Args:
- *args: sequences to be combined
- sep: a string to separate the terms
- collapse: an optional string to separate the results
- Returns:
- A list of combined results or a string of combined results if collapse is not None
- """
- l = [list(arg) if isinstance(arg, str) else arg if hasattr(arg, '__len__') else list(str(arg)) for arg in args]
- l = list(itertools.islice((sep.join(parts) for parts in zip(*(itertools.cycle(map(str, e)) for e in l))), (max((len(x) for x in l)))))
- if collapse is not None:
- l = collapse.join(l)
- return l
-paste0 = partial(paste, sep = '')
diff --git a/build/lib/scientisttools/version.py b/build/lib/scientisttools/version.py deleted file mode 100644 index def31c9..0000000 --- a/build/lib/scientisttools/version.py +++ /dev/null @@ -1,2 +0,0 @@ -# -*- coding: utf-8 -*- -__version__ = '0.0.5' \ No newline at end of file diff --git a/dist/scientisttools-0.0.5-py3-none-any.whl b/dist/scientisttools-0.0.5-py3-none-any.whl index 1a25c99..ddb7edc 100644 Binary files a/dist/scientisttools-0.0.5-py3-none-any.whl and b/dist/scientisttools-0.0.5-py3-none-any.whl differ diff --git a/dist/scientisttools-0.0.5.tar.gz b/dist/scientisttools-0.0.5.tar.gz index b01e766..77a3493 100644 Binary files a/dist/scientisttools-0.0.5.tar.gz and b/dist/scientisttools-0.0.5.tar.gz differ diff --git
a/dist/scientisttools-0.0.6-py3-none-any.whl b/dist/scientisttools-0.0.6-py3-none-any.whl new file mode 100644 index 0000000..54485c8 Binary files /dev/null and b/dist/scientisttools-0.0.6-py3-none-any.whl differ diff --git a/dist/scientisttools-0.0.6.tar.gz b/dist/scientisttools-0.0.6.tar.gz new file mode 100644 index 0000000..bfe0b69 Binary files /dev/null and b/dist/scientisttools-0.0.6.tar.gz differ diff --git a/scientisttools.egg-info/PKG-INFO b/scientisttools.egg-info/PKG-INFO index c936b19..1d98bcb 100644 --- a/scientisttools.egg-info/PKG-INFO +++ b/scientisttools.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: scientisttools -Version: 0.0.5 +Version: 0.0.6 Summary: Python library for multidimensional analysis Home-page: UNKNOWN Author: Duverier DJIFACK ZEBAZE diff --git a/scientisttools.egg-info/SOURCES.txt b/scientisttools.egg-info/SOURCES.txt index a96253e..628efc4 100644 --- a/scientisttools.egg-info/SOURCES.txt +++ b/scientisttools.egg-info/SOURCES.txt @@ -1,26 +1,44 @@ -Data_Methodes_Factorielles.xlsx LICENSE.txt README.md -Tennis_Players_AFDM.xlsx -ca_example.ipynb -ca_example2.ipynb -classic_mds.ipynb -efa_example.ipynb -famd_example.ipynb -famd_example2.ipynb -ggcorrplot.ipynb -mca_example.ipynb -mds_example.ipynb -partial_pca.ipynb -pca_example.ipynb setup.py -utils.ipynb +data/CongressVotePipeline.xlsx +data/Data_Illustration_Livre_ADL.xlsx +data/Data_Methodes_Factorielles.xlsx +data/Tennis_Players_AFDM.xlsx +data/tea.xlsx +data/usarrests.xlsx +data/wine_quality.xls dist/scientisttools-0.0.2-py3-none-any.whl dist/scientisttools-0.0.2.tar.gz dist/scientisttools-0.0.3-py3-none-any.whl dist/scientisttools-0.0.3.tar.gz dist/scientisttools-0.0.4-py3-none-any.whl dist/scientisttools-0.0.4.tar.gz +dist/scientisttools-0.0.5-py3-none-any.whl +dist/scientisttools-0.0.5.tar.gz +notebooks/ca_example.ipynb +notebooks/ca_example2.ipynb +notebooks/candisc_example.ipynb +notebooks/candisc_iris.ipynb +notebooks/candisc_sklearn.ipynb +notebooks/candisc_wine.ipynb +notebooks/classic_mds.ipynb +notebooks/disca_example.ipynb +notebooks/dismix_example.ipynb +notebooks/disqual_example.ipynb +notebooks/efa_example.ipynb +notebooks/famd_example.ipynb +notebooks/famd_example2.ipynb +notebooks/ggcorrplot.ipynb +notebooks/hcpc_mca_example.ipynb +notebooks/hcpc_pca_example.ipynb +notebooks/lda_example.ipynb +notebooks/lda_example2.ipynb +notebooks/mca_example.ipynb +notebooks/mds_example.ipynb +notebooks/partial_pca.ipynb +notebooks/pca_example.ipynb +notebooks/utils.ipynb scientisttools/__init__.py scientisttools/clustering.py scientisttools/datasets.py @@ -37,11 +55,4 @@ scientisttools.egg-info/PKG-INFO scientisttools.egg-info/SOURCES.txt scientisttools.egg-info/dependency_links.txt scientisttools.egg-info/requires.txt -scientisttools.egg-info/top_level.txt -scientisttools/__pycache__/__init__.cpython-310.pyc -scientisttools/__pycache__/decomposition.cpython-310.pyc -scientisttools/__pycache__/extractfactor.cpython-310.pyc -scientisttools/__pycache__/ggplot.cpython-310.pyc -scientisttools/__pycache__/manifold.cpython-310.pyc -scientisttools/__pycache__/pyplot.cpython-310.pyc -scientisttools/__pycache__/utils.cpython-310.pyc \ No newline at end of file +scientisttools.egg-info/top_level.txt \ No newline at end of file diff --git a/setup.py b/setup.py index 6d9efab..b9b58e1 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ # Setting up setuptools.setup( name="scientisttools", - version="0.0.5", + version="0.0.6", author="Duverier DJIFACK ZEBAZE", 
author_email="duverierdjifack@gmail.com", description="Python library for multidimensional analysis",
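
The packaging changes above amount to a version bump from 0.0.5 to 0.0.6. A quick way to confirm which build is installed (assuming scientisttools/version.py was bumped to match setup.py in this release):

    import scientisttools
    print(scientisttools.__version__)   # expected : '0.0.6'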