find and score modules

Patrick Cahan · Patrick Cahan · commit 648bc5f557dd · 2023-10-24T10:31:44.000-04:00
diff --git a/pySingleCellNet/__init__.py b/pySingleCellNet/__init__.py
@@ -5,3 +5,4 @@
 from .scn_assess import *
 from .plots import *
 from .postclass_analysis import *
+from .rank_class import *
diff --git a/pySingleCellNet/postclass_analysis.py b/pySingleCellNet/postclass_analysis.py
@@ -107,6 +107,29 @@ def combine_gsea_dfs(
     return gsea_comb, my_series
 
 
+def enrichR_on_gene_modules(
+    adata: anndata, geneset: dict, result_name: str, module_method = 'knn',
+    seed: int = 3, min_size: int = 10, max_size: int = 500, hvg = True
+) -> dict:
+    """Run gp.enrichr on every gene module stored in adata.uns.
+
+    adata: its uns[module_method + "_modules"] maps module name -> gene list.
+    geneset: dict of gene-set name -> genes; only sets with min_size..max_size
+        genes (inclusive, via filter_gene_list) are tested.
+    result_name: label for the data source(s), omitting cell types and
+        categories. It and seed are accepted but not used here.
+    hvg: if True the background is restricted to the genes flagged in
+        adata.var['highly_variable']; otherwise all of adata.var_names is used.
+    Returns a dict of module name -> whatever gp.enrichr returned for it.
+    """
+    geneset = filter_gene_list(geneset, min_size, max_size)
+    # The mask has to be read from the adata argument; no other AnnData is in scope.
+    bg_index = adata.var_names[adata.var['highly_variable']] if hvg else adata.var_names
+    bg_genes = bg_index.to_list()
+    genemodules = adata.uns[module_method + "_modules"]
+    return {gmod: gp.enrichr(gene_list=genelist, gene_sets=geneset, background=bg_genes, outdir=None)
+            for gmod, genelist in genemodules.items()}
+
 def gsea_on_diff_gene_dict(
     diff_gene_dict: dict,
     gene_set_name: str,
@@ -142,6 +165,43 @@ def gsea_on_diff_gene_dict(
 
     return ans
 
+
+def collect_enrichR_results_from_dict(enr_results: dict, adj_p_threshold = 1e-5):
+    """Build a pathway x signature table of enrichR odds ratios.
+
+    enr_results: dict of signature name -> result object whose .res2d is a
+        DataFrame with 'Term', 'Adjusted P-value' and 'Odds Ratio' columns.
+    adj_p_threshold: odds ratios of terms with a larger adjusted p-value are
+        set to 0.
+    Returns a DataFrame indexed by the union of all terms, one column per
+    signature; a term that a signature did not report is 0 (not NaN).
+    """
+    gene_signatures = list(enr_results.keys())
+    # Results differ in which terms they report, so collect the union first.
+    pathways = pd.Index([])
+    for signature in gene_signatures:
+        pathways = pathways.union(list(enr_results[signature].res2d['Term']))
+    enr_df = pd.DataFrame(0, columns=gene_signatures, index=pathways)
+    for signature in gene_signatures:
+        tmpRes = enr_results[signature].res2d.set_index('Term', drop=False)
+        odds = tmpRes['Odds Ratio'].mask(tmpRes['Adjusted P-value'] > adj_p_threshold, 0)
+        # Plain column assignment would leave NaN for terms absent from this result.
+        enr_df[signature] = odds.reindex(pathways, fill_value=0)
+    return enr_df.apply(pd.to_numeric, errors='coerce')
+
+def what_module_has_gene(adata, target_gene, module_method='knn') -> list:
+    """List the names of the gene modules that contain target_gene.
+
+    Modules are read from adata.uns[module_method + "_modules"]; a ValueError
+    is raised when that slot does not exist.
+    """
+    slot_name = module_method + "_modules"
+    if slot_name not in adata.uns:
+        raise ValueError(slot_name + " have not been identified.")
+    return [name for name, members in adata.uns[slot_name].items() if target_gene in members]
+
+        
+
 def collect_gsea_results_from_dict(
     gsea_dict: dict,
     fdr_thr = 0.25
diff --git a/pySingleCellNet/rank_class.py b/pySingleCellNet/rank_class.py
@@ -4,10 +4,12 @@
 import scanpy as sc
 import numpy as np
 from scipy.sparse import csr_matrix
-from scipy.sparse import is_sparse
+from scipy.sparse import issparse
 from sklearn.decomposition import FastICA
 import scanpy as sc
 import numpy as np
+import pandas as pd
+from .utils import *
 
 def rank_dense_submatrix(submatrix):
     # Operate on a dense submatrix to get the ranks
@@ -159,33 +161,46 @@ def findSigGenes(
         dictionary of cluster name : gene list
 
     """
-def find_gene_modules(
+def find_knn_modules(  # group genes into modules (leiden on a correlation kNN graph of the transposed data), store them in adata.uns['knn_modules'], then score them
     adata,
+    mean_cluster = True,  # if True, genes are clustered on per-cluster mean expression instead of per-cell values
+    dLevel = 'leiden',  # obs column with the cell cluster labels; only read when mean_cluster is True
     use_hvg = True,
-    knn = 10,
-    n_pcs = 50,
-    prefix='gmod_'
+    knn = 5,  # n_neighbors given to sc.pp.neighbors
+    leiden_resolution=0.5,  # forwarded to sc.tl.leiden
+    prefix='gmod_',  # prepended to each leiden label to form the module name
+    npcs_adjust = 1  # added to the elbow index to obtain n_pcs
 ):
-    adtemp = adata.copy()
+    adOps = adata.copy()  # work on a copy; adata itself is only touched when the modules are stored
     if use_hvg:
         # add test that hvg is set
         hvg_names = adata.var[adata.var['highly_variable']].index.tolist()
-        adtemp = adtemp[:,hvg_names].copy()
-    adata_T = adtemp.T
+        adOps = adOps[:,hvg_names].copy()
+    if mean_cluster:
+        adtemp = adOps.copy()
+        if dLevel not in adtemp.obs.columns:
+            raise ValueError(dLevel + " not in obs.")
+        compute_mean_expression_per_cluster(adtemp, dLevel)  # writes its result to adtemp.uns['mean_expression']
+        adOps = adtemp.uns['mean_expression'].copy()  # continue with the stored mean-expression object
+    adata_T = adOps.T  # transpose so that the genes become the observations being clustered
     sc.tl.pca(adata_T)
-    sc.pp.neighbors(adata_T, n_neighbors=knn, n_pcs=n_pcs)
-    # sc.tl.umap(adata_T)
-    sc.tl.leiden(adata_T)
+    elbow = find_elbow(adata_T)  # elbow of the PCA variance-ratio curve computed just above
+    n_pcs = elbow + npcs_adjust  # NOTE(review): elbow looks like a 0-based index, so the default adjust of 1 keeps elbow+1 PCs — confirm
+    sc.pp.neighbors(adata_T, n_neighbors=knn, n_pcs=n_pcs, metric='correlation')
+    sc.tl.leiden(adata_T, leiden_resolution)  # passed positionally — presumably scanpy's resolution argument; confirm against the installed version
     adf = adata_T.obs.copy()
-    clusters = adf.groupby('leiden').apply(lambda x: x.index.tolist()).to_dict()
+    clusters = adf.groupby('leiden', observed=True).apply(lambda x: x.index.tolist()).to_dict()  # leiden label -> list of obs names of adata_T
     clusters = {prefix + k: v for k, v in clusters.items()}
-    adata.uns['gene_modules'] = clusters
+    adata.uns['knn_modules'] = clusters
+    pySCN.score_gene_modules(adata, method='knn')  # NOTE(review): needs pySCN in scope; no direct import is visible in this hunk — confirm
 
 
 def score_gene_modules(
-    adata
+    adata,
+    method = 'knn'
 ):
-    gene_dict = adata.uns['gene_modules']
+    uns_name = method + "_modules"
+    gene_dict = adata.uns[uns_name]
     # Number of cells and clusters
     n_cells = adata.shape[0]
     # Initialize an empty matrix for scores
@@ -199,7 +214,8 @@ def score_gene_modules(
         scores_df[score_name] = adata.obs[score_name].values
         del(adata.obs[score_name])
     # Assign the scores DataFrame to adata.obsm
-    adata.obsm['module_scores'] = scores_df
+    obsm_name = method + "_module_scores"
+    adata.obsm[obsm_name] = scores_df
 
 
 def identify_ica_gene_modules(adata, k=10, max_iter=3):
@@ -231,7 +247,7 @@ def identify_ica_gene_modules(adata, k=10, max_iter=3):
     for i, component in enumerate(ica.components_):
         # Get the names of the genes in the current component/module
         gene_names = adata.var_names[adata.var['highly_variable']][component.argsort()[-10:][::-1]]  # Top 10 genes as an example
-        gene_modules[f"module_{i}"] = gene_names.tolist()
+        gene_modules[f"gmod_{i}"] = gene_names.tolist()
     # Store gene modules in adata.uns
     adata.uns['ica_modules'] = gene_modules
     #return adata
diff --git a/pySingleCellNet/utils.py b/pySingleCellNet/utils.py
@@ -6,6 +6,8 @@
 import mygene
 import anndata as ad
 import pySingleCellNet as pySCN
+from scipy.sparse import issparse
+
 
 def convert_ensembl_to_symbol(adata, species = 'mouse', batch_size=1000):
     mg = mygene.MyGeneInfo()
@@ -90,6 +92,25 @@ def read_gmt(file_path: str) -> dict:
             
     return gene_sets
 
+def filter_gene_list(genelist, min_genes, max_genes):
+    """
+    Select the gene sets whose size lies within [min_genes, max_genes].
+
+    Parameters:
+    - genelist : dict
+        Maps a gene-set identifier to its list of genes.
+    - min_genes, max_genes : int
+        Inclusive lower and upper bounds on the list length.
+    Returns:
+    - dict
+        New dictionary with the entries that satisfy the bounds, in input order.
+    """
+    kept = {}
+    for set_name, members in genelist.items():
+        if min_genes <= len(members) <= max_genes:
+            kept[set_name] = members
+    return kept
+
 
 def pull_out_genes(
     diff_genes_dict: dict, 
@@ -172,8 +193,6 @@ def read_broken_geo_mtx(path: str, prefix: str) -> AnnData:
     adata.var_names = adata.var['gene']
     return adata
 
-
-
 def mito_rib(adQ: AnnData, species: str = "MM", clean: bool = True) -> AnnData:
     """
     Calculate mitochondrial and ribosomal QC metrics and add them to the `.var` attribute of the AnnData object.
@@ -236,7 +255,8 @@ def norm_hvg_scale_pca(
     min_disp: float = 0.25,
     scale_max: float = 10,
     n_comps: int = 100,
-    gene_scale: bool = False
+    gene_scale: bool = False,
+    use_hvg: bool = True
 ) -> AnnData:
     """
     Normalize, detect highly variable genes, optionally scale, and perform PCA on an AnnData object.
@@ -287,7 +307,7 @@ def norm_hvg_scale_pca(
         sc.pp.scale(adata, max_value=scale_max)
 
     # Perform PCA on the data
-    sc.tl.pca(adata, n_comps=n_comps)
+    sc.tl.pca(adata, n_comps=n_comps, use_highly_variable=use_hvg)
 
     return adata
 
@@ -545,6 +565,85 @@ def sample_cells(
 
     return sampled_adata
 
+from scipy.sparse import issparse
+
+def compute_mean_expression_per_cluster(
+    adata,
+    cluster_key
+):
+    """
+    Compute mean gene expression for each gene in each cluster, create a new anndata object, and store it in adata.uns.
+
+    Parameters:
+    - adata : anndata.AnnData
+        The input AnnData object with labeled cell clusters. It is modified in place.
+    - cluster_key : str
+        The key in adata.obs where the cluster labels are stored.
+
+    Returns:
+    - None
+        The result is written to adata.uns['mean_expression'] as an AnnData with one
+        observation per cluster (in order of first appearance) and the genes of adata as variables.
+
+    Raises:
+    - ValueError
+        If cluster_key is not a column of adata.obs.
+    """
+    if cluster_key not in adata.obs.columns:
+        raise ValueError(f"{cluster_key} not found in adata.obs")
+
+    # Extract unique cluster labels
+    labels = adata.obs[cluster_key]
+    clusters = labels.unique().tolist()
+
+    # Compute mean expression for each cluster
+    mean_expressions = []
+    for cluster in clusters:
+        cluster_X = adata[labels == cluster, :].X
+        # The column mean is 1-D for dense input but a 1 x n_genes np.matrix for scipy
+        # sparse matrices; flattening through np.asarray covers both, and sparse arrays too.
+        mean_expressions.append(np.asarray(np.mean(cluster_X, axis=0)).ravel())
+
+    # One row per cluster, one column per gene
+    adata.uns['mean_expression'] = sc.AnnData(X=np.vstack(mean_expressions),
+                                              var=pd.DataFrame(index=adata.var_names),
+                                              obs=pd.DataFrame(index=clusters))
+
+
+def find_elbow(
+    adata
+):
+    """
+    Find the "elbow" index in the variance explained by principal components.
+
+    Parameters:
+    - adata : anndata.AnnData
+        Object whose uns['pca']['variance_ratio'] holds the variance explained by
+        each principal component, typically in decreasing order.
+
+    Returns:
+    - int
+        Zero-based index of the component farthest from the straight line joining the
+        first and last points of the curve; 0 if there are fewer than three components.
+    """
+    variance_explained = adata.uns['pca']['variance_ratio']
+    n_points = len(variance_explained)
+    if n_points < 3:
+        # No interior point exists, and a single point would give a zero-length line
+        return 0
+    all_coords = np.column_stack((np.arange(n_points), variance_explained))
+    # Unit vector along the line from the first to the last point
+    line_vec = all_coords[-1] - all_coords[0]
+    line_vec_norm = line_vec / np.sqrt(np.sum(line_vec**2))
+    # Subtracting each point's projection onto the line leaves its perpendicular offset
+    vec_from_first = all_coords - all_coords[0]
+    scalar_prod = np.sum(vec_from_first * line_vec_norm, axis=1)
+    vec_to_line = vec_from_first - np.outer(scalar_prod, line_vec_norm)
+    # Index of the point with max distance to the line
+    return int(np.argmax(np.sqrt(np.sum(vec_to_line ** 2, axis=1))))
+
+
+
 
 def ctMerge(sampTab, annCol, ctVect, newName):
     oldRows=np.isin(sampTab[annCol], ctVect)
@@ -652,7 +751,6 @@ def downSampleW(vector,total=1e5, dThresh=0):
     res[res<dThresh]=0
     return res
 
-
 def weighted_down(expDat, total, dThresh=0):
     rSums=expDat.sum(axis=1)
     dVector=np.divide(total, rSums)