Skip to content

Commit 648bc5f

Browse files
Patrick CahanPatrick Cahan
authored andcommitted
find and score modules
1 parent a9a5a12 commit 648bc5f

File tree

4 files changed

+197
-22
lines changed

4 files changed

+197
-22
lines changed

pySingleCellNet/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
from .scn_assess import *
66
from .plots import *
77
from .postclass_analysis import *
8+
from .rank_class import *

pySingleCellNet/postclass_analysis.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,29 @@ def combine_gsea_dfs(
107107
return gsea_comb, my_series
108108

109109

110+
def enrichR_on_gene_modules(
111+
adata: anndata,
112+
geneset: dict,
113+
result_name: str, # this should indicate the data source(s), but omit cell types and categories
114+
module_method = 'knn',
115+
seed: int = 3,
116+
min_size: int = 10,
117+
max_size: int = 500,
118+
hvg = True
119+
) -> dict:
120+
# trim geneset
121+
geneset = filter_gene_list(geneset, min_size, max_size)
122+
ans = dict()
123+
bg_genes = adata.var_names.to_list()
124+
if hvg:
125+
bg_genes = adata.var_names[adNorm.var['highly_variable']].to_list()
126+
modname = module_method + "_modules"
127+
genemodules = adata.uns[modname].copy()
128+
for gmod, genelist in genemodules.items():
129+
tmp_enr = gp.enrichr(gene_list=genelist, gene_sets=geneset, background=bg_genes, outdir=None)
130+
ans[gmod] = tmp_enr
131+
return ans
132+
110133
def gsea_on_diff_gene_dict(
111134
diff_gene_dict: dict,
112135
gene_set_name: str,
@@ -142,6 +165,43 @@ def gsea_on_diff_gene_dict(
142165

143166
return ans
144167

168+
169+
def collect_enrichR_results_from_dict(
170+
enr_results: dict,
171+
adj_p_threshold = 1e-5
172+
):
173+
# Initialize set of pathways. The order of these in prerank results and their composition will differ
174+
# so we need to get the union first
175+
pathways = pd.Index([])
176+
gene_signatures= list(enr_results.keys())
177+
for signature in gene_signatures:
178+
tmpRes = enr_results[signature].res2d.copy()
179+
gene_set_names = list(tmpRes['Term'])
180+
pathways = pathways.union(gene_set_names)
181+
# initialize an empty results data.frame
182+
enr_df = pd.DataFrame(0, columns = gene_signatures, index=pathways)
183+
for signature in gene_signatures:
184+
tmpRes = enr_results[signature].res2d.copy()
185+
tmpRes.index = tmpRes['Term']
186+
tmpRes.loc[lambda df: df['Adjusted P-value'] > adj_p_threshold, "Odds Ratio"] = 0
187+
# nes_df.loc[ct_df.index,cell_type] = ct_df.loc[:,"NES"]
188+
enr_df[signature] = tmpRes["Odds Ratio"]
189+
enr_df = enr_df.apply(pd.to_numeric, errors='coerce')
190+
return enr_df
191+
192+
def what_module_has_gene(
193+
adata,
194+
target_gene,
195+
module_method='knn'
196+
) -> list:
197+
mod_slot = module_method + "_modules"
198+
if mod_slot not in adata.uns.keys():
199+
raise ValueError(mod_slot + " have not been identified.")
200+
genemodules = adata.uns[mod_slot]
201+
return [key for key, genes in genemodules.items() if target_gene in genes]
202+
203+
204+
145205
def collect_gsea_results_from_dict(
146206
gsea_dict: dict,
147207
fdr_thr = 0.25

pySingleCellNet/rank_class.py

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
import scanpy as sc
55
import numpy as np
66
from scipy.sparse import csr_matrix
7-
from scipy.sparse import is_sparse
7+
from scipy.sparse import issparse
88
from sklearn.decomposition import FastICA
99
import scanpy as sc
1010
import numpy as np
11+
import pandas as pd
12+
from .utils import *
1113

1214
def rank_dense_submatrix(submatrix):
1315
# Operate on a dense submatrix to get the ranks
@@ -159,33 +161,46 @@ def findSigGenes(
159161
dictionary of cluster name : gene list
160162
161163
"""
162-
def find_gene_modules(
164+
def find_knn_modules(
163165
adata,
166+
mean_cluster = True,
167+
dLevel = 'leiden',
164168
use_hvg = True,
165-
knn = 10,
166-
n_pcs = 50,
167-
prefix='gmod_'
169+
knn = 5,
170+
leiden_resolution=0.5,
171+
prefix='gmod_',
172+
npcs_adjust = 1
168173
):
169-
adtemp = adata.copy()
174+
adOps = adata.copy()
170175
if use_hvg:
171176
# add test that hvg is set
172177
hvg_names = adata.var[adata.var['highly_variable']].index.tolist()
173-
adtemp = adtemp[:,hvg_names].copy()
174-
adata_T = adtemp.T
178+
adOps = adOps[:,hvg_names].copy()
179+
if mean_cluster:
180+
adtemp = adOps.copy()
181+
if dLevel not in adtemp.obs.columns:
182+
raise ValueError(dLevel + " not in obs.")
183+
compute_mean_expression_per_cluster(adtemp, dLevel)
184+
adOps = adtemp.uns['mean_expression'].copy()
185+
adata_T = adOps.T
175186
sc.tl.pca(adata_T)
176-
sc.pp.neighbors(adata_T, n_neighbors=knn, n_pcs=n_pcs)
177-
# sc.tl.umap(adata_T)
178-
sc.tl.leiden(adata_T)
187+
elbow = find_elbow(adata_T)
188+
n_pcs = elbow + npcs_adjust
189+
sc.pp.neighbors(adata_T, n_neighbors=knn, n_pcs=n_pcs, metric='correlation')
190+
sc.tl.leiden(adata_T, leiden_resolution)
179191
adf = adata_T.obs.copy()
180-
clusters = adf.groupby('leiden').apply(lambda x: x.index.tolist()).to_dict()
192+
clusters = adf.groupby('leiden', observed=True).apply(lambda x: x.index.tolist()).to_dict()
181193
clusters = {prefix + k: v for k, v in clusters.items()}
182-
adata.uns['gene_modules'] = clusters
194+
adata.uns['knn_modules'] = clusters
195+
pySCN.score_gene_modules(adata, method='knn')
183196

184197

185198
def score_gene_modules(
186-
adata
199+
adata,
200+
method = 'knn'
187201
):
188-
gene_dict = adata.uns['gene_modules']
202+
uns_name = method + "_modules"
203+
gene_dict = adata.uns[uns_name]
189204
# Number of cells and clusters
190205
n_cells = adata.shape[0]
191206
# Initialize an empty matrix for scores
@@ -199,7 +214,8 @@ def score_gene_modules(
199214
scores_df[score_name] = adata.obs[score_name].values
200215
del(adata.obs[score_name])
201216
# Assign the scores DataFrame to adata.obsm
202-
adata.obsm['module_scores'] = scores_df
217+
obsm_name = method + "_module_scores"
218+
adata.obsm[obsm_name] = scores_df
203219

204220

205221
def identify_ica_gene_modules(adata, k=10, max_iter=3):
@@ -231,7 +247,7 @@ def identify_ica_gene_modules(adata, k=10, max_iter=3):
231247
for i, component in enumerate(ica.components_):
232248
# Get the names of the genes in the current component/module
233249
gene_names = adata.var_names[adata.var['highly_variable']][component.argsort()[-10:][::-1]] # Top 10 genes as an example
234-
gene_modules[f"module_{i}"] = gene_names.tolist()
250+
gene_modules[f"gmod_{i}"] = gene_names.tolist()
235251
# Store gene modules in adata.uns
236252
adata.uns['ica_modules'] = gene_modules
237253
#return adata

pySingleCellNet/utils.py

Lines changed: 103 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import mygene
77
import anndata as ad
88
import pySingleCellNet as pySCN
9+
from scipy.sparse import issparse
10+
911

1012
def convert_ensembl_to_symbol(adata, species = 'mouse', batch_size=1000):
1113
mg = mygene.MyGeneInfo()
@@ -90,6 +92,25 @@ def read_gmt(file_path: str) -> dict:
9092

9193
return gene_sets
9294

95+
def filter_gene_list(genelist, min_genes, max_genes):
96+
"""
97+
Filter the gene lists in the provided dictionary based on their lengths.
98+
99+
Parameters:
100+
- genelist : dict
101+
Dictionary with keys as identifiers and values as lists of genes.
102+
- min_genes : int
103+
Minimum number of genes a list should have.
104+
- max_genes : int
105+
Maximum number of genes a list should have.
106+
107+
Returns:
108+
- dict
109+
Filtered dictionary with lists that have a length between min_genes and max_genes (inclusive of min_genes and max_genes).
110+
"""
111+
filtered_dict = {key: value for key, value in genelist.items() if min_genes <= len(value) <= max_genes}
112+
return filtered_dict
113+
93114

94115
def pull_out_genes(
95116
diff_genes_dict: dict,
@@ -172,8 +193,6 @@ def read_broken_geo_mtx(path: str, prefix: str) -> AnnData:
172193
adata.var_names = adata.var['gene']
173194
return adata
174195

175-
176-
177196
def mito_rib(adQ: AnnData, species: str = "MM", clean: bool = True) -> AnnData:
178197
"""
179198
Calculate mitochondrial and ribosomal QC metrics and add them to the `.var` attribute of the AnnData object.
@@ -236,7 +255,8 @@ def norm_hvg_scale_pca(
236255
min_disp: float = 0.25,
237256
scale_max: float = 10,
238257
n_comps: int = 100,
239-
gene_scale: bool = False
258+
gene_scale: bool = False,
259+
use_hvg: bool = True
240260
) -> AnnData:
241261
"""
242262
Normalize, detect highly variable genes, optionally scale, and perform PCA on an AnnData object.
@@ -287,7 +307,7 @@ def norm_hvg_scale_pca(
287307
sc.pp.scale(adata, max_value=scale_max)
288308

289309
# Perform PCA on the data
290-
sc.tl.pca(adata, n_comps=n_comps)
310+
sc.tl.pca(adata, n_comps=n_comps, use_highly_variable=use_hvg)
291311

292312
return adata
293313

@@ -545,6 +565,85 @@ def sample_cells(
545565

546566
return sampled_adata
547567

568+
from scipy.sparse import issparse
569+
570+
def compute_mean_expression_per_cluster(
571+
adata,
572+
cluster_key
573+
):
574+
"""
575+
Compute mean gene expression for each gene in each cluster, create a new anndata object, and store it in adata.uns.
576+
577+
Parameters:
578+
- adata : anndata.AnnData
579+
The input AnnData object with labeled cell clusters.
580+
- cluster_key : str
581+
The key in adata.obs where the cluster labels are stored.
582+
583+
Returns:
584+
- anndata.AnnData
585+
The modified AnnData object with the mean expression anndata stored in uns['mean_expression'].
586+
"""
587+
if cluster_key not in adata.obs.columns:
588+
raise ValueError(f"{cluster_key} not found in adata.obs")
589+
590+
# Extract unique cluster labels
591+
clusters = adata.obs[cluster_key].unique().tolist()
592+
593+
# Compute mean expression for each cluster
594+
mean_expressions = []
595+
for cluster in clusters:
596+
cluster_cells = adata[adata.obs[cluster_key] == cluster, :]
597+
mean_expression = np.mean(cluster_cells.X, axis=0).A1 if issparse(cluster_cells.X) else np.mean(cluster_cells.X, axis=0)
598+
mean_expressions.append(mean_expression)
599+
600+
# Convert to matrix
601+
mean_expression_matrix = np.vstack(mean_expressions)
602+
603+
# Create a new anndata object
604+
mean_expression_adata = sc.AnnData(X=mean_expression_matrix,
605+
var=pd.DataFrame(index=adata.var_names),
606+
obs=pd.DataFrame(index=clusters))
607+
608+
# Store this new anndata object in adata.uns
609+
adata.uns['mean_expression'] = mean_expression_adata
610+
#return adata
611+
612+
613+
def find_elbow(
614+
adata
615+
):
616+
"""
617+
Find the "elbow" index in the variance explained by principal components.
618+
619+
Parameters:
620+
- variance_explained : list or array
621+
Variance explained by each principal component, typically in decreasing order.
622+
623+
Returns:
624+
- int
625+
The index corresponding to the "elbow" in the variance explained plot.
626+
"""
627+
variance_explained = adata.uns['pca']['variance_ratio']
628+
# Coordinates of all points
629+
n_points = len(variance_explained)
630+
all_coords = np.vstack((range(n_points), variance_explained)).T
631+
# Line vector from first to last point
632+
line_vec = all_coords[-1] - all_coords[0]
633+
line_vec_norm = line_vec / np.sqrt(np.sum(line_vec**2))
634+
# Vector being orthogonal to the line
635+
vec_from_first = all_coords - all_coords[0]
636+
scalar_prod = np.sum(vec_from_first * np.tile(line_vec_norm, (n_points, 1)), axis=1)
637+
vec_from_first_parallel = np.outer(scalar_prod, line_vec_norm)
638+
vec_to_line = vec_from_first - vec_from_first_parallel
639+
# Distance to the line
640+
dist_to_line = np.sqrt(np.sum(vec_to_line ** 2, axis=1))
641+
# Index of the point with max distance to the line
642+
elbow_idx = np.argmax(dist_to_line)
643+
return elbow_idx
644+
645+
646+
548647

549648
def ctMerge(sampTab, annCol, ctVect, newName):
550649
oldRows=np.isin(sampTab[annCol], ctVect)
@@ -652,7 +751,6 @@ def downSampleW(vector,total=1e5, dThresh=0):
652751
res[res<dThresh]=0
653752
return res
654753

655-
656754
def weighted_down(expDat, total, dThresh=0):
657755
rSums=expDat.sum(axis=1)
658756
dVector=np.divide(total, rSums)

0 commit comments

Comments
 (0)