-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add blacklist testings for 10xpbmc and sciATACseq
- Loading branch information
1 parent
d2fa794
commit 325e0f6
Showing
8 changed files
with
7,069 additions
and
0 deletions.
There are no files selected for viewing
823 changes: 823 additions & 0 deletions
823
Extra/10x_PBMC_5k/test_blacklist/SCRAT_10xpbmc_5k_no_blacklist_filtering.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
1,531 changes: 1,531 additions & 0 deletions
1,531
Extra/10x_PBMC_5k/test_blacklist/SnapATAC_10xpbmc_5k_no_blacklist_filtering.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
815 changes: 815 additions & 0 deletions
815
Extra/10x_PBMC_5k/test_blacklist/run_clustering_10xpbmc5k_no_blacklist_filtering.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
638 changes: 638 additions & 0 deletions
638
Extra/10x_PBMC_5k/test_blacklist/run_umap_10xpbmc5k_no_blacklist_filtering.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
586 changes: 586 additions & 0 deletions
586
...ovich_2018_subset/test_blacklist/SCRAT_cusanovich2018_subset_no_blacklist_filtering.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
1,518 changes: 1,518 additions & 0 deletions
1,518
...ch_2018_subset/test_blacklist/SnapATAC_cusanovich2018_subset_no_blacklist_filtering.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
394 changes: 394 additions & 0 deletions
394
...18_subset/test_blacklist/run_clustering_cusanovich2018subset_no_blacklist_filtering.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,394 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"import scanpy as sc\n", | ||
"import os\n", | ||
"from sklearn.cluster import KMeans\n", | ||
"from sklearn.cluster import AgglomerativeClustering\n", | ||
"from sklearn.metrics.cluster import adjusted_rand_score\n", | ||
"from sklearn.metrics.cluster import adjusted_mutual_info_score\n", | ||
"from sklearn.metrics.cluster import homogeneity_score\n", | ||
"import rpy2.robjects as robjects\n", | ||
"from rpy2.robjects import pandas2ri" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df_metrics = pd.DataFrame(columns=['ARI_Louvain','ARI_kmeans','ARI_HC',\n", | ||
" 'AMI_Louvain','AMI_kmeans','AMI_HC',\n", | ||
" 'Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC'])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"0" | ||
] | ||
}, | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"workdir = '../output/'\n", | ||
"path_fm = os.path.join(workdir,'feature_matrices/')\n", | ||
"path_clusters = os.path.join(workdir,'clusters/')\n", | ||
"path_metrics = os.path.join(workdir,'metrics/')\n", | ||
"os.system('mkdir -p '+path_clusters)\n", | ||
"os.system('mkdir -p '+path_metrics)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"13\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"metadata = pd.read_csv('../input/metadata.tsv',sep='\\t',index_col=0)\n", | ||
"num_clusters = len(np.unique(metadata['label']))\n", | ||
"print(num_clusters)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"2" | ||
] | ||
}, | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"files = [x for x in os.listdir(path_fm) if x.startswith('FM')]\n", | ||
"len(files)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"['FM_SnapATAC_cusanovich2018_subset_no_blacklist_filtering.rds',\n", | ||
" 'FM_SCRAT_cusanovich2018subset_motifs_no_blacklist_filtering.rds']" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"files" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):\n", | ||
" this_step = 0\n", | ||
" this_min = float(range_min)\n", | ||
" this_max = float(range_max)\n", | ||
" while this_step < max_steps:\n", | ||
" print('step ' + str(this_step))\n", | ||
" this_resolution = this_min + ((this_max-this_min)/2)\n", | ||
" sc.tl.louvain(adata,resolution=this_resolution)\n", | ||
" this_clusters = adata.obs['louvain'].nunique()\n", | ||
" \n", | ||
" print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))\n", | ||
" \n", | ||
" if this_clusters > n_cluster:\n", | ||
" this_max = this_resolution\n", | ||
" elif this_clusters < n_cluster:\n", | ||
" this_min = this_resolution\n", | ||
" else:\n", | ||
" return(this_resolution, adata)\n", | ||
" this_step += 1\n", | ||
" \n", | ||
" print('Cannot find the number of clusters')\n", | ||
" print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": { | ||
"scrolled": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"SnapATAC_subset_no_blacklist_filtering\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", | ||
" res = PandasDataFrame.from_items(items)\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"step 0\n", | ||
"got 26 at resolution 1.5\n", | ||
"step 1\n", | ||
"got 20 at resolution 0.75\n", | ||
"step 2\n", | ||
"got 18 at resolution 0.375\n", | ||
"step 3\n", | ||
"got 14 at resolution 0.1875\n", | ||
"step 4\n", | ||
"got 12 at resolution 0.09375\n", | ||
"step 5\n", | ||
"got 14 at resolution 0.140625\n", | ||
"step 6\n", | ||
"got 13 at resolution 0.1171875\n", | ||
"SCRAT_motifs_no_blacklist_filtering\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n", | ||
" res = PandasDataFrame.from_items(items)\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"step 0\n", | ||
"got 26 at resolution 1.5\n", | ||
"step 1\n", | ||
"got 12 at resolution 0.75\n", | ||
"step 2\n", | ||
"got 19 at resolution 1.125\n", | ||
"step 3\n", | ||
"got 17 at resolution 0.9375\n", | ||
"step 4\n", | ||
"got 15 at resolution 0.84375\n", | ||
"step 5\n", | ||
"got 13 at resolution 0.796875\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"for file in files:\n", | ||
" file_split = file.split('_')\n", | ||
" method = file_split[1]\n", | ||
" dataset = file_split[2].split('.')[0]\n", | ||
" if(len(file_split)>3):\n", | ||
" method = method + '_' + '_'.join(file_split[3:]).split('.')[0]\n", | ||
" print(method)\n", | ||
"\n", | ||
" pandas2ri.activate()\n", | ||
" readRDS = robjects.r['readRDS']\n", | ||
" df_rds = readRDS(os.path.join(path_fm,file))\n", | ||
" fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))\n", | ||
" fm_mat.fillna(0,inplace=True)\n", | ||
" fm_mat.columns = metadata.index\n", | ||
" \n", | ||
" adata = sc.AnnData(fm_mat.T)\n", | ||
" adata.var_names_make_unique()\n", | ||
" adata.obs = metadata.loc[adata.obs.index,]\n", | ||
" df_metrics.loc[method,] = \"\"\n", | ||
" #Louvain\n", | ||
" sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')\n", | ||
"# sc.tl.louvain(adata)\n", | ||
" getNClusters(adata,n_cluster=num_clusters)\n", | ||
" #kmeans\n", | ||
" kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)\n", | ||
" adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')\n", | ||
" #hierachical clustering\n", | ||
" hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)\n", | ||
" adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')\n", | ||
" #clustering metrics\n", | ||
" \n", | ||
" #adjusted rank index\n", | ||
" ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])\n", | ||
" ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])\n", | ||
" ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])\n", | ||
" #adjusted mutual information\n", | ||
" ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')\n", | ||
" ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic') \n", | ||
" ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')\n", | ||
" #homogeneity\n", | ||
" homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])\n", | ||
" homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])\n", | ||
" homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])\n", | ||
"\n", | ||
" df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]\n", | ||
" df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]\n", | ||
" df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc] \n", | ||
" adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\\t')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df_metrics.to_csv(path_metrics+'clustering_scores_no_bl_filt.csv')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>ARI_Louvain</th>\n", | ||
" <th>ARI_kmeans</th>\n", | ||
" <th>ARI_HC</th>\n", | ||
" <th>AMI_Louvain</th>\n", | ||
" <th>AMI_kmeans</th>\n", | ||
" <th>AMI_HC</th>\n", | ||
" <th>Homogeneity_Louvain</th>\n", | ||
" <th>Homogeneity_kmeans</th>\n", | ||
" <th>Homogeneity_HC</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>SnapATAC_subset_no_blacklist_filtering</th>\n", | ||
" <td>0.309287</td>\n", | ||
" <td>0.310815</td>\n", | ||
" <td>0.26289</td>\n", | ||
" <td>0.582052</td>\n", | ||
" <td>0.58204</td>\n", | ||
" <td>0.569305</td>\n", | ||
" <td>0.548089</td>\n", | ||
" <td>0.546123</td>\n", | ||
" <td>0.530073</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>SCRAT_motifs_no_blacklist_filtering</th>\n", | ||
" <td>0.155567</td>\n", | ||
" <td>0.0763765</td>\n", | ||
" <td>0.0721623</td>\n", | ||
" <td>0.293087</td>\n", | ||
" <td>0.174156</td>\n", | ||
" <td>0.169365</td>\n", | ||
" <td>0.297476</td>\n", | ||
" <td>0.176039</td>\n", | ||
" <td>0.170353</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" ARI_Louvain ARI_kmeans ARI_HC \\\n", | ||
"SnapATAC_subset_no_blacklist_filtering 0.309287 0.310815 0.26289 \n", | ||
"SCRAT_motifs_no_blacklist_filtering 0.155567 0.0763765 0.0721623 \n", | ||
"\n", | ||
" AMI_Louvain AMI_kmeans AMI_HC \\\n", | ||
"SnapATAC_subset_no_blacklist_filtering 0.582052 0.58204 0.569305 \n", | ||
"SCRAT_motifs_no_blacklist_filtering 0.293087 0.174156 0.169365 \n", | ||
"\n", | ||
" Homogeneity_Louvain Homogeneity_kmeans \\\n", | ||
"SnapATAC_subset_no_blacklist_filtering 0.548089 0.546123 \n", | ||
"SCRAT_motifs_no_blacklist_filtering 0.297476 0.176039 \n", | ||
"\n", | ||
" Homogeneity_HC \n", | ||
"SnapATAC_subset_no_blacklist_filtering 0.530073 \n", | ||
"SCRAT_motifs_no_blacklist_filtering 0.170353 " | ||
] | ||
}, | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df_metrics" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python [conda env:ATACseq_clustering]", | ||
"language": "python", | ||
"name": "conda-env-ATACseq_clustering-py" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.