Skip to content

Commit

Permalink
add blacklist testings for 10xpbmc and sciATACseq
Browse files Browse the repository at this point in the history
  • Loading branch information
huidongchen committed Nov 18, 2019
1 parent d2fa794 commit 325e0f6
Show file tree
Hide file tree
Showing 8 changed files with 7,069 additions and 0 deletions.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,394 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import scanpy as sc\n",
"import os\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.cluster import AgglomerativeClustering\n",
"from sklearn.metrics.cluster import adjusted_rand_score\n",
"from sklearn.metrics.cluster import adjusted_mutual_info_score\n",
"from sklearn.metrics.cluster import homogeneity_score\n",
"import rpy2.robjects as robjects\n",
"from rpy2.robjects import pandas2ri"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df_metrics = pd.DataFrame(columns=['ARI_Louvain','ARI_kmeans','ARI_HC',\n",
" 'AMI_Louvain','AMI_kmeans','AMI_HC',\n",
" 'Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"workdir = '../output/'\n",
"path_fm = os.path.join(workdir,'feature_matrices/')\n",
"path_clusters = os.path.join(workdir,'clusters/')\n",
"path_metrics = os.path.join(workdir,'metrics/')\n",
"os.system('mkdir -p '+path_clusters)\n",
"os.system('mkdir -p '+path_metrics)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"13\n"
]
}
],
"source": [
"metadata = pd.read_csv('../input/metadata.tsv',sep='\\t',index_col=0)\n",
"num_clusters = len(np.unique(metadata['label']))\n",
"print(num_clusters)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files = [x for x in os.listdir(path_fm) if x.startswith('FM')]\n",
"len(files)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['FM_SnapATAC_cusanovich2018_subset_no_blacklist_filtering.rds',\n",
" 'FM_SCRAT_cusanovich2018subset_motifs_no_blacklist_filtering.rds']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):\n",
" this_step = 0\n",
" this_min = float(range_min)\n",
" this_max = float(range_max)\n",
" while this_step < max_steps:\n",
" print('step ' + str(this_step))\n",
" this_resolution = this_min + ((this_max-this_min)/2)\n",
" sc.tl.louvain(adata,resolution=this_resolution)\n",
" this_clusters = adata.obs['louvain'].nunique()\n",
" \n",
" print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))\n",
" \n",
" if this_clusters > n_cluster:\n",
" this_max = this_resolution\n",
" elif this_clusters < n_cluster:\n",
" this_min = this_resolution\n",
" else:\n",
" return(this_resolution, adata)\n",
" this_step += 1\n",
" \n",
" print('Cannot find the number of clusters')\n",
" print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SnapATAC_subset_no_blacklist_filtering\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
" res = PandasDataFrame.from_items(items)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"step 0\n",
"got 26 at resolution 1.5\n",
"step 1\n",
"got 20 at resolution 0.75\n",
"step 2\n",
"got 18 at resolution 0.375\n",
"step 3\n",
"got 14 at resolution 0.1875\n",
"step 4\n",
"got 12 at resolution 0.09375\n",
"step 5\n",
"got 14 at resolution 0.140625\n",
"step 6\n",
"got 13 at resolution 0.1171875\n",
"SCRAT_motifs_no_blacklist_filtering\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
" res = PandasDataFrame.from_items(items)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"step 0\n",
"got 26 at resolution 1.5\n",
"step 1\n",
"got 12 at resolution 0.75\n",
"step 2\n",
"got 19 at resolution 1.125\n",
"step 3\n",
"got 17 at resolution 0.9375\n",
"step 4\n",
"got 15 at resolution 0.84375\n",
"step 5\n",
"got 13 at resolution 0.796875\n"
]
}
],
"source": [
"for file in files:\n",
" file_split = file.split('_')\n",
" method = file_split[1]\n",
" dataset = file_split[2].split('.')[0]\n",
" if(len(file_split)>3):\n",
" method = method + '_' + '_'.join(file_split[3:]).split('.')[0]\n",
" print(method)\n",
"\n",
" pandas2ri.activate()\n",
" readRDS = robjects.r['readRDS']\n",
" df_rds = readRDS(os.path.join(path_fm,file))\n",
" fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))\n",
" fm_mat.fillna(0,inplace=True)\n",
" fm_mat.columns = metadata.index\n",
" \n",
" adata = sc.AnnData(fm_mat.T)\n",
" adata.var_names_make_unique()\n",
" adata.obs = metadata.loc[adata.obs.index,]\n",
" df_metrics.loc[method,] = \"\"\n",
" #Louvain\n",
" sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')\n",
"# sc.tl.louvain(adata)\n",
" getNClusters(adata,n_cluster=num_clusters)\n",
" #kmeans\n",
" kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)\n",
" adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')\n",
" #hierachical clustering\n",
" hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)\n",
" adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')\n",
" #clustering metrics\n",
" \n",
" #adjusted rank index\n",
" ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])\n",
" ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])\n",
" ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])\n",
" #adjusted mutual information\n",
" ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')\n",
" ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic') \n",
" ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')\n",
" #homogeneity\n",
" homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])\n",
" homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])\n",
" homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])\n",
"\n",
" df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]\n",
" df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]\n",
" df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc] \n",
" adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df_metrics.to_csv(path_metrics+'clustering_scores_no_bl_filt.csv')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ARI_Louvain</th>\n",
" <th>ARI_kmeans</th>\n",
" <th>ARI_HC</th>\n",
" <th>AMI_Louvain</th>\n",
" <th>AMI_kmeans</th>\n",
" <th>AMI_HC</th>\n",
" <th>Homogeneity_Louvain</th>\n",
" <th>Homogeneity_kmeans</th>\n",
" <th>Homogeneity_HC</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>SnapATAC_subset_no_blacklist_filtering</th>\n",
" <td>0.309287</td>\n",
" <td>0.310815</td>\n",
" <td>0.26289</td>\n",
" <td>0.582052</td>\n",
" <td>0.58204</td>\n",
" <td>0.569305</td>\n",
" <td>0.548089</td>\n",
" <td>0.546123</td>\n",
" <td>0.530073</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SCRAT_motifs_no_blacklist_filtering</th>\n",
" <td>0.155567</td>\n",
" <td>0.0763765</td>\n",
" <td>0.0721623</td>\n",
" <td>0.293087</td>\n",
" <td>0.174156</td>\n",
" <td>0.169365</td>\n",
" <td>0.297476</td>\n",
" <td>0.176039</td>\n",
" <td>0.170353</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ARI_Louvain ARI_kmeans ARI_HC \\\n",
"SnapATAC_subset_no_blacklist_filtering 0.309287 0.310815 0.26289 \n",
"SCRAT_motifs_no_blacklist_filtering 0.155567 0.0763765 0.0721623 \n",
"\n",
" AMI_Louvain AMI_kmeans AMI_HC \\\n",
"SnapATAC_subset_no_blacklist_filtering 0.582052 0.58204 0.569305 \n",
"SCRAT_motifs_no_blacklist_filtering 0.293087 0.174156 0.169365 \n",
"\n",
" Homogeneity_Louvain Homogeneity_kmeans \\\n",
"SnapATAC_subset_no_blacklist_filtering 0.548089 0.546123 \n",
"SCRAT_motifs_no_blacklist_filtering 0.297476 0.176039 \n",
"\n",
" Homogeneity_HC \n",
"SnapATAC_subset_no_blacklist_filtering 0.530073 \n",
"SCRAT_motifs_no_blacklist_filtering 0.170353 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_metrics"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:ATACseq_clustering]",
"language": "python",
"name": "conda-env-ATACseq_clustering-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 325e0f6

Please sign in to comment.