add blacklist testings for 10xpbmc and sciATACseq

pinellolab · Nov 18, 2019 · 325e0f6 · 325e0f6
1 parent d2fa794
commit 325e0f6
Show file tree

Hide file tree

Showing 8 changed files with 7,069 additions and 0 deletions.
diff --git a/Extra/10x_PBMC_5k/test_blacklist/SCRAT_10xpbmc_5k_no_blacklist_filtering.ipynb b/Extra/10x_PBMC_5k/test_blacklist/SCRAT_10xpbmc_5k_no_blacklist_filtering.ipynb
diff --git a/Extra/10x_PBMC_5k/test_blacklist/SnapATAC_10xpbmc_5k_no_blacklist_filtering.ipynb b/Extra/10x_PBMC_5k/test_blacklist/SnapATAC_10xpbmc_5k_no_blacklist_filtering.ipynb
diff --git a/Extra/10x_PBMC_5k/test_blacklist/run_clustering_10xpbmc5k_no_blacklist_filtering.ipynb b/Extra/10x_PBMC_5k/test_blacklist/run_clustering_10xpbmc5k_no_blacklist_filtering.ipynb
diff --git a/Extra/10x_PBMC_5k/test_blacklist/run_umap_10xpbmc5k_no_blacklist_filtering.ipynb b/Extra/10x_PBMC_5k/test_blacklist/run_umap_10xpbmc5k_no_blacklist_filtering.ipynb
diff --git a/...ovich_2018_subset/test_blacklist/SCRAT_cusanovich2018_subset_no_blacklist_filtering.ipynb b/...ovich_2018_subset/test_blacklist/SCRAT_cusanovich2018_subset_no_blacklist_filtering.ipynb
diff --git a/...ch_2018_subset/test_blacklist/SnapATAC_cusanovich2018_subset_no_blacklist_filtering.ipynb b/...ch_2018_subset/test_blacklist/SnapATAC_cusanovich2018_subset_no_blacklist_filtering.ipynb
diff --git a/...18_subset/test_blacklist/run_clustering_cusanovich2018subset_no_blacklist_filtering.ipynb b/...18_subset/test_blacklist/run_clustering_cusanovich2018subset_no_blacklist_filtering.ipynb
@@ -0,0 +1,394 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import scanpy as sc\n",
+    "import os\n",
+    "from sklearn.cluster import KMeans\n",
+    "from sklearn.cluster import AgglomerativeClustering\n",
+    "from sklearn.metrics.cluster import adjusted_rand_score\n",
+    "from sklearn.metrics.cluster import adjusted_mutual_info_score\n",
+    "from sklearn.metrics.cluster import homogeneity_score\n",
+    "import rpy2.robjects as robjects\n",
+    "from rpy2.robjects import pandas2ri"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_metrics = pd.DataFrame(columns=['ARI_Louvain','ARI_kmeans','ARI_HC',\n",
+    "                                   'AMI_Louvain','AMI_kmeans','AMI_HC',\n",
+    "                                   'Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "workdir = '../output/'\n",
+    "path_fm = os.path.join(workdir,'feature_matrices/')\n",
+    "path_clusters = os.path.join(workdir,'clusters/')\n",
+    "path_metrics = os.path.join(workdir,'metrics/')\n",
+    "os.system('mkdir -p '+path_clusters)\n",
+    "os.system('mkdir -p '+path_metrics)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "13\n"
+     ]
+    }
+   ],
+   "source": [
+    "metadata = pd.read_csv('../input/metadata.tsv',sep='\\t',index_col=0)\n",
+    "num_clusters = len(np.unique(metadata['label']))\n",
+    "print(num_clusters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "files = [x for x in os.listdir(path_fm) if x.startswith('FM')]\n",
+    "len(files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['FM_SnapATAC_cusanovich2018_subset_no_blacklist_filtering.rds',\n",
+       " 'FM_SCRAT_cusanovich2018subset_motifs_no_blacklist_filtering.rds']"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):\n",
+    "    this_step = 0\n",
+    "    this_min = float(range_min)\n",
+    "    this_max = float(range_max)\n",
+    "    while this_step < max_steps:\n",
+    "        print('step ' + str(this_step))\n",
+    "        this_resolution = this_min + ((this_max-this_min)/2)\n",
+    "        sc.tl.louvain(adata,resolution=this_resolution)\n",
+    "        this_clusters = adata.obs['louvain'].nunique()\n",
+    "        \n",
+    "        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))\n",
+    "        \n",
+    "        if this_clusters > n_cluster:\n",
+    "            this_max = this_resolution\n",
+    "        elif this_clusters < n_cluster:\n",
+    "            this_min = this_resolution\n",
+    "        else:\n",
+    "            return(this_resolution, adata)\n",
+    "        this_step += 1\n",
+    "    \n",
+    "    print('Cannot find the number of clusters')\n",
+    "    print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "SnapATAC_subset_no_blacklist_filtering\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
+      "  res = PandasDataFrame.from_items(items)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "step 0\n",
+      "got 26 at resolution 1.5\n",
+      "step 1\n",
+      "got 20 at resolution 0.75\n",
+      "step 2\n",
+      "got 18 at resolution 0.375\n",
+      "step 3\n",
+      "got 14 at resolution 0.1875\n",
+      "step 4\n",
+      "got 12 at resolution 0.09375\n",
+      "step 5\n",
+      "got 14 at resolution 0.140625\n",
+      "step 6\n",
+      "got 13 at resolution 0.1171875\n",
+      "SCRAT_motifs_no_blacklist_filtering\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.\n",
+      "  res = PandasDataFrame.from_items(items)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "step 0\n",
+      "got 26 at resolution 1.5\n",
+      "step 1\n",
+      "got 12 at resolution 0.75\n",
+      "step 2\n",
+      "got 19 at resolution 1.125\n",
+      "step 3\n",
+      "got 17 at resolution 0.9375\n",
+      "step 4\n",
+      "got 15 at resolution 0.84375\n",
+      "step 5\n",
+      "got 13 at resolution 0.796875\n"
+     ]
+    }
+   ],
+   "source": [
+    "for file in files:\n",
+    "    file_split = file.split('_')\n",
+    "    method = file_split[1]\n",
+    "    dataset = file_split[2].split('.')[0]\n",
+    "    if(len(file_split)>3):\n",
+    "        method = method + '_' + '_'.join(file_split[3:]).split('.')[0]\n",
+    "    print(method)\n",
+    "\n",
+    "    pandas2ri.activate()\n",
+    "    readRDS = robjects.r['readRDS']\n",
+    "    df_rds = readRDS(os.path.join(path_fm,file))\n",
+    "    fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))\n",
+    "    fm_mat.fillna(0,inplace=True)\n",
+    "    fm_mat.columns = metadata.index\n",
+    "    \n",
+    "    adata = sc.AnnData(fm_mat.T)\n",
+    "    adata.var_names_make_unique()\n",
+    "    adata.obs = metadata.loc[adata.obs.index,]\n",
+    "    df_metrics.loc[method,] = \"\"\n",
+    "    #Louvain\n",
+    "    sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')\n",
+    "#     sc.tl.louvain(adata)\n",
+    "    getNClusters(adata,n_cluster=num_clusters)\n",
+    "    #kmeans\n",
+    "    kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)\n",
+    "    adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')\n",
+    "    #hierachical clustering\n",
+    "    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)\n",
+    "    adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')\n",
+    "    #clustering metrics\n",
+    "    \n",
+    "    #adjusted rank index\n",
+    "    ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])\n",
+    "    ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])\n",
+    "    ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])\n",
+    "    #adjusted mutual information\n",
+    "    ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')\n",
+    "    ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic')   \n",
+    "    ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')\n",
+    "    #homogeneity\n",
+    "    homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])\n",
+    "    homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])\n",
+    "    homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])\n",
+    "\n",
+    "    df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]\n",
+    "    df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]\n",
+    "    df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc] \n",
+    "    adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\\t')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_metrics.to_csv(path_metrics+'clustering_scores_no_bl_filt.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ARI_Louvain</th>\n",
+       "      <th>ARI_kmeans</th>\n",
+       "      <th>ARI_HC</th>\n",
+       "      <th>AMI_Louvain</th>\n",
+       "      <th>AMI_kmeans</th>\n",
+       "      <th>AMI_HC</th>\n",
+       "      <th>Homogeneity_Louvain</th>\n",
+       "      <th>Homogeneity_kmeans</th>\n",
+       "      <th>Homogeneity_HC</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>SnapATAC_subset_no_blacklist_filtering</th>\n",
+       "      <td>0.309287</td>\n",
+       "      <td>0.310815</td>\n",
+       "      <td>0.26289</td>\n",
+       "      <td>0.582052</td>\n",
+       "      <td>0.58204</td>\n",
+       "      <td>0.569305</td>\n",
+       "      <td>0.548089</td>\n",
+       "      <td>0.546123</td>\n",
+       "      <td>0.530073</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>SCRAT_motifs_no_blacklist_filtering</th>\n",
+       "      <td>0.155567</td>\n",
+       "      <td>0.0763765</td>\n",
+       "      <td>0.0721623</td>\n",
+       "      <td>0.293087</td>\n",
+       "      <td>0.174156</td>\n",
+       "      <td>0.169365</td>\n",
+       "      <td>0.297476</td>\n",
+       "      <td>0.176039</td>\n",
+       "      <td>0.170353</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                       ARI_Louvain ARI_kmeans     ARI_HC  \\\n",
+       "SnapATAC_subset_no_blacklist_filtering    0.309287   0.310815    0.26289   \n",
+       "SCRAT_motifs_no_blacklist_filtering       0.155567  0.0763765  0.0721623   \n",
+       "\n",
+       "                                       AMI_Louvain AMI_kmeans    AMI_HC  \\\n",
+       "SnapATAC_subset_no_blacklist_filtering    0.582052    0.58204  0.569305   \n",
+       "SCRAT_motifs_no_blacklist_filtering       0.293087   0.174156  0.169365   \n",
+       "\n",
+       "                                       Homogeneity_Louvain Homogeneity_kmeans  \\\n",
+       "SnapATAC_subset_no_blacklist_filtering            0.548089           0.546123   \n",
+       "SCRAT_motifs_no_blacklist_filtering               0.297476           0.176039   \n",
+       "\n",
+       "                                       Homogeneity_HC  \n",
+       "SnapATAC_subset_no_blacklist_filtering       0.530073  \n",
+       "SCRAT_motifs_no_blacklist_filtering          0.170353  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_metrics"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:ATACseq_clustering]",
+   "language": "python",
+   "name": "conda-env-ATACseq_clustering-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}