-
Notifications
You must be signed in to change notification settings - Fork 25
/
individual_clustering_modified.R
executable file
·258 lines (220 loc) · 11.9 KB
/
individual_clustering_modified.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# NOTE! This scripts has been modified from the one downloaded from
# https://yunliweb.its.unc.edu//safe/download.php, in order to set the number of
# cores to use for the clustering. All modifications are indicated in the code.
#
# !/usr/bin/env Rscript
# 11/10/2017
# @title Single-cell Aggregated clustering From Ensemble (SAFE)
#
# @description This function performs single-cell clustering using four state-of-the-art methods,
# SC3, CIDR, Seurat and tSNE+kmeans.
#
# @param inputTags a G*N matrix with G genes and N cells.
# @param datatype defines the type of data, which could be "count", "CPM", "RPKM" and "FPKM".
# Default is "count".
# @param SC3 a boolean variable that defines whether to cluster cells using SC3 method.
# Default is "TRUE".
# @param gene_filter a boolean variable defines whether to perform gene filtering
# before SC3 clustering, when \code{SC3 = TRUE}.
# @param svm_max, if \code{SC3 = TRUE}, then defines the mimimum number of cells above which SVM will be run.
# @param CIDR a boolean parameter that defines whether to cluster cells using CIDR method.
# Default is "TRUE".
# @param Seurat is a boolean variable that defines whether to cluster cells using Seurat method.
# Default is "TRUE".
# @param nPC defines the number of princple compoents used in Seurat clustering, when \code{Seurat = TRUE}.
# Default value is esimated by \code{nPC} of \code{CIDR}.
# @param resolution defines the value of resolution used in Seurat clustering, when \code{Seurat = TRUE}.
# @param seurat_min_cell defines the mimimum number of cells in input dataset below which
# \code{resolution} is set to 1.2, when \code{Seurat = TRUE}.
# @param resolution_min defines the resolution used in Seurat clustering for small dataset,
# when \code{Seurat == TRUE} and cell number of input file < \code{seurat_min_cell}.
# @param tSNE is a boolean variable that defines whether to cluster cells using tSNE method.
# Default is "TRUE".
# @param var_genes defines the number of variable genes used by tSNE analysis, when \code{tSNE = TRUE}.
# @param SEED sets the seed of the random number generator. Setting the seed to a fixed value can
# produce reproducible clustering results.
#
# @return a matrix of indiviudal clustering results is returned.
#
# @author Yuchen Yang <yyuchen@email.unc.edu>, Ruth Huh <rhuh@live.unc.edu>,
# Houston Culpepper <hculpepp@live.unc.edu>, Yun Li <yunli@med.unc.edu>
# @citation Yuchen Yang, Ruth Huh, Houston Culpepper, Yun Li. SAFE (Single-cell Aggregated clustering From Ensemble): Cluster ensemble for single-cell RNA-seq data. 2017
## Modification: add "n_cores" argument
individual_clustering <- function(inputTags, datatype = "count",
SC3 = TRUE, gene_filter = FALSE, svm_num_cells = 5000, CIDR = TRUE,
Seurat = TRUE, nPC = NULL, resolution = 0.9, seurat_min_cell = 200, resolution_min = 1.2,
tSNE = TRUE, var_genes = NULL, SEED = 1, n_cores){
## Modification: Only set seed to SEED if not NULL, otherwise generate a random seed
if (is.null(SEED)) {
SEED <- round(1e6*runif(1))
}
set.seed(SEED)
cluster_results <- NULL
inputTags = as.matrix(inputTags)
##### SC3
if(SC3 == TRUE){
library(scater) #https://github.com/davismcc/scater
library(SC3) #https://github.com/hemberg-lab/SC3
exp_cell_exprs <- NULL
sc3OUTPUT <- NULL
# cell expression
if (datatype == "count") {
### If the input data is original count data, it would be normalized by the total cound number and then log2 transformed
exp_cell_exprs <- SingleCellExperiment(assays = list(counts = as.matrix(inputTags)))
normcounts(exp_cell_exprs) <- t(t(inputTags)/colSums(inputTags))*1000000
logcounts(exp_cell_exprs) <- log2(normcounts(exp_cell_exprs) + 1)
#exp_cell_exprs <- inputTags
} else if (datatype == "CPM" || datatype == "FPKM" || datatype == "RPKM" || datatype == "TPM") {
### If the input data is CPM, FPKM, RPKM or TPM, it would be log2 transformed
exp_cell_exprs <- SingleCellExperiment(assays = list(normcounts = as.matrix(inputTags)))
logcounts(exp_cell_exprs) <- log2(normcounts(exp_cell_exprs) + 1)
}
rowData(exp_cell_exprs)$feature_symbol <- rownames(exp_cell_exprs)
exp_cell_exprs <- exp_cell_exprs[!duplicated(rownames(exp_cell_exprs)), ]
### Estimating optimal number of clustering
exp_cell_exprs <- sc3_estimate_k(exp_cell_exprs)
optimal_K = metadata(exp_cell_exprs)$sc3$k_estimation
### Clustering by SC3 at the optimal K
## Modification: set n_cores and pct_dropout_min/max
if (ncol(inputTags) < svm_num_cells){
exp_cell_exprs <- sc3(exp_cell_exprs, ks = optimal_K, biology = FALSE, gene_filter = gene_filter, rand_seed = SEED, n_cores = n_cores, pct_dropout_min = 0, pct_dropout_max = 100)
}
### Runing SVM
if (ncol(inputTags) >= svm_num_cells){
## Modification: set n_cores and pct_dropout_min/max
exp_cell_exprs <- sc3(exp_cell_exprs, ks = optimal_K, biology = FALSE, gene_filter = gene_filter,
svm_max = svm_num_cells, svm_num_cells = svm_num_cells, rand_seed = SEED,
n_cores = n_cores, pct_dropout_min = 0, pct_dropout_max = 100)
exp_cell_exprs <- sc3_run_svm(exp_cell_exprs, ks = optimal_K)
}
### Exporting SC3 results
p_Data <- colData(exp_cell_exprs)
col_name <- paste("sc3_", optimal_K, "_clusters", sep = '')
sc3OUTPUT <- p_Data[, grep(col_name, colnames(p_Data))]
cluster_results = rbind(cluster_results, matrix(c(sc3OUTPUT), nrow = 1, byrow = T))
}
##### CIDR
if(CIDR == TRUE){
library(cidr) #https://github.com/VCCRI/CIDR
cidrOUTPUT <- NULL
cidrOUTPUT <- scDataConstructor(inputTags)
cidrOUTPUT <- determineDropoutCandidates(cidrOUTPUT)
cidrOUTPUT <- wThreshold(cidrOUTPUT)
## Modification: set n_cores
cidrOUTPUT <- scDissim(cidrOUTPUT, threads = n_cores)
cidrOUTPUT <- scPCA(cidrOUTPUT)
cidrOUTPUT <- nPC(cidrOUTPUT)
### nPC is not necessarily the same between CIDR and Seurat. Here we use the same nPC value for both two methods for convenience
if(!is.null(nPC)) {
cidrOUTPUT@nPC <- nPC
} else {
nPC <- cidrOUTPUT@nPC
}
### Clustering by CIDR.
# The optimal clustering number is determined automatically
cidrOUTPUT <- scCluster(cidrOUTPUT, nPC = nPC)
cluster_results = rbind(cluster_results, matrix(c(cidrOUTPUT@clusters), nrow = 1, byrow = T))
}
##### Seurat
if (Seurat == TRUE){
library(dplyr)
library(Seurat) #http://satijalab.org/seurat/get_started.html
library(Matrix)
seuratOUTPUT <- NULL
# Initialize the Seurat object with the raw data (non-normalized data)
# Keep all genes expressed in >= 3 cells, keep all cells with >= 200 genes
## Modification: remove cell and gene filtering
seuratOUTPUT <- CreateSeuratObject(raw.data = inputTags, min.cells = 0, min.genes = 0, project = "single-cell clustering")
# Perform log-normalization, first scaling each cell to a total of 1e4 molecules (as in Macosko et al. Cell 2015)
if (datatype == "count"){
seuratOUTPUT = NormalizeData(object = seuratOUTPUT, normalization.method = "LogNormalize", scale.factor = 10000)
} else if (datatype == "CPM" || datatype == "FPKM" || datatype == "RPKM" || datatype == "TPM"){
raw.data <- GetAssayData(object = seuratOUTPUT, slot = "raw.data")
normalized.data <- log(raw.data+1)
colnames(x = normalized.data) <- colnames(x = raw.data)
rownames(x = normalized.data) <- rownames(x = raw.data)
seuratOUTPUT <- SetAssayData(object = seuratOUTPUT, assay.type = "RNA",slot = "data", new.data = normalized.data)
}
# Detection of variable genes across the single cells
## Modification: remove search for variable genes since we provide pre-filtered data
# seuratOUTPUT = FindVariableGenes(object = seuratOUTPUT, mean.function = ExpMean, dispersion.function = LogVMR,
# x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
# Regress out unwanted sources of variation
seuratOUTPUT <- ScaleData(object = seuratOUTPUT, vars.to.regress = c("nUMI"))
### Perform linear dimensional reduction
## Modification: use all genes
seuratOUTPUT <- RunPCA(object = seuratOUTPUT, pc.genes = rownames(seuratOUTPUT@data), do.print = FALSE)
if (length(inputTags[1,]) >= seurat_min_cell){
### Determine statistically significant principal components
# NOTE: This process can take a long time for big datasets, comment out for expediency.
# More approximate techniques such as those implemented in PCElbowPlot() can be used to reduce computation time
# Here we chooes the same number of PCs used in CIDR
### Clustering the cells by Seurat
seuratOUTPUT <- Seurat::FindClusters(object = seuratOUTPUT, reduction.type = "pca", dims.use = 1:nPC, algorithm = 3,
resolution = resolution, print.output = F, random.seed = SEED)
} else {
resolution <- resolution_min
seuratOUTPUT <- Seurat::FindClusters(object = seuratOUTPUT, reduction.type = "pca", dims.use = 1:nPC, algorithm = 3,
resolution = resolution_min, print.output = F, random.seed = SEED)
}
### Complementing the missing data
cells_dropout <- NULL
num_genes <- colSums(seuratOUTPUT@raw.data > 0)
cells_dropout <- names(num_genes[which(num_genes <= 0)])
if (length(cells_dropout != 0)){
seurat_output <- matrix(NA, ncol = ncol(seuratOUTPUT@raw.data), byrow = T)
colnames(seurat_output) <- colnames(seuratOUTPUT@raw.data)
seurat_retained <- t(as.matrix(as.numeric(seuratOUTPUT@ident)))
colnames(seurat_retained) <- colnames(seuratOUTPUT@data)
for (i in 1:ncol(seurat_retained)){
seurat_output[1,colnames(seurat_retained)[i]] <- seurat_retained[1,colnames(seurat_retained)[i]]
}
} else {
seurat_output <- t(as.matrix(as.numeric(seuratOUTPUT@ident)))
}
cluster_results = rbind(cluster_results, matrix(c(as.numeric(seuratOUTPUT@ident)), nrow = 1, byrow = T))
}
##### tSNE+kmeans
if(tSNE == TRUE){
library(Rtsne)
library(ADPclust)
input_lcpm <- NULL
tsne_input <- NULL
tsne_output <- NULL
tsne_kmeansOUTPUT <- NULL
adpOUTPUT <- NULL
### Data tranformation
if (datatype == "count") {
### If the input data is original count data or CPM, it would be tranformed to CPM
input_lcpm <- log2(t(t(inputTags)/colSums(inputTags))*1000000+1)
tsne_input <- input_lcpm
} else if (datatype == "CPM" || datatype == "FPKM" || datatype == "RPKM" || datatype == "TPM") {
### If the input data is FPKM or RPKM, we use the transformed TPM data generated before as the input
tsne_input <- log2(inputTags + 1)
}
### Dimensionality reduction by Rtsne
if(length(inputTags[1,]) >= 200) {
perplexity = 30
} else {
perplexity = 10
}
if (is.null(var_genes)){
tsne_output <- Rtsne(t(tsne_input), perplexity = perplexity)
} else{
se_genes = rep(NA, nrow(tsne_input))
for (i in 1:nrow(tsne_input)){
se_genes[i] = sqrt(var(tsne_input[i,])/length(tsne_input[i,]))
}
decreasing_rank = order(se_genes, decreasing = T)
tsne_output <- Rtsne(t(tsne_input[decreasing_rank[1:var_genes],]), perplexity = perplexity)
}
### Determining the optimal cluster number (k) by ADPclust
mim_K = 2
max_K = max(optimal_K, cidrOUTPUT@nCluster, max(as.numeric(seuratOUTPUT@ident)))
adpOUTPUT <- adpclust(tsne_output$Y, htype = "amise",centroids="auto", nclust = mim_K:max_K)
### Clustering the cells by kmeans
tsne_kmeansOUTPUT <- kmeans(tsne_output$Y, tsne_output$Y[adpOUTPUT$centers,], adpOUTPUT$nclust)
cluster_results = rbind(cluster_results, matrix(c(tsne_kmeansOUTPUT$cluster), nrow = 1, byrow = T))
}
return(cluster_results)
}