-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtSpace.R
249 lines (195 loc) · 10.7 KB
/
tSpace.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#' tSpace
#'
#' tSpace is the main function for trajectory analysis. The algorithm is described in the publication
#' Dermadi et al. 2018 pre-print available: doi: https://doi.org/10.1101/336313
#' Originally, it was developed for single cell analysis, however it can be applied on any type of large data.
#'
#' If you use it please cite: doi: https://doi.org/10.1101/336313
#'
#' @param df a data frame or a matrix of expression values, which contain information on data straucture,
#' e.g. expression of variable genes, developmentally relevant genes/proteins, or significant principal components of your data.
#' @param K an integer specifying the K-nearest-neighbors
#' @param L an integer specifying the random L out of K-nearest-neighbors, L < K, usually L= 0.75*K
#' @param D a string specfying metric for distance calculation. Supported: ’euclidean’, ’pearson_correlation’,
#' ’manhattan’, ’chebyshev’, ’canberra’, ’braycurtis’, ’simple_matching_coefficient’, ’minkowski’,
#' ’hamming’, ’mahalanobis’, ’jaccard_coefficient’, ’Rao_coefficient’
#' @param graph an integer specifying how many L-K-NN graphs will be used for final trajectory calculation
#' @param trajectories an integer specifying how many trajectories will be calculatedl. Default value is 100,
#' see ground_truth for more details
#' @param wp an integer specifying the number of waypoints for trajectory refinement
#' @param ground_truth a booolean (TRUE or FALSE) specifying if trajectories are calculated for every data point.
#' As default tSpace calculates an aproximation using 100 trajectories, which is usually sufficient for understanding
#' of developmental relations in single cell data. If set to TRUE, calculation time will be longer
#' and trajectories parameter will be overridden.
#' @param weights a string specfying method to calculate the weights for refinement of the trajectory distances. Supported:
#' uniform, linear, quadratic and exponential.
#' @param dr a string specifying type of embbeding for visualization. Options: 'pca', 'umap' or 'both'.
#' 'pca' embbeds trajectory space matrix in principal components,
#' 'umap' uses umap function with config parameter filled with umap.defaults modified for min_dist = 0.8 and metric = 'manhattan', for details see documentation of umap package,
#' 'both' calculates pca and umap
#' @param seed an integer specifying seed for set.seed function in order to have reproducible umap
#' @param core_no and integer specifying number of cores for parallelization, check how many cores your machine has and adjust accordingly
#' @return tSpace returns a list of objects: 1. **ts_file**: a data frame of pca and/or umap embbedings of trajectory space matrix and input data,
#' 2. pca_tspace and/or **umap_tspace**: pca and/or UMAP objects. pca object contians all the outputs of pca analysis,
#' umap contians all the outputs of the umap analysis, see \code{\link[umap:umap]{umap}}
#' 3. **tspace_matrix**: trajectory space matrix with calculated distances. In case negative distances are calculated during knn graph computation, these will be aproximated to zero
#' for trajectory inference completion, and reported in an object **negative_distances**, and a message will be reported in console.
#' @importFrom foreach %dopar%
#' @export
tSpace <- function(df, K = 20, L = NULL, D = 'pearson_correlation', graph = 5, trajectories = 200, wp = 20, ground_truth = F, weights = 'exponential', dr = 'pca', seed = NULL, core_no = 1, ...){
if(any(sapply(df, function(x) is.numeric(x)) == F)){
stop("Check that all values in your data frame are numeric")
}
# to do
# Evaluate inputs, fix once all is running smootly
if(!is.numeric(core_no)){
stop("Number of cores is not numeric")
}
if(!is.numeric(K) | !is.numeric(graph) | !is.numeric(wp) | !is.numeric(trajectories)){
stop("K, graph, waypoints or trajectories variables are not numbers")
}
if(!(D %in% c('euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'))){
stop( "distance can be any of 'euclidean', 'manhattan', 'chebyshev', 'canberra', 'braycurtis', 'pearson_correlation', 'simple_matching_coefficient', 'minkowski', 'hamming', 'mahalanobis', 'jaccard_coefficient', 'Rao_coefficient'" )
}
if(!(weights %in% c('uniform', 'linear', 'quadratic', 'exponential'))){
stop( "weights can be any of 'uniform', 'linear', 'quadratic', 'exponential'" )
}
if(!(dr %in% c('pca', 'umap', 'both'))){
stop( "dimensionality reduction can be any of 'pca', 'umap', 'both'" )
}
if(is.null(seed)){
seed <- 1111
}
if(is.null(L)){
L = as.numeric(round(0.75*K, digits = 0))
}
#########################
if(ground_truth == T){
numPop <- 1:nrow(df)
tspacem <- matrix(data = NA, nrow = dim(df)[1], ncol = length(unique(numPop)))
graph_panel <- list()
s <- numPop
trajectories <- length(numPop)
Index <- numPop
} else {
numPop <- kmeans(df, centers = trajectories, iter.max = 10000)
Index <- seq(1, nrow(df), by=1)
tspacem <- matrix(data = NA, nrow = dim(df)[1], ncol = length(unique(numPop$cluster)))
graph_panel <- list()
s <- unlist(lapply(split(Index, as.factor(numPop$cluster)),
function(x) {
as.numeric(x[1])
}))
}
#########
## Functions
cat(paste0('Step 1:Finding graph'))
knn <- graphfinder(x = df, k = K, distance = D, core_n = core_no)
# Some users experienced negative values in knn graph and core shortest path algorithm (Dijkstra) does not operate on
# negative distance, therefore warning message is implemented, and all non-positive values are aproximated with a zero
# In addition user can see which cells are involved in negative values pairs and can examine them.
if(min(knn) < 0){
cat(paste0("\nSome cell-cell pairs have negative distances. \nIn order for this analysis to proceed, these distances will be aproximated to zero. \nIf substantial number of cells exhibit negative distances\nplease check the original data and examine which cells are causing the issue. \nSome of these cells may be just noise and should be removed"))
negative.distance <- knn[which(knn[,3] < 0), ]
knn[which(knn[,3] < 0), 3] <- 0
}
knn <- igraph::get.adjacency(igraph::graph.adjacency(Matrix::sparseMatrix(i=knn[,'I'], j=knn[,'J'], x=knn[,'D']), mode ='max', weighted = TRUE), attr = 'weight') # For comapriosn wiht MATLAB , index1 = F)
cat(paste0('\nStep 2: Finding trajectories in sub-graphs \nCalculation may take time, don\'t close R'))
graph_panel <- list()
percentage <- seq(100/graph, 100, by = 100/graph)
tictoc::tic('graphs_loop')
for(graph_iter in 1:graph){
svMisc::progress(percentage[graph_iter], progress.bar = T)
if(K != L){
l.knn = find_lknn(knn, l = L, core_n = core_no)
# at this stage lknn is directed graph
}
else{
l.knn = knn
}
cl <- parallel::makeCluster(core_no)
doParallel::registerDoParallel(cl)
tspacem <- foreach::foreach(i = 1:trajectories, .combine = cbind, .packages = c('igraph', 'Matrix', 'KernelKnn', 'pracma')) %dopar%{
s_c = as.numeric(s[i])
tspacem <- pathfinder(data = df, lknn = l.knn, s = s_c, waypoints = wp, voting_scheme = weights, distance = D)$final_trajectory
}
parallel::stopCluster(cl)
graph_panel[[graph_iter]] <- tspacem
if(graph_iter == graph) cat("\nLast sub-graph is done!")
}
time <- tictoc::toc()
#Remove last large tspace matrix, for large datasets not to be kept in the environment
rm(tspacem)
tspace_mat <- array( unlist(graph_panel) , c(nrow(graph_panel[[1]]),ncol(graph_panel[[1]]),graph) )
#Remove last large tspace matrix, for large datasets not to be kept in the environment
rm(graph_panel)
tspace_mat <- rowMeans( tspace_mat , dims = 2 )
colnames(tspace_mat) <- paste0('T_', s)
cat(paste0('\nStep 3: Low dimensionality embbeding for visualization step'))
if( dr == 'pca'){
#PCA calculation
if(ncol(tspace_mat) > 20){
set.seed(seed)
pca_tspace <- prcomp(t(tspace_mat), rank. = 20)
} else {
set.seed(seed)
pca_tspace <- prcomp(t(tspace_mat), center = T, scale. = T)
}
pca_out <- pca_tspace$rotation
colnames(pca_out) <- paste0('tPC', seq(1, ncol(pca_out), 1))
#Shaping data output
data.out <- as.data.frame(cbind(Index = Index, pca_out, df))
if(exists("negative.distance") == T){
tspace_obj <- list(ts_file = data.out, pca_embbeding = pca_tspace, tspace_matrix = tspace_mat, negative_distances = negative.distance)
}else{
tspace_obj <- list(ts_file = data.out, pca_embbeding = pca_tspace, tspace_matrix = tspace_mat)
}
}
if( dr == 'umap'){
#UMAP calculation
config_tspace <- umap::umap.defaults
config_tspace$n_neighbors <- 7
config_tspace$min_dist <- 0.2
config_tspace$metric <- 'manhattan'
set.seed(seed)
umap_tspace <- umap::umap(tspace_mat, config = config_tspace)
umap_out <- as.data.frame(umap_tspace$layout)
colnames(umap_out) <- paste0('umap', seq(1, ncol(umap_tspace$layout), 1))
#Shaping data output
data.out <- as.data.frame(cbind(Index = Index, umap_out, df))
if(exists("negative.distance") == T){
tspace_obj <- list(ts_file = data.out, umap_embbeding = umap_tspace, tspace_matrix = tspace_mat, negative_distances = negative.distance)
}else{
tspace_obj <- list(ts_file = data.out, umap_embbeding = umap_tspace, tspace_matrix = tspace_mat)
}
}
if( dr == 'both'){
#PCA calculation
if(ncol(tspace_mat) > 20){
set.seed(seed)
pca_tspace <- prcomp(t(tspace_mat), rank. = 20)
} else {
set.seed(seed)
pca_tspace <- prcomp(t(tspace_mat), center = T, scale. = T)
}
pca_out <- as.data.frame(pca_tspace$rotation)
colnames(pca_out) <- paste0('tPC', seq(1, ncol(pca_out), 1))
#UMAP calculation
config_tspace <- umap::umap.defaults
config_tspace$n_neighbors <- 7
config_tspace$min_dist <- 0.2
config_tspace$metric <- 'manhattan'
set.seed(seed)
umap_tspace <- umap::umap(tspace_mat, config = config_tspace)
umap_out <- as.data.frame(umap_tspace$layout)
colnames(umap_out) <- paste0('umap', seq(1, ncol(umap_tspace$layout), 1))
#Shaping data output
data.out <- as.data.frame(cbind(Index = Index, pca_out, umap_out, df))
if(exists("negative.distance") == T){
tspace_obj <- list(ts_file = data.out, pca = pca_tspace, umap_embbeding = umap_tspace, tspace_matrix = tspace_mat, negative_distances = negative.distance)
}else{
tspace_obj <- list(ts_file = data.out, pca = pca_tspace, umap_embbeding = umap_tspace, tspace_matrix = tspace_mat)
}
}
return(tspace_obj)
}