-
Notifications
You must be signed in to change notification settings - Fork 0
/
Figure_5_Supplementary_8_&_9.R
158 lines (131 loc) · 7.22 KB
/
Figure_5_Supplementary_8_&_9.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Define jaccard index function
jaccard_ind <- function(x){
res <- matrix(data=NA,nrow=ncol(x),ncol=ncol(x))
rownames(res) <- colnames(x)
colnames(res) <- colnames(x)
for(i in 1:ncol(x)){
uno <- rownames(x)[which(x[,i]==1)]
for(j in 1:ncol(x)){
if(is.na(res[i,j])){
dos <- rownames(x)[which(x[,j]==1)]
numerador <- length(intersect(uno,dos))
denom <- length(union(uno,dos))
out <- numerador/denom
res[i,j] <- out
res[j,i] <- res[i,j]
}
}
}
attr(res,"method") <- "jaccard_ind"
return(as.dist(1-res)) ###Also: We have to be calculating a dissimilarity matrix with the function, so 1-x.
}
# Load dependencies
library(sigclust2)
library(pvclust)
library(fpc)
library(jaccard)
library(CmmD)
library(readr)
library(knitr)
library(data.table)
# Create a vector with the paths where Molti's Output files are saved.
structures_12 <- paste0("data/Molti_Output/",seq(0.5,12,0.5),".csv")
# Detect community trajectories and tree distances between each gene.
curie_to_12_full <- CmmD_from_community_structures(nodelist = NULL, community_structures = structures_12, resolution_start = 0.5,resolution_end = 12,interval = 0.5,distmethod = "hamming",threads = 7)
curie_to_12_full$hamming_distance_matrix = as.matrix(curie_to_12_full$distance_matrix) * 24 # This transformation is needed because parallel dist is weighted.
# 24 = length(seq(0.5,12,0.5)) -> number of resolution values analyzed
# Load genes associated to each patient from CURIE data
tata <- as.data.frame(read.csv(file = "https://raw.githubusercontent.com/iPC-project-H2020/ipcrg/master/scripts/CURIE2gr/multi.layer.net.gr",sep = "\t",header = F, stringsAsFactors=F))
splited_patients <- split(tata[,2],tata[1])
# Generate ground truth table from CURIE data
ground_truth_patients <- c("MB30","MB31","MB34",
"MB04","MB05","MB06","MB24","MB40","MB25","MB43","MB46","MB49","MB55",
"MB01","MB02","MB03","MB14","MB19","MB47","MB50","MB51","MB52","MB53",
"MB07","MB08","MB09","MB13","MB15","MB16","MB17","MB20","MB22","MB39","MB48","MB54")
ground_truth <- matrix(nrow= length(ground_truth_patients),ncol=2)
ground_truth[,1] <- ground_truth_patients
ground_truth[,2] <- c(rep("WNT",3),rep("SHH",10),rep("G3",10),rep("G4",12))
colnames(ground_truth) <- c("Patient","Real_class")
rownames(ground_truth) <- ground_truth[,1]
message("Performing filtering based on tetha (0 to 10)")
genes_per_patient_list <- list()
for(k in 0:10){
genes_per_patient <- list()
for(i in 1:length(splited_patients)){
genes_interesantes <- names(which(table(splited_patients[[i]])>=1))
# Filter CURIE genes to those included in the multilayer network.
genes_interesantes <- genes_interesantes[genes_interesantes %in% rownames(curie_to_12_full$hamming_distance_matrix)]
matricin <- curie_to_12_full$hamming_distance_matrix[genes_interesantes,genes_interesantes]
# For each value of theta between 0 and 10, we create a vector (suc), where for each gene of the multilayer we calculate the
# number of other multilayer genes that are below a maximum k value of theta -distance in the tree-.
names_matricin <- colnames(matricin)
suc <- vector("numeric",length = length(names_matricin))
names(suc) <- names_matricin
for(j in 1:ncol(matricin)){
leng_matricin <- length(matricin[,j][matricin[,j]<=k]) ### k (in our case, a representation of theta) is maximum distance allowed in the clustering plot. (Sauron's eye)
suc[j] <- leng_matricin
}
suc <- suc[suc>1] # Filter genes that have no patient associated partners
genes_per_patient[[i]] <- suc - 1 # As a value of 2 mean that only there is one more gene with the gene being analyzed, we substract 1.
}
names(genes_per_patient) <- names(splited_patients)
genes_per_patient_list[[k+1]] <- genes_per_patient
names(genes_per_patient_list)[[k+1]] <- as.character(k)
message(paste0("theta= ",k))
}
message("Tetha based filtering finished. Calculating clustering accuracies for tetha 0 to 10 and lambda 1 to 20")
# Start a 11 x 20 matrix to be filled with the hierarchical clustering accuracy values.
final_accuracy_matrix <- matrix(0, ncol= 20, nrow= 11)
final_kk_used <- matrix(0, ncol= 20, nrow= 11)
rownames(final_accuracy_matrix) <- as.character(0:10)
rownames(final_kk_used) <- as.character(0:10)
u <- 1 # theta = 0
val <- 6 #lambda
preserve_genes_per_patient <- genes_per_patient_list[[u]] # u==k == tetha + 1
genes_per_patient <- preserve_genes_per_patient
genes_per_patient <- lapply(genes_per_patient,function(x) x[x<=val]) #val = lambda. We filter the genes that are over the lambda value tested
genes_per_patient_names <- lapply(genes_per_patient,function(x) names(x))
all_genes_possible <- unique(unlist(genes_per_patient_names,use.names=F))
# Generate a 0-1 patient x genes matrix that acts as input for pamk, jaccard_ind and hclust.
n_genes_p_patients <- matrix(data= 0, nrow= 38,ncol= length(all_genes_possible))
colnames(n_genes_p_patients) <- all_genes_possible
rownames(n_genes_p_patients) <- names(genes_per_patient)
for(rowi in 1:nrow(n_genes_p_patients)){
n_genes_p_patients[rowi,] <- as.integer(colnames(n_genes_p_patients) %in% genes_per_patient_names[[rowi]])
}
# Save Table 1
# values <- as.matrix(jaccard_ind(t(patient_matrix)))
# table_2 <- 1-values[,c("MB10","MB21","MB33")]
# write.table(table_2,file="Tables/Table_1.csv", row.names=T, col.names=T, sep="\t")
# Exclude validation set
WHATEVER <- c("MB10","MB21","MB33")
patient_matrix <- n_genes_p_patients
patient_matrix2 <- patient_matrix[- which(rownames(patient_matrix) %in% WHATEVER),] # Exclude patients with missing data from clustering
#Obtain optimal clusters
pamk.best <- pamk(patient_matrix2)
kk <- pamk.best$nc ## kk is the optimal number of clusters for the particular optimization.
# Perform hierarchical clustering with the suggested number of clusters
patient_matrix3 <- t(patient_matrix2)
set.seed(2020)
# Pvclust Bootstrap
res_pvclust <- pvclust(patient_matrix3, method.dist=jaccard_ind,method.hclust = "ward.D2", nboot=10000,store = TRUE,parallel = TRUE)
# Add 2 decimal positions for plotting
res_pvclust_100 <- res_pvclust
res_pvclust_100$edges <- res_pvclust$edges * 100
# shc Bootstrap
vfun <- function(x, y) {1 - jaccard(x, y)}
mfun <- function(x) {
as.dist(outer(split(x, f = row(x)), split(x, f = row(x)),
Vectorize(vfun)))
}
res_shc <- shc(patient_matrix2, matmet=mfun, linkage="ward.D2", n_sim = 10000)
# Original hclust output
res_hclust <- hclust(jaccard_ind(patient_matrix3),"ward.D2")
# Plotting
pdf("data/Plots/clustering_pv_shc_hclust.pdf",height= 14, width= 20)
plot(res_hclust,main= "Ward Hierarchical clustering of Medulloblastoma patients")
plot(res_pvclust_100,main= "Ward Hierarchical Clustering Bootstrap AU p−values (%) (package pvclust)",print.pv= "au")
# Plot empirical values
plot(res_shc,alpha=0.5,ci_emp=T) # we change alpha in order to plot the values. With predefined alpha not all partition values would be plotted
plot(res_shc,alpha=0.5,ci_emp=F) # Plot Gaussian aproximate p-values
dev.off()