-
Notifications
You must be signed in to change notification settings - Fork 1
/
simulation.R
211 lines (170 loc) · 6.63 KB
/
simulation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
library(phangorn)
library(devtools)
library(TreeSim)
library(phytools)
library(EnvStats)
library(phylobase)
library(TreePar)
n_gene_trees <- 200
locus_size <- 4000
n_reps <- 1
effective_population_size_approx <- 120
n_tips <- 4
###########
file_names <- paste(seq(1, n_gene_trees, 1), ".nexus", sep="")
##########
tip_vector <- seq(1, n_tips, 1)
node_numbers <- seq(length(tip_vector)+1, (length(tip_vector)+2) + (length(tip_vector)-3), 1)
##########SIMULATE_SPECIES_TREE
for (z in 1:n_reps){
entire_tree <- read.tree("entire_tree_balanced.tre")
##########
coalescent_populations <- vector("list", (length(tip_vector)-1))
for (i in 1:length(coalescent_populations)){
coalescent_populations[[i]] <- extract.clade(entire_tree, node_numbers[[i]])$tip.label
}
coalescent_populations <- coalescent_populations[order(sapply(coalescent_populations,length),decreasing=F)]
##########MINIMUM_AND_MAXIMUM_AGES_OF_COALESCENT_POPULATION_AS_DETERMINED_BY_SPECIES_TREE
coalescent_populations_minimum_ages <- vector("list", length(coalescent_populations))
coalescent_populations_maximum_ages <- vector("list", length(coalescent_populations))
for (i in 1:length(coalescent_populations_minimum_ages)){
coalescent_populations_minimum_ages[[i]] <- max(node.depth.edgelength(extract.clade(entire_tree, findMRCA(entire_tree, coalescent_populations[[i]], "node"))))
if(length(coalescent_populations[[i]]) < length(entire_tree$tip.label)){
coalescent_populations_maximum_ages[[i]] <- max(node.depth.edgelength(extract.clade(entire_tree, entire_tree[[1]][,1][[which(entire_tree[[1]][,2] == findMRCA(entire_tree, coalescent_populations[[i]], "node"))]])))
}
}
#############
gene_trees <- vector("list", n_gene_trees)
for (y in 1:length(gene_trees)){
pool <- vector("list", length(entire_tree$tip.label))
for (i in 1:length(pool)){
pool[[i]] <- list(edge=matrix(c(2,1),1,2), tip.label=entire_tree$tip.label[[i]], edge.length=1.0, Nnode=1)
class(pool[[i]]) <- "phylo"
}
#############
coalescent_ages <- vector("list", length(coalescent_populations_minimum_ages))
for (i in 1:length(coalescent_ages)){
age_generation <- vector(mode="numeric", length=0)
for (a in 1:length(pool)){
counter <- vector(mode="numeric", length=0)
for (b in 1:length(pool[[a]]$tip.label)){
if (pool[[a]]$tip.label[[b]] %in% coalescent_populations[[i]]){
counter <- append(counter, 1)
}
}
if (length(counter) == length(pool[[a]]$tip.label)){
age_generation <- append(age_generation, 1)
}
}
coalescent_ages[[i]] <- coalescent_populations_minimum_ages[[i]] + rexp((length(age_generation)-1), 1/(effective_population_size_approx/1000))
coalescent_ages[[i]] <- sort(coalescent_ages[[i]])
remove <- vector(mode="numeric", length=0)
for (a in 1:length(coalescent_ages[[i]])){
if(is.numeric(coalescent_populations_maximum_ages[[i]]) == TRUE)
if(coalescent_ages[[i]][[a]] > coalescent_populations_maximum_ages[[i]]){
remove <- append(remove, a)
}
}
if(length(remove) > 0){
coalescent_ages[[i]] <- coalescent_ages[[i]][-remove]
}
if (length(coalescent_ages[[i]]) > 0){
for (a in 1:length(coalescent_ages[[i]])){
a
correct <- 0
if (length(pool) >= 2){
while (correct == 0){
counter <- vector(mode="numeric", length=0)
to_join_index <- sample(seq(1, length(pool), 1), 2, replace = FALSE)
to_join <- pool[to_join_index]
for (b in 1:length(c(to_join[[1]]$tip.label, to_join[[2]]$tip.label))){
if (c(to_join[[1]]$tip.label, to_join[[2]]$tip.label)[[b]] %in% coalescent_populations[[i]]){
counter <- append(counter, 1)
}
}
if (length(counter) == length(c(to_join[[1]]$tip.label, to_join[[2]]$tip.label))){
correct <- 1
}
}
if(correct == 1){
if ((length(to_join[[1]]$tip.label) == 1) & (length(to_join[[2]]$tip.label) == 1)){
to_join[[1]]$edge.length <- coalescent_ages[[i]][[a]]
to_join[[2]]$edge.length <- coalescent_ages[[i]][[a]]
}
if ((length(to_join[[1]]$tip.label) > 1) & (length(to_join[[2]]$tip.label) == 1)){
to_join[[1]] <- addroot(to_join[[1]], coalescent_ages[[i]][[a]] - max(node.depth.edgelength(to_join[[1]])))
to_join[[2]]$edge.length <- coalescent_ages[[i]][[a]]
}
if ((length(to_join[[1]]$tip.label) == 1) & (length(to_join[[2]]$tip.label) > 1)){
to_join[[1]]$edge.length <- coalescent_ages[[i]][[a]]
to_join[[2]] <- addroot(to_join[[2]], coalescent_ages[[i]][[a]] - max(node.depth.edgelength(to_join[[2]])))
}
if ((length(to_join[[1]]$tip.label) > 1) & (length(to_join[[2]]$tip.label) > 1)){
to_join[[1]] <- addroot(to_join[[1]], coalescent_ages[[i]][[a]] - max(node.depth.edgelength(to_join[[1]])))
to_join[[2]] <- addroot(to_join[[2]], coalescent_ages[[i]][[a]] - max(node.depth.edgelength(to_join[[2]])))
}
new <- bind.tree(to_join[[1]], to_join[[2]])
pool <- pool[-c(to_join_index[[1]], to_join_index[[2]])]
pool[[length(pool)+1]] <- new
}
}
}
}
}
gene_trees[[y]] <- pool[[1]]
}
#########
gene_clock_trees <- gene_trees
for (i in 1:length(gene_clock_trees)){
gene_clock_trees[[i]]$edge.length <- gene_clock_trees[[i]]$edge.length*0.05
}
sequences <- vector("list", n_gene_trees)
for (i in 1:length(sequences)){
sequences[[i]] <- simSeq(gene_clock_trees[[i]], l = locus_size, type = "DNA", rate = 1)
}
#########
dir.create("data")
dir.create(paste("data/gene_sequences", sep=""))
for (i in 1:length(file_names)){
write.phyDat(sequences[[i]], file = paste("data/gene_sequences/", file_names[[i]], sep=""), format = "nexus")
}
dir.create(paste("data/gene_trees", sep=""))
for (i in 1:length(file_names)){
write.tree(gene_trees[[i]], file = paste("data/gene_trees/", file_names[[i]], sep=""))
}
#########
concat <- sequences[[1]]
for (i in 2:length(gene_trees)){
concat <- append(concat, sequences[[i]])
}
dir.create("data/concat_sequence")
write.phyDat(concat, file = "data/concat_sequence/concat.nexus", format = "nexus")
#########
species_tree_clades <- vector("list", length(entire_tree$tip.label) - 2)
for (i in 1:length(species_tree_clades)){
species_tree_clades[[i]] <- extract.clade(entire_tree, length(entire_tree$tip.label)+1+i)
}
species_tree_clade_support <- vector("list", length(species_tree_clades))
for (j in 1:length(species_tree_clades)){
for (i in 1:length(gene_trees)){
for (x in 1:(length(gene_trees[[i]]$tip.label)-2)){
if (setequal(extract.clade(gene_trees[[i]], length(gene_trees[[i]]$tip.label)+1+x)$tip.label, species_tree_clades[[j]]$tip.label) == TRUE){
species_tree_clade_support[[j]] <- append(species_tree_clade_support[[j]], 1)
}
}
}
}
for (i in 1:length(species_tree_clade_support)){
if (length(species_tree_clade_support[[i]]) > 0){
species_tree_clade_support[[i]] <- length(species_tree_clade_support[[i]])/length(gene_trees)
}
if (length(species_tree_clade_support[[i]]) == 0){
species_tree_clade_support[[i]] <- 0
}
}
dir.create("data/clade_support")
sink("data/clade_support/clade_support.txt")
print(unlist(species_tree_clade_support))
sink()
#######
}