-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtree_props.R
242 lines (191 loc) · 7.71 KB
/
tree_props.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# disable scientific notation
options(scipen=999)
# install required libraries
# uncomment if you don't have these installed
#install.packages("ape")
#install.packages("seqinr")
#install.packages("data.table")
# load needed libraries
library("ape")
library("seqinr")
library("data.table")
### SET WORKING DIRECTORY ###
setwd("/media/mlb/Seagate Backup Plus Drive/Good_genes/new")
trees_dir <- file.path("./trees_rerooted/")
aln_dir <- file.path("./masked_loci/")
trees_files <- dir(path=trees_dir, pattern="*tre")
aln_files <- dir(path=aln_dir, pattern="*fas")
tree_regex <- "([0-9]+).tre"
aln_regex <- "([0-9]+).fas"
### AVERAGE BOOTSTRAP SUPPORT ###
# This code should work with any Newick tree files
# that have some measure of support at nodes,
# including PP from Bayesian analysis.
# define a function to calculate average support
Avg_support <- function(file) {
# get UCE number from filename
locus_no <- sub(tree_regex, "\\1", perl=TRUE, x=file)
# read in tree
tree <- read.tree(paste(trees_dir,file, sep=""))
# store support values in a vector
support <- c(as.numeric(tree$node.label))
# calculate average support
avg_supp <- mean(support, na.rm=T)
return(c(locus_no,avg_supp))
}
# loop over all files
average_bootstrap <- lapply(trees_files, Avg_support)
average_bootstrap <- data.frame(matrix(unlist(average_bootstrap), nrow=(length(average_bootstrap)), byrow=T))
colnames(average_bootstrap) <- c("Locus", "Average_bootstrap")
write.csv(average_bootstrap, file="avg_bootstrap.csv")
### AVERAGE BRANCH LENGTHS ###
# This takes a Newick tree with branch lengths
# and returns the number of tips for each tree,
# and calculates average branch length.
Br_length.trees <- function(file) {
# reads the phylogenetic tree
tree <- read.tree(paste(trees_dir,file, sep=""))
# gets number of tips
no_tips <- length(tree$tip.label)
# calculate avg branch length
avg_br_length <- mean(tree$edge.length)
# get UCE number from filename
locus_no <- sub(tree_regex, "\\1", perl=TRUE, x=file)
return(c(locus_no,avg_br_length))
}
# loop over all files
br_lengths <- lapply(trees_files, Br_length.trees)
br_lengths <- data.frame(matrix(unlist(br_lengths), nrow=(length(br_lengths)), byrow=T))
colnames(br_lengths) <- c("Locus", "Average_branch_length")
write.csv(br_lengths, file="avg_br_len.csv")
### CLOCKLIKENESS ###
# define a function to calculate coefficient of variation
# of root to tip distances for the root minimizing the coefficient
Clocklikeness <- function(file) {
# get UCE number from filename
locus_no <- sub(tree_regex, "\\1", perl=TRUE, x=file)
# read in tree
tree <- read.tree(paste(trees_dir,file, sep=""))
# record coefficient for all possible outgroups
CV <- c()
taxa <- tree$tip.label
for (taxon in taxa) {
# root tree
rooted_tr <- root(phy=tree,outgroup=taxon,resolve.root=T)
# get matrix diagonal of phylogenetic variance-covariance matrix
# these are your distances from root
root_dist <- diag(vcv.phylo(rooted_tr))
std_dev_root_dist <- sd(root_dist)
mean_root_dist <- mean(root_dist)
CV <- c(CV, (std_dev_root_dist/mean_root_dist)*100)
}
# get lowest CV
minCV <- min(CV)
print(minCV)
return(c(locus_no,minCV))
}
# loop over all files
cv_clocklikeness <- lapply(trees_files, Clocklikeness)
cv_clocklikeness <- data.frame(matrix(unlist(cv_clocklikeness), nrow=(length(cv_clocklikeness)), byrow=T))
colnames(cv_clocklikeness) <- c("Locus", "Clocklikeness")
write.csv(cv_clocklikeness, "cv_clocklikeness.csv")
### SATURATION ###
# The code below takes as input FASTA alignments and tree files with branch lengths
# and calculates regression slope and r-squared for each locus
# also saving regression plots for each locus
# in a newly created 'Saturation_plots' folder
# define function to perform regression,
# calculate R-squared, and save saturation plots
# needs fasta alignments and
# corresponding raxml's tree files
Saturation <- function(seq, tree) {
# read in alignment
alignment <- read.alignment(file=seq, format="fasta")
# read in tree
tree <- read.tree(tree)
# get locus number from filename
# this has to be different regex from other functions used here
# because file names here include directory
locus_no <- sub(".+?([0-9]+).fas", "\\1", perl=TRUE, x=seq)
# matrix with pairwise identity
mat <- dist.alignment(alignment, matrix="identity")
# matrix with uncorrected p-distances
p_mat <- mat*mat
# mean p-distance
avg_p_mat <- mean(p_mat, na.rm=TRUE)
# make matrix of pairwise distances in branch lengths from the tree
cophentr <- cophenetic(tree)
# store as matrix objects
mat_mat <- as.matrix(mat)
mat_p_mat <- as.matrix(p_mat)
# order p-distance matrix by names
mat_p_mat <- mat_p_mat[order(row.names(mat_p_mat)),order(row.names(mat_p_mat))]
mat_co <- as.matrix(cophentr)
# order pairwise distances matrix by names
mat_co <- mat_co[order(row.names(mat_co)),order(row.names(mat_co))]
# get lower triangulars of both matrices
branch_dist <- mat_co[lower.tri(mat_co)]
p_dist <- mat_p_mat[lower.tri(mat_p_mat)]
# perform simple linear regression
regress <- lm(p_dist ~ branch_dist)
# get slope
slope <- coef(regress)[2]
# get r-squared
Rsquared <- summary(regress)$r.squared
# plot branch length pairwise distances on x
# and uncorrected p-distances on y
# open png file
png(file=paste(sat_dir, locus_no, "-saturation.png", sep=""), width=600, height=600)
plot(branch_dist, p_dist)
# add simple linear regression line
abline(lm(p_dist ~ branch_dist), col="red")
# give title as locus number and subtitle as tree length
title(main=locus_no,sub=paste("Slope: ", round(slope, digits=3), " R-squared: ", round(Rsquared, digits=3), sep=""), cex.sub=1.25)
# close png file
dev.off()
return(list(locus_no, avg_p_mat, slope, Rsquared))
}
# get current working directory
work_dir <- getwd()
# create a folder for saturation plots
dir.create("./saturation_plots")
sat_dir <- file.path(work_dir, "saturation_plots/")
# create a table with file names
files_table <- as.data.frame(cbind(paste(aln_dir, aln_files, sep=""), paste(trees_dir, trees_files, sep="")))
saturation_table <- t(mapply(Saturation, as.matrix(files_table$V1), as.matrix(files_table$V2)))
saturation_table <- as.data.frame(saturation_table)
colnames(saturation_table) <- c("Locus","Avg_p_dist","Slope","R_squared")
row.names(saturation_table) <- NULL
ds.saturation <- data.frame(lapply(saturation_table, as.character), stringsAsFactors=F)
write.csv(ds.saturation, "saturation_table.csv")
### PLOTTING GENE TREES ###
# This will save png files of 600 x 600 pixel plots
# of all unrooted phylogenetic trees with tip labels
# and support values
Plot_trees <- function(file) {
# reads the phylogenetic tree
tree <- read.tree(paste(trees_dir,file, sep=""))
# extracts plot name (locus) from file name
plot_name <- sub(tree_regex, "\\1", perl=TRUE, x=file)
# open png file
png(file=paste(tree_plots_dir, plot_name, "-tree.png", sep=""), width=600, height=1200)
plot.phylo(tree, show.node.label=T)
# give title as locus number and subtitle as tree length
title(main=plot_name)
# close png file
dev.off()
}
# create directory for tree plots
dir.create("./tree_plots")
tree_plots_dir <- file.path(work_dir, "tree_plots/")
#loop over all files
lapply(trees_files, Plot_trees)
# putting together all the data
dfs <- list(average_bootstrap, br_lengths, cv_clocklikeness, saturation_table)
Multmerge <- function(dfs){
datalist <- lapply(dfs, function(x){data.frame(x)})
Reduce(function(x,y) {merge(x,y)}, datalist)
}
all_loci_stats <- Multmerge(dfs)
all_loci_stats <- data.frame(lapply(all_loci_stats, as.character), stringsAsFactors=FALSE)
write.csv(all_loci_stats, file="tree_stats_table.csv", quote=FALSE, row.names=FALSE)