-
Notifications
You must be signed in to change notification settings - Fork 1
/
envision_figure_code.Rmd
140 lines (98 loc) · 7.69 KB
/
envision_figure_code.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
---
title: "envision_figures_veg"
author: "VEG"
date: "04/04/2017"
output: html_document
---
Load necessary libraries
```{r}
library('ggplot2')
library('xlsx')
library('reshape2')
```
Read in large scale mutagenesis data sets
```{r}
training <- read.csv('~/data/dmsTraining_2017-02-20.csv',header = TRUE)
training1 <-training[which( training$mut_type =='missense'),]
```
Figure 1. Deep mutational data and annotations used to train Envision
```{r}
protein_counts <- summary(factor(training1$dms_id))
summ <- as.data.frame(protein_counts)
names <- rownames(summ)
summ_n <- cbind(summ,names)
## panel A: Filtered mutational counts
ggplot(data = summ_n, aes(x = names, y = protein_counts)) + geom_bar(stat = "identity",fill = "#253494") + xlab("Large-scale mutagenesis data sets") + ylab("Absolute frequency") + coord_flip() + theme(axis.text=element_text(size=14, color = 'black'), axis.title=element_text(size=14,face="bold", color = 'black'))
## panel B: Mutational completeness of each data set
dataset_mut_count <- aggregate(training1$position, list(training1$dms_id), length)
min_position <- aggregate(training1$position, list(training1$dms_id), min)
max_position <- aggregate(training1$position, list(training1$dms_id), max)
position_length <- max_position$x-min_position$x+1
mutation_density <- dataset_mut_count$x/(position_length*19)
nam<-dataset_mut_count$Group.1
names(mutation_density) <-nam
mutation_density.f <- melt(mutation_density,)
names <- rownames(mutation_density.f)
mutation_density.f1 <- cbind(mutation_density.f,names)
mut_densityBar <- ggplot(mutation_density.f1 , aes(x = names,y = value)) + geom_bar(stat = "identity", fill = "#253494")+ theme(axis.text=element_text(size=14,color='black'),axis.title=element_text(size=14,face="bold",color='black')) + coord_flip()
mut_densityBar + xlab("Large-scale mutagenesis data sets") + ylab("Mutational completeness")
## panel C: distribution of normalized scores
myPal = c('#9e0142','#d53e4f','#f46d43','#fdae61','#fee08b','#ffffbf','#e6f598','#abdda4','#66c2a5','#3288bd','#5e4fa2','#3f007d', 'gray', 'black')
scaledEffectHistogram <- ggplot(training1, aes(x=scaled_effect1, y = (..count..)/sum(..count..),fill = dms_id)) + geom_histogram(color = "black") + xlim(0,1.5) + theme(axis.text=element_text(size=14, color = 'black'), axis.title=element_text(size=14,face="bold")) + scale_fill_manual(values= myPal)+ xlab("Normalized variant effect score") + ylab("Frequency of variants") + scale_y_continuous(labels = scales::percent, limits = c(0,0.25))
scaledEffectHistogram
## Supplementary figure 1: Reported fitness distribution
reportedEffectHistogram <- ggplot(training1, aes(x=reported_fitness, y = (..count..)/sum(..count..),fill = dms_id)) + geom_histogram(color = NA) + xlim(-10,2) + theme(axis.text=element_text(size=14, color = 'black'), axis.title=element_text(size=14,face="bold")) + scale_fill_manual(values = myPal)+ xlab("Reported variant effect score") + ylab("Percent of variants") + scale_y_continuous(labels = scales::percent, limits = c(0,0.25))
reportedEffectHistogram
## Figure 1 panel D: Feature completeness
colnames(training1)
## count the NAs in each column
na_count <-sapply(training1, function(y) sum(length(which(is.na(y)))))
total <- length(training1$id3)
## Calculate featue completeness
completeness <- (total-na_count)/total
completeness1 <- as.data.frame(completeness)
## Remove non-features
completeness2 <- completeness1[c(14,15,22:44,47),,drop = FALSE]
names<- c("WT amino acid", "MT amino acid", "WT polarity", "MT polarity", "WT pI", "MT pI", "pI change", "WT weight", "MT weight", "Weight change", "WT volume", "MT volume", "Volume change", "Grantham", "WT likelihood", "MT likelihood", "Likelihood change", "Solvent accessibility", "Secondary structure","Phi-psi","Accessibility change", "B factor", "MSA Substitution score", "MT MSA Substitution score","Homolog with MT", "Evolutionary coupling")
completeness3 <- cbind(names, completeness2)
## Categorize the features
completeness3$group <- NA
completeness3$group <- c("physicochemical","physicochemical","physicochemical","physicochemical","physicochemical","physicochemical","physicochemical","physicochemical","physicochemical","physicochemical","physicochemical","physicochemical","physicochemical","physicochemical","evolutionary","evolutionary","evolutionary","structural","structural","structural","structural","structural","evolutionary","evolutionary","evolutionary","evolutionary")
## Graph completeness
ggplot(completeness3, aes(x=reorder(names,-completeness),y = completeness, fill = group)) + geom_bar(stat = "identity", color = "black") + theme(axis.text=element_text(size=14, color = 'black'), axis.title=element_text(size=14,face="bold")) + scale_fill_brewer(type = "seq", palette = "YlGnBu") + theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "top") + xlab("Feature") + ylab("Completeness")
```
Figure 2B: Pab1 Heatmap W/ observed and predicted scores
```{r}
pab1 <- read.csv("~/data/pab1_out.csv", header = TRUE)
ggplot(pab1, aes(x = position, y = aa2, fill = predictions)) + geom_tile(aes(color = predicted), size =0.5) + theme(axis.text=element_text(size=14, color = 'black'), axis.title=element_text(size=14,face="bold")) + theme(axis.text.x = element_text(angle = 0, hjust = 1), legend.position = "top") + ylab("Mutant Amino Acid") + xlab("Position") + scale_fill_distiller( direction = 1,palette = "YlGnBu") + scale_color_manual( values = c("gray","black"))
```
P53 amino aid correlation analysis
```{r}
## Supplementary figure 3G
## Heatmap showing the predictive performance of predictors for different mutant amino acid types
data <- read.csv('~/data/P53_correlationByAminoAcid.csv', header = T )
#library(reshape2)
data1 <- melt(data)
ggplot(data1, aes(x = variable, y = aa2)) + geom_tile(aes(fill = value)) + theme(axis.text=element_text(size=14, color = 'black'), axis.title=element_text(size=14,face="bold")) + theme(axis.text.x = element_text(angle = 55, hjust = 1), legend.position = "top") + ylab("Mutant Amino Acid") + xlab("Predictor") + scale_fill_distiller( direction = 1,palette = "PuRd")
## Figure 3E
## Violin plot showing the distribution of correlation coeffients between predictions and P53 scores across predictors
#install.packages("devtools")
#devtools::install_github("slowkow/ggrepel")
library(ggrepel)
colors = c("#225ea8","#41b6c4", "#7fcdbb","#c7e9b4")
ggplot(data1, aes(x = variable, y = value)) + geom_violin(aes(fill = variable)) + geom_text(aes(label = aa2 ), position=position_jitter(width=.35,height=0), size = 4) + theme( axis.text=element_text(size=14, color = 'black'), axis.title=element_text(size=14,face="bold")) + theme(axis.text.x = element_text(angle = 55, hjust = 1), legend.position = "top") + ylab("Correlation") + xlab("Predictor") + scale_fill_manual(values=colors) + geom_hline(yintercept = median(data$Envision)) + ylim(c(0,0.85))
## Supplementary figure 3H
## Heatmap for wild-type amino acid mutations
data <- read.csv('~/data/P53_correlationByAminoAcid_WT.csv', header = T )
data1 <- melt(data)
ggplot(data1, aes(x = variable, y = WT.AA)) + geom_tile(aes(fill = value)) + theme(axis.text=element_text(size=14, color = 'black'), axis.title=element_text(size=14,face="bold")) + theme(axis.text.x = element_text(angle = 55, hjust = 1), legend.position = "top") + ylab("Wild-type Amino Acid") + xlab("Predictor") + scale_fill_distiller( direction = 1,palette = "PuRd")
```
Supplementary figure 3I
```{r}
clinvar2 <- read.csv('~/data/clinvar_predicted_2017-03-21.csv', header = TRUE)
library('pROC')
plot(roc(clinvar2$effect, clinvar2$Envision_predictions),col = "#081d58")
lines(roc(clinvar2$effect, clinvar2$Cscore),col = "#225ea8")
lines(roc(clinvar2$effect, clinvar2$SIFTval),col = "#7fcdbb")
lines(roc(clinvar2$effect, clinvar2$PolyPhenVal),col = "#edf8b1")
```