Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
rajwanir authored Aug 6, 2021
1 parent da759f8 commit c6d6741
Show file tree
Hide file tree
Showing 9 changed files with 351 additions and 0 deletions.
11 changes: 11 additions & 0 deletions src/extras/bonito.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#! /bin/bash
reads_folder=$1
source /gpfs/gsfs11/users/rajwanir2/conda/etc/profile.d/conda.sh
conda activate bonito
mkdir -p $reads_folder/bonito
bonito basecaller dna_r9.4.1 $reads_folder > $reads_folder/bonito/bonito.fasta
module load seqtk
seqtk seq -F '#' $reads_folder/bonito/bonito.fasta | gzip -c > $reads_folder/bonito/bonito.fastq.gz ## adding fake quality scores for conversion
bash src/qcat.sh $reads_folder/bonito
# bonito has the --fastq flag but currently not complete. It will outputs constant quality scores.
#sbatch --partition=gpu --cpus-per-task=14 --mem=16g --gres=lscratch:200,gpu:p100:1 src/bonito.s /gpfs/gsfs11/users/rajwanir2/soil_metagenomics/data/rawdata/vanRsoilactino_10132020/20201013_1905_MN31218_AEZ324_3a6e0404/fast5
11 changes: 11 additions & 0 deletions src/extras/interproscan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
folder='data/rawdata/simulation/3'
input_file="$folder/3.fastq.gz"
orignal_reads=203702
orginal_bases=887484358
orignal_coverage=110
seqtk sample -s100 $input_file $(echo $(echo 60/$orignal_coverage|bc -l)*$orignal_reads|bc -l) > $folder/x60.fq
seqtk sample -s100 $input_file $(echo $(echo 30/$orignal_coverage|bc -l)*$orignal_reads|bc -l) > $folder/x30.fq
seqtk sample -s100 $input_file $(echo $(echo 15/$orignal_coverage|bc -l)*$orignal_reads|bc -l) > $folder/x15.fq
seqtk sample -s100 $input_file $(echo $(echo 7/$orignal_coverage|bc -l)*$orignal_reads|bc -l) > $folder/x7.fq
seqtk sample -s100 $input_file $(echo $(echo 3/$orignal_coverage|bc -l)*$orignal_reads|bc -l) > $folder/x3.fq
18 changes: 18 additions & 0 deletions src/extras/kraken2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
barcode=$1
date='09172020'
##input_file="data/assembly/${date}/$barcode/canu/$barcode.contigs.fasta"
input_file="data/assembly/${date}/$barcode/wtdbg2/assm.ctg.fa"
module load kraken/1.1
kraken --db /fdb/kraken/20200428_bacteria_64GB \
--output data/kraken/$barcode.krakenout \
--threads ${SLURM_CPUS_PER_TASK} \
--fasta-input \
$input_file
##--report data/kraken/$barcode.krakenreport \
##--use-names \
##--use-mpa-style \
#############
#translate
kraken-translate --db /fdb/kraken/20200428_bacteria_64GB \
data/kraken/$barcode.krakenout > data/kraken/$barcode.translated
13 changes: 13 additions & 0 deletions src/extras/phylophlan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#! /bin/bash
source /gpfs/gsfs11/users/rajwanir2/conda/etc/profile.d/conda.sh
conda activate phylophlan
input_genomes=$(find . -path "*data/assembly/*/*/medeka/bar*.contigs.fasta")
cp $input_genomes data/phylophylan/input
phylophlan -i data/phylophylan/input \
-d phylophlan \
-f data/phylophylan/02_tol.cfg \
--diversity high \
--fast \
-o output_tol \
--nproc 16 \
--verbose 2>&1 | tee logs/phylophlan.log
13 changes: 13 additions & 0 deletions src/extras/read_quality.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
library(tidyverse)
# library(nanoR)

guppy_stats = read.csv("data/rawdata/vanRsoilactino_10222020/basecalled_fastq/sequencing_summary.txt",
sep = '\t')

ggplot(data=guppy_stats %>% filter(mean_qscore_template<7),
aes(x=sequence_length_template))+
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 1:15)


guppy_stats %>% group_by(channel) %>% summarize(output = sum(sequence_length_template))
122 changes: 122 additions & 0 deletions src/extras/target_screen_blast.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
library(tidyverse)
library(condformat)
library(grid)
library(gridExtra)


pident_highight = 40
cov_highlight = 70

## samples and paths input ####
# date = "09182020"
# genomes_to_check = c("barcode08", "barcode07", "barcode10")
# genomes_path = lapply(genomes_to_check, function (x)
# sprintf("data/assembly/09182020/%s/medeka/%s.contigs.fasta", x, x))

sample_metadata = read.csv(
"data/metadata.csv",
colClasses = c(
"integer",
"character",
rep("numeric", 5),
"integer",
"character",
"character",
"logical",
"character",
"character"
)
)
genomes_path = Sys.glob(file.path(getwd(), "data/assembly/*/*/medeka/*.contigs.fasta"))
genomes_path = bind_cols(s = str_extract(genomes_path, pattern = "\\w+/barcode\\d+"),
genomes_path = genomes_path) %>% separate(col = "s",
into = c("seq_date", "barcode"),
sep = '/')
genomes_path$barcode = str_remove(genomes_path$barcode, "barcode0|barcode")
sample_metadata = merge(
sample_metadata,
genomes_path,
by.x = c("barcode", "seq_date"),
by.y = c("barcode", "seq_date"),
all.x = F,
all.y = T
)


## blast search ######
blast_cmds = lapply(sample_metadata$genomes_path, function(x)
sprintf(
"tblastn -query GPA_evolution/select_genes/selected_genes.fasta -subject %s -max_target_seqs 1 -outfmt \'6 delim=@ qseqid pident qlen length\'",
x
))
blast_results = lapply(blast_cmds, function(x)
system(x, intern = T) %>% read.table(sep = '@'))

%>% bind_cols(.,))
# blast_results = lapply(blast_results, function(x)
# read.table(text = x, sep = '@'))

for (s in 1:length(sample_metadata$sample)) {
blast_results[[s]] <-
blast_results[[s]] %>% mutate(sample = sample_metadata$sample[s])
}

blast_results = blast_results %>% bind_rows()
colnames(blast_results) <-
c("gene",
"percent_identity",
"query_length",
"alignment_length",
"sample")
blast_results$gene = blast_results$gene %>% str_remove("_\\w+")
blast_results = blast_results %>% distinct(gene, sample, .keep_all = T) # only keeps the first hit
blast_results = blast_results %>% mutate(coverage = alignment_length / query_length * 100) %>% select(-c(alignment_length, query_length))
blast_results = blast_results %>% mutate(across(is.numeric, ~ round(., 1)))

## create tables ####
create_table <- function(blast_result){
table = condformat(blast_result) %>% rule_fill_discrete(sample,
expression = percent_identity > pident_highight & coverage > cov_highlight,
colours = c("TRUE" = "lightgreen")) %>% condformat2grob()
return(table)}

t_blast_results = lapply(genomes_to_check,
function(x){
create_table(blast_results %>% filter(sample == x))
})

png("figures/blast_hits.png", width = 13, height = 5, units = "in", res = 300)
grid.arrange(gtable_combine(t_blast_results[[1]]),
gtable_combine(t_blast_results[[2]]),
gtable_combine(t_blast_results[[3]]),
ncol = length(genomes_to_check))
dev.off()


# t_coverage = blast_results %>% select(-percent_identity) %>% pivot_wider(names_from = sample, values_from =
# coverage)
# t_percent_identity = blast_results %>% select(-coverage) %>% pivot_wider(names_from = sample, values_from =
# percent_identity)
# ## sepearate table for coverage
# p_coverage = condformat(t_coverage) %>% rule_fill_discrete(barcode08,
# expression = barcode08 > 80,
# colours = c("TRUE" = "lightgreen")) %>%
# rule_fill_discrete(barcode07,
# expression = barcode07 > 80,
# colours = c("TRUE" = "lightgreen")) %>%
# rule_fill_discrete(barcode10,
# expression = barcode10 > 80,
# colours = c("TRUE" = "lightgreen")) %>% condformat2grob()
#
# ## separate table for identity
# p_percent_identity = condformat(t_percent_identity) %>% rule_fill_discrete(barcode08,
# expression = barcode08 > 50,
# colours = c("TRUE" = "lightgreen")) %>%
# rule_fill_discrete(barcode07,
# expression = barcode07 > 50,
# colours = c("TRUE" = "lightgreen")) %>%
# rule_fill_discrete(barcode10,
# expression = barcode10 > 50,
# colours = c("TRUE" = "lightgreen")) %>% condformat2grob()
#
# grid.arrange(gtable_combine(p_percent_identity,p_coverage))
144 changes: 144 additions & 0 deletions src/extras/target_screen_blast_v1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
library(tidyverse)
library(gggenes)
library(ggsci)


## samples and paths input ####
# date = "09182020"
# genomes_to_check = c("barcode08", "barcode07", "barcode10")
# genomes_path = lapply(genomes_to_check, function (x)
# sprintf("data/assembly/09182020/%s/medeka/%s.contigs.fasta", x, x))

sample_metadata = read.csv(
"data/metadata.csv",
colClasses = c(
"integer",
"character",
rep("numeric", 5),
"integer",
"character",
"character",
"logical",
"character",
"character"
)
)
genomes_path = Sys.glob(file.path(getwd(), "data/assembly/*/*/medeka/*.contigs.fasta"))
genomes_path = bind_cols(s = str_extract(genomes_path, pattern = "\\w+/barcode\\d+"),
genomes_path = genomes_path) %>% separate(col = "s",
into = c("seq_date", "barcode"),
sep = '/')
genomes_path$barcode = str_remove(genomes_path$barcode, "barcode0|barcode")
sample_metadata = merge(
sample_metadata,
genomes_path,
by.x = c("barcode", "seq_date"),
by.y = c("barcode", "seq_date"),
all.x = F,
all.y = T
)


## blast search ######
query_path = Sys.glob(file.path(getwd(), "GPA_evolution/select_genes/split/*.fasta"))

get_blast_hits = function(q){
blast_cmds = lapply(sample_metadata$genomes_path, function(x)
sprintf(
"tblastn -query %s -subject %s -max_target_seqs 1 -outfmt \'6 delim=@ qseqid pident qlen length sseqid sstart send evalue score\' | head -n1",
q,x
))
blast_results = lapply(blast_cmds, function(x)
system(x, intern = T))

blast_results = lapply(blast_results, function(x)
read.table(text = x, sep = '@')) %>% bind_rows() %>%
rename(target = V1, pident = V2,query_len = V3, alignment_length = V4,
contig = V5, start = V6, end = V7, evalue = V8, score = V9) %>%
bind_cols(.,sample_metadata)

return(blast_results)
}

blast_results = lapply(query_path,get_blast_hits)

blast_results = blast_results %>% bind_rows() %>% separate(col = "target", into = c("gene","source_bgc"), sep = "_")
blast_results = blast_results %>% mutate(
contig_clean = str_remove(blast_results$contig,pattern = "tig(0)\\1+") %>% str_remove(pattern = "_segment\\d+") %>% str_pad(., max(nchar(.)), side = "left", pad = 0)
)
blast_results = blast_results %>% mutate(sample_clean = blast_results$sample %>% str_pad(., max(nchar(.)), side = "right", pad = ' '))

blast_results$contig_full = paste(blast_results$sample_clean,blast_results$contig_clean, sep = '\t')
blast_results = blast_results[order(blast_results$contig_full),]

# blast_results_filtered = blast_results %>% filter(evalue < 1)

p_blast_hits = ggplot(blast_results, aes(xmin = start, xmax = end,
y = contig_full, alpha = pident, fill = gene)) +
geom_gene_arrow(arrowhead_height = unit(20, "mm"), arrowhead_width = unit(20, "mm"), arrow_body_height = unit(15, "mm")) +
facet_wrap(drop = T,~contig_full, scales = "free",ncol = 2, dir = "v") +
theme_genes() +
labs(title = "Targeted BLAST screen - Glycopeptide") +
theme(#axis.text.y = element_blank(),
#axis.text.x = element_blank(),
# strip.text.y.right = element_text(angle = 0),
#strip.text = element_blank(),
axis.text.y = element_text(face = "bold", size = 30, hjust = 0),
axis.title.y = element_blank(),
legend.position="top",legend.text=element_text(face="bold",size=30),
text = element_text(size = 30),
plot.title = element_text(hjust = 0.5, face = "bold")) +
guides(fill = guide_legend(nrow = 1),override.aes = list(size = 5)) +
scale_y_discrete(expand = c(0.01,0.01)) +
scale_fill_jco() + scale_alpha(range =c(0.5,1))

ggsave(p_blast_hits, filename = "figures/p_blast_hits.png", height = 25, width = 49)


## create tables ####
create_table <- function(blast_result){
table = condformat(blast_result) %>% rule_fill_discrete(sample,
expression = percent_identity > pident_highight & coverage > cov_highlight,
colours = c("TRUE" = "lightgreen")) %>% condformat2grob()
return(table)}

t_blast_results = lapply(genomes_to_check,
function(x){
create_table(blast_results %>% filter(sample == x))
})

png("figures/blast_hits.png", width = 13, height = 5, units = "in", res = 300)
grid.arrange(gtable_combine(t_blast_results[[1]]),
gtable_combine(t_blast_results[[2]]),
gtable_combine(t_blast_results[[3]]),
ncol = length(genomes_to_check))
dev.off()


# t_coverage = blast_results %>% select(-percent_identity) %>% pivot_wider(names_from = sample, values_from =
# coverage)
# t_percent_identity = blast_results %>% select(-coverage) %>% pivot_wider(names_from = sample, values_from =
# percent_identity)
# ## sepearate table for coverage
# p_coverage = condformat(t_coverage) %>% rule_fill_discrete(barcode08,
# expression = barcode08 > 80,
# colours = c("TRUE" = "lightgreen")) %>%
# rule_fill_discrete(barcode07,
# expression = barcode07 > 80,
# colours = c("TRUE" = "lightgreen")) %>%
# rule_fill_discrete(barcode10,
# expression = barcode10 > 80,
# colours = c("TRUE" = "lightgreen")) %>% condformat2grob()
#
# ## separate table for identity
# p_percent_identity = condformat(t_percent_identity) %>% rule_fill_discrete(barcode08,
# expression = barcode08 > 50,
# colours = c("TRUE" = "lightgreen")) %>%
# rule_fill_discrete(barcode07,
# expression = barcode07 > 50,
# colours = c("TRUE" = "lightgreen")) %>%
# rule_fill_discrete(barcode10,
# expression = barcode10 > 50,
# colours = c("TRUE" = "lightgreen")) %>% condformat2grob()
#
# grid.arrange(gtable_combine(p_percent_identity,p_coverage))
12 changes: 12 additions & 0 deletions src/extras/wtdbg2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
barcode=$1
date='09172020'
input_file="data/rawdata/vanRsoilactino_${date}/basecalled_fastq/demultiplexed/$barcode.fastq"
output_folder="data/assembly/$date/$barcode/wtdbg2/"
mkdir -p $output_folder
#first stage
~/wtdbg2/wtdbg2 -x ont -g 8m -t 16 \
-i $input_file \
-L 1000 -fo $output_folder/assm --edge-min 2 --rescue-low-cov-edges --node-max 6000 -S1 -X 6000 -K 10000
#second stage
~/wtdbg2/wtpoa-cns -t 16 -i $output_folder/assm.ctg.lay.gz -fo $output_folder/assm.ctg.fa
7 changes: 7 additions & 0 deletions src/extras/wtdbg2_jobsub.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
barcodes='barcode10 barcode08 barcode07 barcode01'
for barcode in $barcodes
do
sbatch --cpus-per-task=16 --output=data/assm/$barcode_slurm.out --error=data/assm/$barcode_slurm.err \
--mem=8g src/wtdbg2.sh $barcode
done

0 comments on commit c6d6741

Please sign in to comment.