forked from AdelaideBioinfo/horizontalTransfer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
1a_tblastn_and_extract.sbatch
91 lines (74 loc) · 2.94 KB
/
1a_tblastn_and_extract.sbatch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/bin/bash
# Example usage:
#
# DIR=test_genome \
# DATABASE=YarrowiaLipolytica_ASM252v1.fa \
# QUERY=L1_ORFp.fasta \
# RESULTSDIR=results \
# sbatch 1a_tblastn_and_extract.sbatch
#SBATCH -p short
#SBATCH -N 1
#SBATCH -c 2
#SBATCH --time=12:00:00
#SBATCH --mem=32GB
# Load the necessary modules
module load ncbi-blast
module load bedtools
# Run tblastn
tblastn \
-db "$DIR/$DATABASE" \
-num_threads 2 \
-query test_query/"$QUERY" \
-out "$DATABASE"_"$QUERY".out \
-outfmt "6 qseqid sseqid bitscore qcovs evalue pident length mismatch gapopen qstart qend qseq sstart send sseq" \
-evalue 1e-5 \
-show_gis \
2> "$DATABASE"_"$QUERY".log
# Sort output
cat "$DATABASE"_"$QUERY".out \
| sort -nr -k3 \
> "$DATABASE"_"$QUERY".out.sorted
# Put the target seq hits in FASTA format
# i.e, remove gaps ("-") from the target sequence (column 15)
# then print out all of the target seqs, with the header in the form:
# >sseqid:sstart-send qseqid:qstart-qend score:*,qcov:*,evalue:*,pident:*
cat "$DATABASE"_"$QUERY".out.sorted \
| awk -F" " '{gsub(/[-]/,"",$15)}1' OFS=" " \
| awk '{print ">" $2 ":" $13 "-" $14 "\t" $1 ":" $10 "-" $11 "\t" "score:" $3 ",qcov:" $4 ",eval:" $5 ",pid:" $6 "\n" $15}' \
| sed 's/ref|//g' \
| sed 's/|:/:/g' \
> "$DATABASE"_"$QUERY"_sseq.fasta
# Note: This output is the protein hit sequences, not the nucleotide sequences
# So now, extract nucleotide FASTA sequence for each hit
# Grab the headers from the amino acid FASTA file
grep '>' "$DATABASE"_"$QUERY"_sseq.fasta > "$DATABASE"_"$QUERY"_sseq_headers.txt
# Pull out all the plus strand sequences
cat "$DATABASE"_"$QUERY"_sseq_headers.txt \
| cut -f 1 \
| sed 's/>//g' \
| sed 's/:/\t/g' \
| awk 'BEGIN {OFS=FS="\t"} {gsub(/\-/,"\t",$2)}1' \
| awk '{if ($2 < $3) print $1 "\t" $2 "\t" $3 "\t" "L1hit" "\t" "1" "\t" "+"}' \
> "$DATABASE"_"$QUERY"_plus.txt
# Pull out all the minus strand sequences
cat "$DATABASE"_"$QUERY"_sseq_headers.txt \
| cut -f 1 \
| sed 's/>//g' \
| sed 's/:/\t/g' \
| awk 'BEGIN {OFS=FS="\t"} {gsub(/\-/,"\t",$2)}1' \
| awk '{if ($2 > $3) print $1 "\t" $3 "\t" $2 "\t" "L1hit" "\t" "1" "\t" "-"}' \
> "$DATABASE"_"$QUERY"_minus.txt
# Combine minus and plus strand sequences
cat "$DATABASE"_"$QUERY"_plus.txt "$DATABASE"_"$QUERY"_minus.txt > "$DATABASE"_"$QUERY"_all.txt
# Sort and merge
bedtools sort -i "$DATABASE"_"$QUERY"_all.txt \
| bedtools merge -s -i - -c 4,5,6 -o distinct,distinct,distinct > "$DATABASE"_"$QUERY"_merged.bed
# Extract FASTA from corrected merged BED file
bedtools getfasta -s -fi "$DIR/$DATABASE" -bed "$DATABASE"_"$QUERY"_merged.bed -fo "$DATABASE"_"$QUERY"_results.fasta
# Sort sequences by length
usearch -sortbylength "$DATABASE"_"$QUERY"_results.fasta -fastaout "$DATABASE"_"$QUERY"_nucl_seqs.fasta
# Move output bed and fasta to results folder
mv "$DATABASE"_"$QUERY"_nucl_seqs.fasta $RESULTSDIR
mv "$DATABASE"_"$QUERY"_merged.bed $RESULTSDIR
# Remove or move remaining files to a temporary folder
rm "$DATABASE"_"$QUERY"*