forked from lmrodriguezr/enveomics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunus-genome-phylogeny.bash
executable file
·100 lines (85 loc) · 3.22 KB
/
unus-genome-phylogeny.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/bin/bash
#
# @author Luis M. Rodriguez-R
# @update Oct-20-2015
# @license artistic license 2.0
#
ORG=$1 # <- Organism (see help)
THR=2 # <- Number or threads
# This is just the help message
if [[ "$ORG" == "" ]] ; then
echo "
Use case: Unus genome phylogeny of a species. The unus genome is the collection
of orthologous groups in a set of genomes that has exactly one gene per genome,
i.e., the core genome minus in-paralogs.
IMPORTANT
This script is functional, but it's mainly intended for illustrative purposes.
Please take a look at the code first.
Usage:
$0 <organism>
<organism> The organism to use (e.g., Streptococcus_pneumoniae).
" >&2
exit
fi
# 00. Create environment
export PATH=$(dirname $0)/../Scripts:$PATH
if [[ -e $ORG ]] ; then
echo "Cowardly refusing to overwrite $ORG, please remove archive first." >&2
exit 1
fi
mkdir $ORG
for i in 01.proteome 02.rbm 03.ogs 04.aln 05.cat 06.raxml ; do
mkdir $ORG/$i
done
# 01. Download proteomes
echo "[01/06] Downloading and guzipping data"
RefSeq.download.bash $ORG .faa.gz "Complete Genome" $ORG/01.proteome
rm $ORG/01.proteome/assembly_summary.txt
for i in $ORG/01.proteome/* ; do
b=$(basename $i | perl -pe 's/[^A-Za-z0-9]/_/g' | perl -pe 's/_+$//')
for j in $i/*.faa.gz ; do gunzip $j ; done
cat $i/*.faa > $ORG/01.proteome/$b.faa.tmp
FastA.tag.rb -i $ORG/01.proteome/$b.faa.tmp -o $ORG/01.proteome/$b.faa.tmp -d
rm -R $i $ORG/01.proteome/$b.faa.tmp
done
# 02. Reciprocal Best Matches
echo "[02/06] Idenfifying Reciprocal Best Matches"
for i in $ORG/01.proteome/*.faa ; do # <- This nested loop could be parallelized
genomeA=$(basename $i .faa)
for j in $ORG/01.proteome/*.faa ; do
genomeB=$(basename $j .faa)
rbm.rb -1 $i -2 $j -t $THR > $ORG/02.rbm/$genomeA-$genomeB.rbm
[[ "$i" == "$j" ]] && continue # <- Ignore if it simplifies distribution
done
done
# 03. Orthologous Groups
echo "[03/06] Compiling Orthologous Groups"
ogs.mcl.rb -d $ORG/02.rbm -o $ORG/03.ogs/pangenome.ogs -t $THR
# 04. Extract unus genome and align groups
echo "[04/06] Extracting unus genome and aligning OGs"
ogs.extract.rb -i $ORG/03.ogs/pangenome.ogs -s $ORG/01.proteome/%s.faa \
-o $ORG/04.aln/ -c 1 -d 1 -p
for i in $ORG/04.aln/*.fa ; do # <- This loop could be parallelized
b=$(basename $i .fa)
clustalo -i $i -o $ORG/04.aln/$b.aln --threads=$THR
done
# 05. Concatenate alignment
echo "[05/06] Concatenating alignments and removing invariable sites"
Aln.cat.rb -I -c $ORG/05.cat/unus.raxcoords -i - $ORG/04.aln/*.aln \
> $ORG/05.cat/unus.aln 2> $ORG/05.cat/unus.log
# 06. Run RAxML
echo "[06/06] Inferring phylogeny"
# You REALLY should consider running the following with more threads (-T) and,
# if possible, multi-nodes using MPI
cd $ORG/06.raxml
raxmlHPC-PTHREADS -T $THR -p 1234 \
-s ../05.cat/unus.aln -q ../05.cat/unus.raxcoords \
-m PROTCATGTR -n UNUS # IMPORTANT: Please read the documentation of RAxML
# before running this line, so you know
# that you're running what you really
# want. Check options for bootstrapping
# and the different algorithms (-f). Note
# that -m is required, but the file
# unus.raxcoords specifies "AUTO", so
# RAxML will attempt to find the model
# resulting in the highest likelihood.