forked from tomas-fer/HybPhyloMaker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHybPhyloMaker2_readmapping.sh
267 lines (251 loc) · 8.9 KB
/
HybPhyloMaker2_readmapping.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#!/bin/bash
#----------------MetaCentrum----------------
#PBS -l walltime=2d
#PBS -l nodes=1:ppn=4
#PBS -j oe
#PBS -l mem=4gb
#PBS -l scratch=80gb
#PBS -N HybPhyloMaker2_readmapping
#PBS -m abe
#-------------------HYDRA-------------------
#$ -S /bin/bash
#$ -pe mthread 12
#$ -q sThC.q
#$ -cwd
#$ -j y
#$ -N HybPhyloMaker2_readmapping
#$ -o HybPhyloMaker2_readmapping.log
# ********************************************************************************
# * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building *
# * Script 02 - Read mapping using bowtie2 *
# * v.1.3.2 *
# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 *
# * tomas.fer@natur.cuni.cz *
# ********************************************************************************
#Complete path and set configuration for selected location
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "\nHybPhyloMaker2 is running on MetaCentrum...\n"
#settings for MetaCentrum
#Copy file with settings from home and set variables from settings.cfg
cp $PBS_O_WORKDIR/settings.cfg .
. settings.cfg
. /packages/run/modules-2.0/init/bash
path=/storage/$server/home/$LOGNAME/$data
source=/storage/$server/home/$LOGNAME/HybSeqSource
#Move to scratch
cd $SCRATCHDIR
#Add necessary modules
module add bowtie2-2.2.4
#module add bcftools-1.3.1
module add samtools-1.3
module add perl-5.10.1
module add gcc-4.8.4
module add ococo-2016-11
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "\nHybPhyloMaker2 is running on Hydra...\n"
#settings for Hydra
#set variables from settings.cfg
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir -p workdir02
cd workdir02
#Add necessary modules
module load bioinformatics/bowtie2/2.2.6
module load bioinformatics/samtools/1.3
else
echo -e "\nHybPhyloMaker2 is running locally...\n"
#settings for local run
#set variables from settings.cfg
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir -p workdir02
cd workdir02
fi
#Setting for the case when working with cpDNA
if [[ $cp =~ "yes" ]]; then
echo -e "Working with cpDNA\n"
type="cp"
else
echo -e "Working with exons\n"
type="exons"
fi
#Test if 'workdir' exist
if [[ ! $location == "1" ]]; then
if [ "$(ls -A ../workdir02)" ]; then
echo -e "Directory 'workdir02' already exists and is not empty. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir02/ 2>/dev/null
exit 3
fi
fi
#Test if folders for results exist
if [ -d "$path/$type/21mapped" ] && [[ $mapping =~ "yes" ]]; then
echo -e "Directory '$path/$type/21mapped' already exists. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir02/ 2>/dev/null
exit 3
elif [ -d "$path/$type/30consensus" ]; then
echo -e "Directory '$path/$type/30consensus' already exists. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir02/ 2>/dev/null
exit 3
fi
#Test data structure
echo -en "Testing input data structure..."
if [ -f "$path/10rawreads/SamplesFileNames.txt" ]; then
#Copy SamplesFileNames.txt and modify it
cp $path/10rawreads/SamplesFileNames.txt .
#Add LF at the end of last line in SamplesFileNames.txt if missing
sed -i.bak '$a\' SamplesFileNames.txt
#Delete empty lines from SamplesFileNames.txt (if any)
sed -i.bak2 '/^$/d' SamplesFileNames.txt
rm *bak*
for sample in $(cat SamplesFileNames.txt); do
if [ $mapping == "yes" ]; then
if [ ! -d "$path/20filtered/$sample" ]; then #Test if each samples-specific folder exists
echo -e "Directory $sample does not exist within '20filtered'.\n"
rm SamplesFileNames.txt
rm -d ../workdir02/ 2>/dev/null
exit 3
else
if [ ! -f "$path/20filtered/$sample/${sample}-1P_no-dups.fastq" ] || [ ! -f "$path/20filtered/$sample/${sample}-2P_no-dups.fastq" ] || [ ! -f "$path/20filtered/$sample/${sample}-1U" ] || [ ! -f "$path/20filtered/$sample/${sample}-2U" ]; then #Test if filtered FASTQ files exist
echo -e "Appropriate filtered fastq files missing in $sample folder...\n"
rm SamplesFileNames.txt
rm -d ../workdir02/ 2>/dev/null
exit 3
fi
fi
else
if [ ! -f "$path/$type/21mapped/${sample}.bam" ]; then
echo -e "$sample.bam does not exist within '$type/21mapped'.\n"
rm SamplesFileNames.txt
rm -d ../workdir02/ 2>/dev/null
exit 3
fi
fi
done
if [[ $cp =~ "yes" ]]; then
if [ ! -f "$source/$cpDNACDS" ]; then
echo -e "'$cpDNACDS' is missing in 'HybSeqSource'. Exiting...\n"
rm -d ../workdir03/ 2>/dev/null
exit 3
else
cpDNACDS=$(echo $cpDNACDS | cut -d"." -f1)
if [ ! -f "$source/${cpDNACDS}_with${nrns}Ns_beginend.fas" ]; then
echo -e "${cpDNACDS}_with${nrns}Ns_beginend.fas does not exist within 'HybSeqSource'. Run HybPhyloMaker0b_preparereference.sh first.\n"
rm SamplesFileNames.txt
rm -d ../workdir02/ 2>/dev/null
exit 3
fi
fi
else
if [ ! -f "$source/$probes" ]; then
echo -e "$probes does not exist within 'HybSeqSource'.\n"
rm SamplesFileNames.txt
rm -d ../workdir02/ 2>/dev/null
exit 3
else
probes=$(echo $probes | cut -d"." -f1)
if [ ! -f "$source/${probes}_with${nrns}Ns_beginend.fas" ]; then
echo -e "${probes}_with${nrns}Ns_beginend.fas does not exist within 'HybSeqSource'. Run HybPhyloMaker0b_preparereference.sh first.\n"
rm SamplesFileNames.txt
rm -d ../workdir02/ 2>/dev/null
exit 3
fi
fi
fi
else
echo -e "List of samples (SamplesFileNames.txt) is missing. Should be in 10rawreads...\n"
rm -d ../workdir02/ 2>/dev/null
exit 3
fi
echo -e "OK for running HybPhyloMaker2\n"
#Make a new folder for results
mkdir -p $path/$type
#Copy pseudoreference
if [[ $cp =~ "yes" ]]; then
probes=$cpDNACDS
fi
probes=$(echo $probes | cut -d"." -f1)
cp $source/${probes}_with${nrns}Ns_beginend.fas .
#Make a new folder for results
if [ ! -d "$path/$type/21mapped" ]; then
mkdir $path/$type/21mapped
fi
#Make index from pseudoreference
if [[ $mapping =~ "yes" ]]; then
echo -en "Indexing pseudoreference..."
bowtie2-build ${probes}_with${nrns}Ns_beginend.fas pseudoreference.index 1>indexing_pseudoreference.log
cp indexing_pseudoreference.log $path/$type/21mapped/
fi
#Copy list of samples
cp $path/10rawreads/SamplesFileNames.txt .
#A loop to process all samples in folders named as specified in SamplesFileNames.txt
numberfiles=$(cat SamplesFileNames.txt | wc -l)
calculating=0
for file in $(cat SamplesFileNames.txt); do
calculating=$((calculating + 1))
echo -e "\nProcessing sample $file ($calculating out of $numberfiles)"
#set parameters for mapping
score=G,20,8
#sensitive mapping
if [[ $mapping =~ "yes" ]]; then
#copy fastq files
cp $path/20filtered/${file}/${file}-1P_no-dups.fastq .
cp $path/20filtered/${file}/${file}-2P_no-dups.fastq .
cp $path/20filtered/${file}/${file}-1U .
cp $path/20filtered/${file}/${file}-2U .
echo "Mapping..."
#Bowtie2 parameters are derived from --very-sensitive-local
bowtie2 --local -D 20 -R 3 -N 0 -L 10 -i S,1,0.50 --score-min $score -x pseudoreference.index -1 ${file}-1P_no-dups.fastq -2 ${file}-2P_no-dups.fastq -U ${file}-1U,${file}-2U -S ${file}.sam 2>${file}_bowtie2_out.txt
#create SAM from BAM
echo "Converting to BAM..."
samtools view -bS -o ${file}.bam ${file}.sam
#copy results to home
cp ${file}.bam $path/$type/21mapped
cp ${file}_bowtie2_out.txt $path/$type/21mapped
else
echo "Copying BAM..."
cp $path/$type/21mapped/${file}.bam .
fi
#CONSENSUS USING OCOCO
echo "Making consensus with OCOCO..."
ococo -i ${file}.bam -c $mincov -F ${file}.fasta 2>/dev/null
#change name in fasta file
sed -i '1d' ${file}.fasta #delete first line
sed -i "1i >$file" ${file}.fasta #insert fasta header as a first line
#Remove line breaks from fasta file
awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' ${file}.fasta > tmp && mv tmp ${file}.fasta
#put $nrns Ns to variable 'a' and $nrns ?s to variable 'b'
a=$(printf "%0.sN" $(seq 1 $nrns))
b=$(printf "%0.s?" $(seq 1 $nrns))
#replace all Ns separating exons by '?'
sed -i "s/$a/$b/g" ${file}.fasta
#copy results to home
cp ${file}.fasta $path/$type/21mapped
#delete BAM
rm ${file}.bam
done
#Combine all fasta file into one
cat *.fasta > consensus.fasta
#Remove line breaks from fasta file
awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' consensus.fasta > tmp && mv tmp consensus.fasta
mkdir -p $path/$type/30consensus
if [[ $cp =~ "yes" ]]; then
mv consensus.fasta consensus_cpDNA.fasta
cp consensus_cpDNA.fasta $path/$type/30consensus
else
cp consensus.fasta $path/$type/30consensus
fi
#Clean scratch/work directory
if [[ $PBS_O_HOST == *".cz" ]]; then
#delete scratch
if [[ ! $SCRATCHDIR == "" ]]; then
rm -rf $SCRATCHDIR/*
fi
else
cd ..
rm -r workdir02
fi
echo -e "\nScript HybPhyloMaker2 finished...\n"