-
Notifications
You must be signed in to change notification settings - Fork 4
/
vcfToPseudoref.sh
executable file
·292 lines (279 loc) · 14.4 KB
/
vcfToPseudoref.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/bin/bash
SAMPLE="$1"
#SAMPLE: Prefix used for all intermediate and output files in the pipeline
REFERENCE="$2"
#REFERENCE: Path to the FASTA used as a reference for mapping
CALLER="$3"
#CALLER: Short name for the variant caller used
# e.g. HC for GATK HaplotypeCaller, MPILEUP for samtools mpileup and bcftools call
SPECIAL="$4"
#SPECIAL: Special options indicating input files to use, e.g. no_markdup, no_IR
if [[ $SPECIAL =~ 'jointgeno' ]]; then
JOINTPREFIX="$5"
FILTERSTR="${@:6}"
else
JOINTPREFIX=""
if [[ -z "${@:5}" ]]; then
FILTERSTR="${@:4}"
else
FILTERSTR="${@:5}"
fi
fi
echo "${FILTERSTR}"
#JOINTPREFIX: Prefix for joint genotyping VCF, only passed in if SPECIAL
# contains 'jointgeno'
#FILTERSTR: Expression string to use for filtering sites -- JEXL string
# used for -select option in GATK SelectVariants for masked sites, or an
# expression string for use with bcftools filter --include
#If SPECIAL is empty, bash won't parse it as $4, instead FILTERSTR will be $4
#Check if we want to mask positives or all sites around indels:
INDELWINDOW=""
INDELMASKRE='(indelmaskp?)_([0-9]+)'
if [[ ${SPECIAL} =~ $INDELMASKRE ]]; then
INDELMASKTYPE="${BASH_REMATCH[1]}"
INDELWINDOW="${BASH_REMATCH[2]}"
echo "Masking ${INDELWINDOW} bp around each indel for sample ${SAMPLE}"
fi
OUTPUTDIR=""
if [[ ${SAMPLE} =~ \/ ]]; then #If the prefix has a path
OUTPUTDIR="`dirname ${SAMPLE}`/"
SAMPLE=`basename ${SAMPLE}`
fi
mkdir -p ${OUTPUTDIR}logs
JOINTOUTPUTDIR=""
if [[ ${JOINTPREFIX} =~ \/ ]]; then #If the joint VCF prefix has a path
JOINTOUTPUTDIR="`dirname ${JOINTPREFIX}`/"
JOINTPREFIX=`basename ${JOINTPREFIX}`
fi
NOMARKDUP=""
REALIGNED=""
#Check that the input VCF file is there:
if [[ $SPECIAL =~ "no_markdup" ]]; then
NOMARKDUP="_nomarkdup"
fi
if [[ ! $SPECIAL =~ "no_IR" ]]; then
REALIGNED="_realigned"
fi
if [[ $CALLER =~ "HC" ]]; then
VCFSUFFIX="_HC_GGVCFs.vcf"
elif [[ $CALLER =~ "MPILEUP" ]]; then
VCFSUFFIX="_mpileupcall.vcf.gz"
else
echo "Unable to determine VCF suffix for variant caller ${CALLER}"
exit 2
fi
if [[ -z "${JOINTPREFIX}" ]]; then
OUTPREFIX="${SAMPLE}${NOMARKDUP}${REALIGNED}_${CALLER}"
INPUTVCF="${OUTPUTDIR}${SAMPLE}${NOMARKDUP}${REALIGNED}${VCFSUFFIX}"
else
OUTPREFIX="${SAMPLE}${NOMARKDUP}${REALIGNED}_${CALLER}_joint"
# echo "VCFSUFFIX is ${VCFSUFFIX}"
if [[ ${VCFSUFFIX} =~ gz$ ]]; then
VCFSUFFIX=".vcf.gz"
else
VCFSUFFIX=".vcf"
fi
INPUTVCF="${JOINTOUTPUTDIR}${JOINTPREFIX}${NOMARKDUP}${REALIGNED}_${CALLER}_joint${VCFSUFFIX}"
echo "Using jointly-genotyped VCF ${INPUTVCF} with suffix ${VCFSUFFIX}"
fi
if [[ ! -e "${INPUTVCF}" ]]; then
if [[ $CALLER =~ "HC" && -e "${INPUTVCF}.gz" ]]; then
INPUTVCF="${INPUTVCF}.gz"
else
echo "Unable to find input VCF ${INPUTVCF} for variant caller ${CALLER}"
exit 3
fi
fi
LOGPREFIX="${OUTPUTDIR}logs/${OUTPREFIX}"
INTPREFIX="${OUTPUTDIR}${OUTPREFIX}"
#SPECIAL options may indicate cleanup of intermediate files,
# but not log files:
if [[ $SPECIAL =~ "cleanup" ]]; then
if [[ $CALLER =~ "HC" ]]; then
INDELMASKINTERVALS="${INTPREFIX}_indelmaskintervals.bed"
INDELMASKVCF="${INTPREFIX}_indelmask_${INDELWINDOW}.vcf"
UPDATEVCF="${INTPREFIX}_sitesToUse.vcf"
MASKINGBED="${INTPREFIX}_sitesToMask.bed"
UPDATEFASTA="${INTPREFIX}_wonky_pseudoref.fasta"
RENAMEDFASTA="${INTPREFIX}_wonky_pseudoref_renamed.fasta"
PSEUDOREF="${INTPREFIX}_final_pseudoref.fasta"
rm -f ${UPDATEVCF} ${MASKINGBED} ${UPDATEFASTA} ${RENAMEDFASTA} ${PSEUDOREF}
if [[ ! -z "${INDELWINDOW}" ]]; then
rm -f ${INDELMASKINTERVALS}
if [[ "${INDELMASKTYPE}" == "indelmaskp" ]]; then
rm -f ${INDELMASKVCF}
fi
fi
elif [[ $CALLER =~ "MPILEUP" ]]; then
UPDATEVCF="${INTPREFIX}_filtered.vcf.gz"
SITESTOUSEVCF="${INTPREFIX}_sitesToUse.vcf.gz"
MASKINGBED="${INTPREFIX}_sitesToMask.bed"
INDELMASKINTERVALS="${INTPREFIX}_indelmaskintervals.bed"
PSEUDOREF="${INTPREFIX}_final_pseudoref.fasta"
rm -f ${UPDATEVCF} ${MASKINGBED} ${SITESTOUSEVCF} ${SITESTOUSEVCF}.tbi ${PSEUDOREF}
if [[ "${INDELMASKTYPE}" == "indelmask" ]]; then
rm -f ${INDELMASKINTERVALS}
fi
fi
echo "CLeanup complete for sample ${PREFIX}"
exit 0
fi
#Load the appropriate path variables for the filtering and masking tools:
SCRIPTDIR=`dirname $0`
source ${SCRIPTDIR}/pipeline_environment.sh
#Check that the necessary scripts/tools exist (excluding awk and java):
if [[ ! -x "$(command -v ${BEDTOOLS})" ]]; then
echo "BEDtools appears to be missing, could not find at BEDTOOLS=${BEDTOOLS}."
exit 21;
fi
if [[ $CALLER =~ "HC" ]]; then
if [[ ! -e "${GATK}" ]]; then
echo "GATK appears to be missing, could not find at GATK=${GATK}."
exit 20;
fi
#If indelmask has been specified, identify positives near indels
INDELMASK=""
if [[ ! -z "${INDELWINDOW}" ]]; then
#Identify indel positions, and output their windowed regions:
echo "Identifying indel positions and outputting windowed flanking regions for sample ${SAMPLE}"
INDELMASKINTERVALS="${INTPREFIX}_indelmaskintervals.bed"
java -jar ${GATK} -T SelectVariants -R ${REFERENCE} -V ${INPUTVCF} -selectType INDEL -sn ${SAMPLE} 2> ${LOGPREFIX}_GATKSelectVariants_indels.stderr | ${SCRIPTDIR}/GATK_indel_windows.awk - ${INDELWINDOW} | sort -k1,1 -k2,2n -k3,3n | bedtools merge -i - 2> ${LOGPREFIX}_bedtoolsMergeIndelMask.stderr > ${INDELMASKINTERVALS}
INDELINTCODE=$?
if [[ $INDELINTCODE -ne 0 ]]; then
echo "GATK SelectVariants for indels failed for sample ${SAMPLE} with exit code ${INDELINTCODE}"
exit 14
fi
if [[ "${INDELMASKTYPE}" == "indelmaskp" ]]; then
#Now extract positives within these intervals:
echo "Extracting variant calls within flanking regions of indels for sample ${SAMPLE}"
INDELMASKVCF="${INTPREFIX}_indelmask_${INDELWINDOW}.vcf"
java -jar ${GATK} -T SelectVariants -R ${REFERENCE} -V ${INPUTVCF} -o ${INDELMASKVCF} --intervals ${INDELMASKINTERVALS} -selectType SNP -sn ${SAMPLE} 2> ${LOGPREFIX}_GATKSelectVariants_indelmask_${INDELWINDOW}.stderr > ${LOGPREFIX}_GATKSelectVariants_indelmask_${INDELWINDOW}.stdout
INDELMASKCODE=$?
if [[ $INDELMASKCODE -ne 0 ]]; then
echo "GATK SelectVariants for SNPs near indels failed for sample ${SAMPLE} with exit code ${INDELMASKCODE}"
exit 15
fi
INDELMASK="--excludeIntervals ${INDELMASKVCF}"
fi
fi
#Extract SNPs to update by inverting the FILTERSTR:
#Be sure to exclude sites with uncalled genotypes
#They lack INFO/DP, and GATK JEXL doesn't handle FORMAT/DP (correctly?)
echo "Extracting SNPs and invariant sites passing the filters for sample ${SAMPLE}"
UPDATEVCF="${INTPREFIX}_sitesToUse.vcf"
java -jar ${GATK} -T SelectVariants -R ${REFERENCE} -V ${INPUTVCF} -o ${UPDATEVCF} ${INDELMASK} -selectType SNP -selectType NO_VARIATION -select "${FILTERSTR}" -invertSelect -sn ${SAMPLE} 2> ${LOGPREFIX}_GATKSelectVariants_updating.stderr > ${LOGPREFIX}_GATKSelectVariants_updating.stdout
SNPUPDCODE=$?
if [[ $SNPUPDCODE -ne 0 ]]; then
echo "GATK SelectVariants for updated sites on ${INPUTVCF} failed with exit code ${SNPUPDCODE}"
exit 5
fi
#Identify sites to mask by complementing the sites to update:
echo "Constructing BED of sites to mask for sample ${SAMPLE}"
MASKINGBED="${INTPREFIX}_sitesToMask.bed"
if [[ "${INDELMASKTYPE}" == "indelmask" ]]; then
awk 'BEGIN{FS="\t";OFS="\t";}!/^#/{split($9, formatarr, ":"); for (elem in formatarr) {if (formatarr[elem] == "GT") {gtindex=elem;break;};}; split($10, samplearr, ":"); if (samplearr[gtindex] != "./.") {print $1, $2-1, $2;};}' ${UPDATEVCF} | sort -k1,1 -k2,2n -k3,3n | ${BEDTOOLS} merge -i - | ${BEDTOOLS} complement -i - -g <(cut -f1,2 ${REFERENCE}.fai | sort -k1,1) 2> ${LOGPREFIX}_bedtoolscomplement.stderr | cat - ${INDELMASKINTERVALS} | sort -k1,1 -k2,2n -k3,3n | ${BEDTOOLS} merge -i - > ${MASKINGBED}
else
awk 'BEGIN{FS="\t";OFS="\t";}!/^#/{split($9, formatarr, ":"); for (elem in formatarr) {if (formatarr[elem] == "GT") {gtindex=elem;break;};}; split($10, samplearr, ":"); if (samplearr[gtindex] != "./.") {print $1, $2-1, $2;};}' ${UPDATEVCF} | sort -k1,1 -k2,2n -k3,3n | ${BEDTOOLS} merge -i - | ${BEDTOOLS} complement -i - -g <(cut -f1,2 ${REFERENCE}.fai | sort -k1,1) 2> ${LOGPREFIX}_bedtoolscomplement.stderr > ${MASKINGBED}
fi
MASKBEDCODE=$?
if [[ $MASKBEDCODE -ne 0 ]]; then
echo "bedtools merge or bedtools complement for sample ${SAMPLE} failed with exit code ${MASKBEDCODE}"
exit 6
fi
#Update SNPs using FastaAlternateReferenceMaker (masking doesn't work here):
echo "Updating FASTA with SNPs passing filters for sample ${SAMPLE}"
UPDATEFASTA="${INTPREFIX}_wonky_pseudoref.fasta"
java -jar ${GATK} -T FastaAlternateReferenceMaker -R ${REFERENCE} -V ${UPDATEVCF} -o ${UPDATEFASTA} -IUPAC ${SAMPLE} -lw 60 2> ${LOGPREFIX}_GATKFastaAlternateReferenceMaker.stderr > ${LOGPREFIX}_GATKFastaAlternateReferenceMaker.stdout
FAUPDCODE=$?
if [[ $FAUPDCODE -ne 0 ]]; then
echo "GATK FastaAlternateReferenceMaker for updating sites in ${UPDATEVCF} failed with exit code ${FAUPDCODE}"
exit 7
fi
#Fix the scaffold names, since GATK renames them to integers...:
echo "Fixing FASTA scaffold names for sample ${SAMPLE} because GATK renames them to integers..."
RENAMEDFASTA="${INTPREFIX}_wonky_pseudoref_renamed.fasta"
awk 'BEGIN{FS="\t";fastarecord=1;}FNR==NR{scafs[FNR]=$1;}FNR!=NR&&/^>/{print ">"scafs[fastarecord++];}FNR!=NR&&!/^>/{print;}' ${REFERENCE}.fai ${UPDATEFASTA} > ${RENAMEDFASTA}
FARENAMECODE=$?
if [[ $FARENAMECODE -ne 0 ]]; then
echo "awk script for renaming GATK FARM scaffolds for sample ${SAMPLE} failed with exit code ${FARENAMECODE}"
exit 8
fi
#Finally, mask sites with bedtools maskfasta:
echo "Masking sites failing the filters for sample ${SAMPLE}"
PSEUDOREF="${INTPREFIX}_final_pseudoref.fasta"
${BEDTOOLS} maskfasta -fi ${RENAMEDFASTA} -bed ${MASKINGBED} -fo ${PSEUDOREF} 2> ${LOGPREFIX}_bedtoolsmaskfasta.stderr > ${LOGPREFIX}_bedtoolsmaskfasta.stdout
MASKFASTACODE=$?
if [[ $MASKFASTACODE -ne 0 ]]; then
echo "bedtools maskfasta for sample ${SAMPLE} failed with exit code ${MASKFASTACODE}"
exit 9
fi
elif [[ $CALLER =~ "MPILEUP" ]]; then
if [[ ! -x "$(command -v ${BCFTOOLS})" ]]; then
echo "BCFtools appears to be missing, could not find at BCFTOOLS=${BCFTOOLS}."
exit 17;
fi
if [[ ! -x "$(command -v ${TABIX})" ]]; then
echo "Tabix appears to be missing, could not find at TABIX=${TABIX}."
exit 19;
fi
UPDATEVCF="${INTPREFIX}_filtered.vcf.gz"
SITESTOUSEVCF="${INTPREFIX}_sitesToUse.vcf.gz"
MASKINGBED="${INTPREFIX}_sitesToMask.bed"
INDELMASKINTERVALS="${INTPREFIX}_indelmaskintervals.bed"
PSEUDOREF="${INTPREFIX}_final_pseudoref.fasta"
#Mask positives within INDELWINDOW around each indel if INDELMASKTYPE="indelmaskp":
INDELMASK=""
if [[ "${INDELMASKTYPE}" == "indelmaskp" ]]; then
INDELMASK="--SnpGap ${INDELWINDOW}"
fi
#Filter sites using the filtering expression:
echo "Applying filters to the input VCF for sample ${SAMPLE}"
${BCFTOOLS} view -Ou -s ${SAMPLE} ${INPUTVCF} 2> ${LOGPREFIX}_bcftoolsviewForFilter.stderr | ${BCFTOOLS} filter -mx -S. -Oz -sFAIL -e "${FILTERSTR}" ${INDELMASK} -o ${UPDATEVCF} - 2> ${LOGPREFIX}_bcftoolsfilter.stderr > ${LOGPREFIX}_bcftoolsfilter.stdout
FILTERCODE=$?
if [[ $FILTERCODE -ne 0 ]]; then
echo "bcftools filter for sample ${SAMPLE} failed with exit code ${FILTERCODE}"
exit 10
fi
#Generate BED of masked sites:
echo "Constructing BED of filtered variant and invariant sites for sample ${SAMPLE}"
if [[ "${INDELMASKTYPE}" == "indelmask" ]]; then
echo "Merging in indel masking windows as well"
${BCFTOOLS} view -v indels -H -s ${SAMPLE} -Ov ${INPUTVCF} 2> ${LOGPREFIX}_bcftoolsViewIndels.stderr | ${SCRIPTDIR}/GATK_indel_windows.awk - ${INDELWINDOW} | sort -k1,1 -k2,2n -k3,3n | ${BEDTOOLS} merge -i - 2> ${LOGPREFIX}_bedtoolsMergeIndelWindows.stderr > ${INDELMASKINTERVALS}
INDELINTCODE=$?
if [[ $INDELINTCODE -ne 0 ]]; then
echo "bcftools view for indels, awk for making windows, sort, or bedtools merge for sample ${SAMPLE} failed with exit code ${INDELINTCODE}"
exit 16
fi
${BCFTOOLS} query -i 'FILTER=="PASS" && (TYPE=="SNP" || TYPE=="REF")' -f '%CHROM\t%POS0\t%POS\n' -s ${SAMPLE} ${UPDATEVCF} 2> ${LOGPREFIX}_bcftoolsqueryused.stderr | ${BEDTOOLS} complement -i - -g <(cut -f1,2 ${REFERENCE}.fai) 2> ${LOGPREFIX}_bedtoolscomplement.stderr | cat - ${INDELMASKINTERVALS} | sort -k1,1 -k2,2n -k3,3n | ${BEDTOOLS} merge -i - > ${MASKINGBED}
else
${BCFTOOLS} query -i 'FILTER=="PASS" && (TYPE=="SNP" || TYPE=="REF")' -f '%CHROM\t%POS0\t%POS\n' -s ${SAMPLE} ${UPDATEVCF} 2> ${LOGPREFIX}_bcftoolsqueryused.stderr | ${BEDTOOLS} complement -i - -g <(cut -f1,2 ${REFERENCE}.fai) 2> ${LOGPREFIX}_bedtoolscomplement.stderr > ${MASKINGBED}
fi
MASKBEDCODE=$?
if [[ $MASKBEDCODE -ne 0 ]]; then
echo "bcftools query or bedtools complement for sample ${SAMPLE} failed with exit code ${MASKBEDCODE}"
exit 11
fi
#Extract sites to use:
#We skip REF sites, since they don't need updating, and that makes the VCF smaller
echo "Extracting variant sites that pass the filters for updating for sample ${SAMPLE}"
${BCFTOOLS} view -i 'FILTER=="PASS" && TYPE=="SNP"' -Oz -o ${SITESTOUSEVCF} -s ${SAMPLE} ${UPDATEVCF} 2> ${LOGPREFIX}_bcftoolsviewsitestouse.stderr > ${LOGPREFIX}_bcftoolsviewsitestouse.stdout
SITESTOUSECODE=$?
if [[ $SITESTOUSECODE -ne 0 ]]; then
echo "bcftools view to extract used sites for sample ${SAMPLE} failed with error code ${SITESTOUSECODE}"
exit 12
fi
echo "Indexing sitesToUse.vcf.gz for sample ${SAMPLE}"
${TABIX} ${SITESTOUSEVCF}
#Generate pseudoref:
echo "Generating pseudoreference FASTA for sample ${SAMPLE}"
${BCFTOOLS} consensus --iupac-codes -f ${REFERENCE} -m ${MASKINGBED} -s ${SAMPLE} -o ${PSEUDOREF} ${SITESTOUSEVCF} 2> ${LOGPREFIX}_bcftoolsconsensus.stderr > ${LOGPREFIX}_bcftoolsconsensus.stdout
PSEUDOREFCODE=$?
if [[ $PSEUDOREFCODE -ne 0 ]]; then
echo "bcftools consensus for sample ${SAMPLE} failed with exit code ${PSEUDOREFCODE}"
exit 13
fi
else
echo "Unknown variant caller ${CALLER}, making pseudoref not yet supported"
exit 4
fi
echo "Done making pseudoreference FASTA for sample ${SAMPLE} and variant caller ${CALLER}!"