-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwgs_gatk_single.sh
204 lines (171 loc) · 10.2 KB
/
wgs_gatk_single.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/bash
## 这个流程假设你只有一个样本,这个样本只有一对用Illumina测序仪测序的PE fastq数据文件。
# 一些软件和工具的路径, 根据实际
trimmomatic=/your_path_to/Trimmomatic/0.36/trimmomatic-0.36.jar
bwa=/your_path_to/bwa-0.7.15/bwa
samtools=/your_path_to/samtools-1.3/samtools
gatk=/your_path_to/gatk/4.0.3.0/gatk
#reference
reference=/your_path_to/reference/hg38/Homo_sapiens_assembly38.fasta
GATK_bundle=/your_path_to/GATK_bundle/hg38
## 这一步不是必须的,取决于GATK_bundle中的这4份文件是否已经有建索引,如没有再执行
#$gatk IndexFeatureFile --feature-file $GATK_bundle/hapmap_3.3.hg38.vcf
#$gatk IndexFeatureFile --feature-file $GATK_bundle/1000G_omni2.5.hg38.vcf
#$gatk IndexFeatureFile --feature-file $GATK_bundle/1000G_phase1.snps.high_confidence.hg38.vcf
#$gatk IndexFeatureFile --feature-file $GATK_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf
#$gatk IndexFeatureFile --feature-file $GATK_bundle/dbsnp_146.hg38.vcf
## shell执行参数
fq1=$1
fq2=$2
RGID=$3 ## Read Group,一般用Lane ID代替
library=$4 ## 测序文库编号
sample=$5 ## 样本ID
outdir=$6 ## 输出目录的路径
## 按样本设置目录
outdir=${outdir}/${sample}
## 通过fastq1获得fastq的前缀名字,这里假设了原始的fastq1和fastq2有相同的前缀名字
## 并且假设fastq1的文件名格式为*.1.fq.gz;
fq_file_name=`basename $fq1`
fq_file_name=${fq_file_name%%.1.fq.gz}
# output diretory
if [ ! -d $outdir/cleanfq ]
then mkdir -p $outdir/cleanfq
fi
if [ ! -d $outdir/bwa ]
then mkdir -p $outdir/bwa
fi
if [ ! -d $outdir/gatk ]
then mkdir -p $outdir/gatk
fi
## 使用Trimmomatic对原始数据进行质控,ILLUMINACLIP中的一个关键参数 keepBothReads设为True。
time java -jar ${trimmomatic} PE \
$fq1 $fq2 \
$outdir/cleanfq/${fq_file_name}.paired.1.fq.gz ${fq_file_name}.unpaired.1.fq.gz \
$outdir/cleanfq/${fq_file_name}.paired.2.fq.gz ${fq_file_name}.unpaired.2.fq.gz \
ILLUMINACLIP:/your_path_to/Trimmomatic/adapters/TruSeq3-PE-2.fa:2:30:10:8:True \
SLIDINGWINDOW:5:15 LEADING:5 TRAILING:5 MINLEN:50 && echo "** fq QC done **"
## 使用bwa mem完成数据比对,bwa mem对任何长度大于40bp小于2000bp的read都是非常有效的; PL:ILLUMINA是我默认的
time $bwa mem -t 8 -M -Y -R "@RG\tID:$RGID\tPL:ILLUMINA\tPU:$PU\tLB:$library\tSM:$sample" $reference/Homo_sapiens_assembly38.fasta \
$outdir/cleanfq/${fq_file_name}.paired.1.fq.gz $outdir/cleanfq/${fq_file_name}.paired.2.fq.gz | $samtools view -Sb - > $outdir/bwa/${sample}.bam && \
echo "** BWA MEM done **" && \
time $samtools sort -@ 4 -m 4G -O bam -o $outdir/bwa/${sample}.sorted.bam $outdir/bwa/${sample}.bam && echo "** sorted raw bamfile done "
## 这一步不是必须的
# time $samtools index $outdir/bwa/${sample}.sorted.bam && echo "** ${sample}.sorted.bam index done **"
## 标记重复序列
$gatk MarkDuplicates \
-I $outdir/bwa/${sample}.sorted.bam \
-M $outdir/bwa/${sample}.markdup_metrics.txt \
-O $outdir/bwa/${sample}.sorted.markdup.bam && echo "** ${sample}.sorted.bam MarkDuplicates done **"
## 为${sample}.sorted.markdup.bam构建Index,这是继续后续步骤所必须的
time $samtools index $outdir/bwa/${sample}.sorted.markdup.bam && echo "** ${sample}.sorted.markdup.bam index done **"
## 执行BQSR
## [注]Does your vcf file have an index? GATK4 does not support on the fly indexing of VCFs anymore.
time $gatk BaseRecalibrator \
-R $reference/Homo_sapiens_assembly38.fasta \
-I $outdir/bwa/${sample}.sorted.markdup.bam \
--known-sites $GATK_bundle/1000G_phase1.indels.hg38.vcf \
--known-sites $GATK_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf \
--known-sites $GATK_bundle/dbsnp_146.hg38.vcf \
-O $outdir/bwa/${sample}.sorted.markdup.recal_data.table && echo "** ${sample}.sorted.markdup.recal_data.table done **"
time $gatk ApplyBQSR \
--bqsr-recal-file $outdir/bwa/${sample}.sorted.markdup.recal_data.table \
-R $reference/Homo_sapiens_assembly38.fasta \
-I $outdir/bwa/${sample}.sorted.markdup.bam \
-O $outdir/bwa/${sample}.sorted.markdup.BQSR.bam && echo "** ApplyBQSR done **"
## 为${sample}.sorted.markdup.BQSR.bam构建Index,这是继续后续步骤所必须的
time $samtools index $outdir/bwa/${sample}.sorted.markdup.BQSR.bam && echo "** ${sample}.sorted.markdup.BQSR.bam index done **"
## 对于单个样本来说,有四个完成变异检测的方式,结果是一样的,可以按照需要挑选一种(以下默认第一种)。
## 第一,直接调用HaplotypeCaller输出样本VCF,面对较大的输入文件时,速度较慢
time $gatk HaplotypeCaller \
-R $reference/Homo_sapiens_assembly38.fasta \
-I $outdir/bwa/${sample}.sorted.markdup.BQSR.bam \
-O $outdir/gatk/${sample}.HC.vcf.gz && echo echo "** ${sample}.HC.vcf.gz done ** "
# ## 第二,输出这个样本每个染色体的vcf,然后在合并所有的染色体结果,目的是提高速度,这不是必须的,仅是通过分染色体获得速度的提升
# chrom=( chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY chrM )
# for i in ${chrom[@]}; do
# time $gatk HaplotypeCaller \
# -R $reference/Homo_sapiens_assembly38.fasta \
# -I $outdir/bwa/${sample}.sorted.markdup.BQSR.bam \
# -L $i \
# -O $outdir/gatk/${sample}.HC.${i}.vcf.gz && echo "** ${sample}.HC.${i}.vcf.gz done ** " &
# done && wait
# merge_vcfs=""
# for i in ${chrom[@]}; do
# merge_vcfs=${merge_vcfs}" -I $outdir/gatk/${sample}.HC.${i}.vcf.gz \\"\n
# done && time $gatk MergeVcfs ${merge_vcfs} -O $outdir/gatk/${sample}.HC.vcf.gz && echo "** MergeVcfs done **"
# ## 第三,先输出样本的全gVCF,再进行GenotypeGVCFs,这个方式在单样本情况下不是必须的,但是多样本的标配,面对较大的输入文件时,速度较慢
# time $gatk HaplotypeCaller \
# --emit-ref-confidence GVCF \
# -R $reference/Homo_sapiens_assembly38.fasta \
# -I $outdir/bwa/${sample}.sorted.markdup.BQSR.bam \
# -O $outdir/gatk/${sample}.HC.g.vcf.gz && echo "** GVCF ${sample}.HC.g.vcf.gz done **"
# time $gatk GenotypeGVCFs \
# -R $reference/Homo_sapiens_assembly38.fasta \
# -V $outdir/gatk/${sample}.HC.g.vcf.gz \
# -O $outdir/gatk/${sample}.HC.vcf.gz && echo "** ${sample}.HC.vcf.gz done ** "
# ## 第四,输出每个染色体的gvcf,然后对每个染色体单独进行GenotypeGVCFs,目的是提高速度,这不是必须的,仅是通过分染色体获得速度的提升
# chrom=( chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY chrM )
# for i in ${chrom[@]}; do
# time $gatk HaplotypeCaller \
# --emit-ref-confidence GVCF \
# -R $reference/Homo_sapiens_assembly38.fasta \
# -I $outdir/bwa/${sample}.sorted.markdup.BQSR.bam \
# -L $i \
# -O $outdir/gatk/${sample}.HC.${i}.g.vcf.gz && \
# time $gatk GenotypeGVCFs \
# -R $reference/Homo_sapiens_assembly38.fasta \
# -V $outdir/gatk/${sample}.HC.${i}.g.vcf.gz \
# -O $outdir/gatk/${sample}.HC.${i}.vcf.gz && echo "** ${sample}.HC.${i}.vcf.gz done ** " &
# done && wait
# merge_vcfs=""
# for i in ${chrom[@]}; do
# merge_vcfs=${merge_vcfs}" -I $outdir/gatk/${sample}.HC.${i}.vcf.gz \\"\n
# done && time $gatk MergeVcfs ${merge_vcfs} -O $outdir/gatk/${sample}.HC.vcf.gz && echo "** MergeVcfs done **"
## VQSR, 由于评价SNP和Indel质量高低的标准是不同的,因此,需要分SNP和Indel这两种不同的模式,分别进行质控
## 首先是SNP mode
time $gatk VariantRecalibrator \
-R $reference/Homo_sapiens_assembly38.fasta \
-V $outdir/gatk/${sample}.HC.vcf.gz \
-resource:hapmap,known=false,training=true,truth=true,prior=15.0 $GATK_bundle/hapmap_3.3.hg38.vcf \
-resource:omini,known=false,training=true,truth=false,prior=12.0 $GATK_bundle/1000G_omni2.5.hg38.vcf \
-resource:1000G,known=false,training=true,truth=false,prior=10.0 $GATK_bundle/1000G_phase1.snps.high_confidence.hg38.vcf \
-resource:dbsnp,known=true,training=false,truth=false,prior=6.0 $GATK_bundle/dbsnp_146.hg38.vcf \
-an DP -an QD -an FS -an SOR -an ReadPosRankSum -an MQRankSum \
-mode SNP \
-tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 95.0 -tranche 90.0 \
-rscriptFile $outdir/gatk/${sample}.HC.snps.plots.R \
--tranches-file $outdir/gatk/${sample}.HC.snps.tranches \
-O $outdir/gatk/${sample}.HC.snps.recal && \
time $gatk ApplyVQSR \
-R $reference/Homo_sapiens_assembly38.fasta \
-V $outdir/gatk/${sample}.HC.vcf.gz \
--ts_filter_level 99.0 \
--tranches-file $outdir/gatk/${sample}.HC.snps.tranches \
-recalFile $outdir/gatk/${sample}.HC.snps.recal \
-mode SNP \
-O $outdir/gatk/${sample}.HC.snps.VQSR.vcf.gz && echo "** SNPs VQSR done **"
## 然后是Indel mode
time $gatk VariantRecalibrator \
-R $reference/Homo_sapiens_assembly38.fasta \
-input $outdir/gatk/${sample}.HC.snps.VQSR.vcf.gz \
-resource:mills,known=true,training=true,truth=true,prior=12.0 $GATK_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf \
-an DP -an QD -an FS -an SOR -an ReadPosRankSum -an MQRankSum \
-mode INDEL \
--max-gaussians 6 \
-rscriptFile $outdir/gatk/${sample}.HC.snps.indels.plots.R \
--tranches-file $outdir/gatk/${sample}.HC.snps.indels.tranches \
-O $outdir/gatk/${sample}.HC.snps.indels.recal && \
time $gatk ApplyVQSR \
-R $reference/Homo_sapiens_assembly38.fasta \
-input $outdir/gatk/${sample}.HC.snps.VQSR.vcf.gz \
--ts_filter_level 99.0 \
--tranches-file $outdir/gatk/${sample}.HC.snps.indels.tranches \
-recalFile $outdir/gatk/${sample}.HC.snps.indels.recal \
-mode INDEL \
-O $outdir/gatk/${sample}.HC.VQSR.vcf.gz && echo "** SNPs and Indels VQSR (${sample}.HC.VQSR.vcf.gz finish) done **"
# 可以被删除清理的文件,这不是必须执行的
# 1) 对于比对文件只有最终的${sample}.sorted.markdup.BQSR.bam值得留下来
# 2) 对于VCF,可以保留.g.vcf.gz,原始HC.vcf.gz和完成质控的HC.VQSR.vcf.gz
# rm -f $outdir/bwa/${sample}.bam $outdir/bwa/${sample}.sorted.bam $outdir/bwa/${sample}.sorted.markdup.bam*
# rm -f $outdir/gatk/${sample}.HC.snps.VQSR.vcf.gz rm -f $outdir/gatk/${sample}.HC.*.recal