-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwgs_gvcf_to_vcf.sh
131 lines (104 loc) · 6.33 KB
/
wgs_gvcf_to_vcf.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/bash
## 这是多样本变异检测流程的后半部分,这个部分假设你已经用fastq_to_gvcf.sh为每个样本独立生成了其对应的比对结果和gvcf
# GATK的路径, 根据实际路径
gatk=/your_path_to/gatk/4.0.3.0/gatk
#reference
reference=/your_path_to/reference/hg38/Homo_sapiens_assembly38.fasta
GATK_bundle=/your_path_to/GATK_bundle/hg38
## shell执行参数
samples=$1 ## 所有的样本ID,用","分开
indir=$2 ## 输入目录的路径,这个输入路径要与fastq_to_gvcf.sh的输出路径完全相同
outname=$3 ## 设置输出文件名的前缀
outdir=$indir ## 输入和输出路径相同
# 设置群体变异检测结果的输出目录
if [ ! -d $outdir/population ]
then mkdir -p $outdir/population
fi
## 按照",",把所有样本ID拆分出来存至数组中
samples=$(echo $samples | tr "," "\n")
## 基于群体数据进行Joint genotyping, 同样有两种方式
## 第一,先合并所有的全gvcf结果,然后再统一进行GenotypeGVCFs,由于是全基因组,速度较慢
sample_gvcfs=""
for sample in $samples ; do
sample_gvcfs=${sample_gvcfs}"-V $outdir/${sample}/gatk/${sample}.HC.g.vcf.gz \\"\n
done
time $gatk CombineGVCFs \
-R $reference/Homo_sapiens_assembly38.fasta \
${sample_gvcfs} \
-O $outdir/poplation/${outname}.HC.g.vcf.gz && echo "** ${outname}.HC.g.vcf.gz done ** " && \
time $gatk GenotypeGVCFs \
-R $reference/Homo_sapiens_assembly38.fasta \
-V $outdir/poplation/${outname}.HC.g.vcf.gz \
-O $outdir/poplation/${outname}.HC.vcf.gz && echo "** ${outname}.HC.vcf.gz done ** "
# ## 第二,按照染色体分开,合并每个样本对应染色体的gVCF,然后再各自对染色体进行Joint calling(注意,这里和GATK3不同,GATK4只能接受一个gvcf的输入,因此需要先合并),
# ## 最后合并各个染色体的Genotype结果, 这要求每个样本必须按照染色体输出gvcf,速度快
# chrom=( chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY chrM )
# for i in ${chrom[@]}; do
# sample_gvcfs=""
# for sample in $samples ; do
# sample_gvcfs=${sample_gvcfs}"-V $outdir/${sample}/gatk/${sample}.HC.${i}.g.vcf.gz \\"\n
# done
# time $gatk CombineGVCFs \
# -R $reference/Homo_sapiens_assembly38.fasta \
# ${sample_gvcfs} \
# -O $outdir/poplation/${outname}.HC.${i}.g.vcf.gz && echo "** ${outname}.HC.${i}.g.vcf.gz done ** " && \
# time $gatk GenotypeGVCFs \
# -R $reference/Homo_sapiens_assembly38.fasta \
# -V $outdir/poplation/${outname}.HC.${i}.g.vcf.gz \
# -O $outdir/poplation/${outname}.HC.${i}.vcf.gz && echo "** ${outname}.HC.${i}.vcf.gz done ** " &
# done && wait
# merge_vcfs=""
# for i in ${chrom[@]}; do
# merge_vcfs=${merge_vcfs}"-I $outdir/poplation/${outname}.HC.${i}.vcf.gz \\"\n
# done && time $gatk MergeVcfs ${merge_vcfs} -O $outdir/poplation/${outname}.HC.vcf.gz && echo "** MergeVcfs done **"
## VQSR, 由于评价SNP和Indel质量高低的标准是不同的,因此,需要分SNP和Indel这两种不同的模式,分别进行质控
# ## 这一步不是必须的,仅仅是为了提高速度,如果你的样本数目不多比如(500以下),那么可以选择忽略.如果你的样本很多(>500),
# ## 考虑到处理速度,我们可以用“MakeSitesOnlyVcf”只把sites的信息提取出来,忽略VCF后面的个体信息,从而节省IO时间。
# ## 如果提取了Only sites,那么注意下面VQSR中的输入文件要替换为这里的${outname}.HC.sites.vcf.gz
# time $gatk VariantRecalibrator \
# -I $outdir/poplation/${outname}.HC.vcf.gz \
# -O $outdir/poplation/${outname}.HC.sites.vcf.gz && echo "** Only sites done **"
## 首先是SNP mode
time $gatk VariantRecalibrator \
-R $reference/Homo_sapiens_assembly38.fasta \
-V $outdir/poplation/${outname}.HC.vcf.gz \
-resource hapmap,known=false,training=true,truth=true,prior=15.0:$GATK_bundle/hapmap_3.3.hg38.vcf \
-resource omin,known=false,training=true,truth=false,prior=12.0:$GATK_bundle/1000G_omni2.5.hg38.vcf \
-resource 1000G,known=false,training=true,truth=false,prior=10.0:$GATK_bundle/1000G_phase1.snps.high_confidence.hg38.vcf \
-resource dbsnp,known=true,training=false,truth=false,prior=2.0:$GATK_bundle/dbsnp_146.hg38.vcf \
-an DP -an QD -an FS -an SOR -an ReadPosRankSum -an MQRankSum \
-mode SNP \
-tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 95.0 -tranche 90.0 \
--rscript-file $outdir/poplation/${outname}.HC.snps.plots.R \
--tranches-file $outdir/poplation/${outname}.HC.snps.tranches \
-O $outdir/poplation/${outname}.HC.snps.recal && \
time $gatk ApplyVQSR \
-R $reference/Homo_sapiens_assembly38.fasta \
-V $outdir/poplation/${outname}.HC.vcf.gz \
-ts_filter_level 99.0 \
--tranches-file $outdir/poplation/${outname}.HC.snps.tranches \
--recal-file $outdir/poplation/${outname}.HC.snps.recal \
-mode SNP \
-O $outdir/poplation/${outname}.HC.snps.VQSR.vcf.gz && echo "** SNPs VQSR done **"
## 然后是Indel mode
time $gatk VariantRecalibrator \
-R $reference/Homo_sapiens_assembly38.fasta \
-input $outdir/poplation/${outname}.HC.snps.VQSR.vcf.gz \
-resource mills,known=true,training=true,truth=true,prior=12.0:$GATK_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf \
-an DP -an QD -an FS -an SOR -an ReadPosRankSum -an MQRankSum \
-mode INDEL \
--max-gaussians 6 \
--rscript-file $outdir/poplation/${outname}.HC.snps.indels.plots.R \
--tranches-file $outdir/poplation/${outname}.HC.snps.indels.tranches \
-O $outdir/poplation/${outname}.HC.snps.indels.recal && \
time $gatk ApplyVQSR \
-R $reference/Homo_sapiens_assembly38.fasta \
-input $outdir/poplation/${outname}.HC.snps.VQSR.vcf.gz \
-ts_filter_level 95.0 \
--tranches-file $outdir/poplation/${outname}.HC.snps.indels.tranches \
--recal-file $outdir/poplation/${outname}.HC.snps.indels.recal \
-mode INDEL \
-O $outdir/poplation/${outname}.HC.VQSR.vcf.gz && echo "** SNPs and Indels VQSR (${outname}.HC.VQSR.vcf.gz finish) done **"
# 可以被删除清理的文件,这不是必须执行的,可以保留.g.vcf.gz,原始HC.vcf.gz和完成质控的HC.VQSR.vcf.gz
# rm -f $outdir/poplation/${outname}.HC.snps.VQSR.vcf.gz
# $outdir/poplation/${outname}.HC.*.recal