-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcre.gemini2txt.vcf2db.sh
executable file
·110 lines (96 loc) · 3.72 KB
/
cre.gemini2txt.vcf2db.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/bash
# exports gemini.db database to gemini.db.txt file
# database schema: https://gemini.readthedocs.io/en/latest/content/database_schema.html#the-variants-table
# when using v.chr = g.chr AND v.gene = g.gene it becomes very slow
# by default bcbio writes PASS only variants to the database
# example call: cre.gemini2txt.sh S28-ensemble.db 5 ALL 0.01
# when using vcfanno/vcfdb loader some fields are different
# for some reason \n in the query string does not work here
#PBS -l walltime=1:00:00,nodes=1:ppn=1
#PBS -joe .
#PBS -d .
#PBS -l vmem=10g,mem=10g
if [ -z $file ]
then
file=$1
fi
#10 reads for WES 5 reads for RNA-seq
depth_threshold=$2
severity_threshold=$3
#echo $severity_threshold
max_af=$4
gemini query -q "select name from samples" $file > samples.txt
sQuery="select \
chrom as Chrom,\
start+1 as Pos,\
variant_id as Variant_id,\
ref as Ref,\
alt as Alt,\
impact as Variation,\
dp as Depth,\
qual as Quality,\
gene as Gene,\
clinvar_pathogenic as Clinvar,\
ensembl_gene_id as Ensembl_gene_id,\
transcript as Ensembl_transcript_id,\
aa_length as AA_position,\
exon as Exon,\
domains as Protein_domains,\
rs_ids as rsIDs,\
vcfanno_gnomad_af as Gnomad_af,\
vcfanno_gnomad_af_popmax as Gnomad_af_popmax,\
vcfanno_gnomad_ac as Gnomad_ac,\
vcfanno_gnomad_hom as Gnomad_hom,\
sift_score as Sift_score,\
polyphen_score as Polyphen_score,\
cadd_phred as Cadd_score,\
vest3_score as Vest3_score,\
revel_score as Revel_score,\
gerp_score as Gerp_score,\
aa_change as AA_change,\
hgvsc as Codon_change,\
phylop20way_mammalian as Conserved_in_20_mammals,
gts,"
while read sample
do
sQuery=$sQuery"gts."$sample","
sQuery=$sQuery"gt_alt_depths."$sample","
sQuery=$sQuery"gt_depths."$sample","
done < samples.txt
# gene_detailed may contain 2 records per single transcript - because of synonymous gene names, and some genes may have None in the name,for example TSRM
# https://groups.google.com/forum/#!topic/gemini-variation/U3uEvWCzuQo
# v.depth = 'None' see https://github.com/chapmanb/bcbio-nextgen/issues/1894
if [[ "$severity_threshold" == "ALL" ]]
then
#used for RNA-seq = 20k variants in the report
severity_filter=""
elif [[ "$severity_threshold" == "wes.synonymous" ]]
then
severity_filter=" and (is_coding=1 or is_splicing=1)"
#for WES = 1k variants in the report
else
severity_filter=" and impact_severity<>'LOW' "
fi
#max_aaf_all frequency is from gemini.conf and does not include gnomad WGS frequencing, gnomad WES only
#gnomad_af includes gnomad WGS
sQuery=$sQuery"hgvsc as Nucleotide_change_ensembl,\
hgvsp as Protein_change_ensembl,\
old_multiallelic as Old_multiallelic
from variants \
where \
(dp >= "$depth_threshold" or dp = '' or dp is null) "$severity_filter" and vcfanno_gnomad_af_popmax <= "$max_af
s_gt_filter=''
if [ -n "$denovo" ] && [ "$denovo" == 1 ]
then
# https://www.biostars.org/p/359117/
proband=`gemini query -q "select name from samples where phenotype=2" $file`
mom=`gemini query -q "select name from samples where phenotype=1 and sex=2" $file`
dad=`gemini query -q "select name from samples where phenotype=1 and sex=1" $file`
s_gt_filter="gt_types."$proband" == HET and gt_types."$dad" == HOM_REF and gt_types."$mom" == HOM_REF"
#(gt_types."$proband" == HOM_ALT and gt_types."$dad" == HOM_REF and gt_types."$mom" == HET)"
# otherwise a lot of trash variants
sQuery=$sQuery" and qual>=500"
gemini query -q "$sQuery" --gt-filter "$s_gt_filter" --header $file
else
gemini query --header -q "$sQuery" $file
fi