Skip to content

Commit

Permalink
A new -b option to output abbreviated protein predictions.
Browse files Browse the repository at this point in the history
For example, instead of

    BCSQ=splice_region|CD207|ENST00000410009|protein_coding,frameshift|\
    CD207|ENST00000410009|protein_coding|-|25EPPPKSGPSLVPGKTPTVRAALICLT\
    LVLVASVLLQAVLYPRFMGTISDVKTNVQLLKGRVDNISTLDSEIKKNSDGMEAAGVQIQMVNESLG\
    YVRSQFLKLKTSVEKANAQIQILTRSWEEVSTLNAQIPELKSDLEKASALNTKIRALQGSLENMSKL\
    LKRQNDILQVVSQGWKYFKGNFYYFSLIPKTWYSAEQFCVSRNSHLTSVTSESEQEFLYKTAGGLIY\
    WIGLTKAGMEGDWSWVDDTPFNKVQSVRFWIPGEPNNAGNNEHCGNIKAPSLQAWNDAPCDKTFLFI\
    CKRPYVPSEP*>25GASSQVRSISGPGENTHSPCCINLPDAGPGRLRPAAGRPLSPVYGHHIRCKDQ\
    CPVAERSCGQHQHPGF*|5507G>GC,

print

    BCSQ=splice_region|CD207|ENST00000410009|protein_coding,frameshift|\
    CD207|ENST00000410009|protein_coding|-|25E..329>25G..94|5507G>GC

Resolves samtools#957
  • Loading branch information
pd3 committed Apr 15, 2019
1 parent be4494b commit 64a5e1d
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 10 deletions.
28 changes: 23 additions & 5 deletions csq.c
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,7 @@ typedef struct _args_t
int phase, quiet, local_csq;
int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ
int ncsq_small_warned;
int brief_predictions;

int rid; // current chromosome
tr_heap_t *active_tr; // heap of active transcripts for quick flushing
Expand Down Expand Up @@ -2576,6 +2577,20 @@ void kput_vcsq(args_t *args, vcsq_t *csq, kstring_t *str)
kputs(csq->vstr.s, str);
}

void kprint_aa_prediction(args_t *args, int beg, kstring_t *aa, kstring_t *str)
{
if ( !args->brief_predictions )
kputs(aa->s, str);
else
{
int len = aa->l;
if ( aa->s[len-1]=='*' ) len--;
kputc(aa->s[0], str);
kputs("..", str);
kputw(beg+len, str);
}
}

void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
{
int i;
Expand Down Expand Up @@ -2684,12 +2699,12 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
kputs(hap->tref.s, &str);
kprint_aa_prediction(args,aa_rbeg,&hap->tref,&str);
if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) )
{
kputc_('>', &str);
kputw(aa_sbeg, &str);
kputs(hap->tseq.s, &str);
kprint_aa_prediction(args,aa_sbeg,&hap->tseq,&str);
}
kputc_('|', &str);

Expand Down Expand Up @@ -3324,12 +3339,12 @@ int test_cds_local(args_t *args, bcf1_t *rec)
int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
kputc_('|', &str);
kputw(aa_rbeg, &str);
kputs(tref->s, &str);
kprint_aa_prediction(args,aa_rbeg,tref,&str);
if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) )
{
kputc_('>', &str);
kputw(aa_sbeg, &str);
kputs(tseq->s, &str);
kprint_aa_prediction(args,aa_sbeg,tseq,&str);
}
kputc_('|', &str);
kputw(rec->pos+1, &str);
Expand Down Expand Up @@ -3901,6 +3916,7 @@ static const char *usage(void)
" -g, --gff-annot <file> gff3 annotation file\n"
"\n"
"CSQ options:\n"
" -b, --brief-predictions annotate with abbreviated protein-changing predictions\n"
" -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
" -l, --local-csq localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
Expand Down Expand Up @@ -3947,6 +3963,7 @@ int main_csq(int argc, char *argv[])
{"force",0,0,1},
{"help",0,0,'h'},
{"ncsq",1,0,'n'},
{"brief-predictions",0,0,'b'},
{"custom-tag",1,0,'c'},
{"local-csq",0,0,'l'},
{"gff-annot",1,0,'g'},
Expand All @@ -3967,11 +3984,12 @@ int main_csq(int argc, char *argv[])
};
int c, targets_is_file = 0, regions_is_file = 0;
char *targets_list = NULL, *regions_list = NULL;
while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0)
while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:b",loptions,NULL)) >= 0)
{
switch (c)
{
case 1 : args->force = 1; break;
case 'b': args->brief_predictions = 1; break;
case 'l': args->local_csq = 1; break;
case 'c': args->bcsq_tag = optarg; break;
case 'q': args->quiet++; break;
Expand Down
13 changes: 10 additions & 3 deletions doc/bcftools.1
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 2019-04-05 09:10 BST
.\" Date: 2019-04-15 17:40 BST
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "BCFTOOLS" "1" "2019\-04\-05 09:10 BST" "\ \&" "\ \&"
.TH "BCFTOOLS" "1" "2019\-04\-15 17:40 BST" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
Expand Down Expand Up @@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
.SS "VERSION"
.sp
This manual page was last updated \fB2019\-04\-05 09:10 BST\fR and refers to bcftools git version \fB1\&.9\-129\-g6027d36+\fR\&.
This manual page was last updated \fB2019\-04\-15 17:40 BST\fR and refers to bcftools git version \fB1\&.9\-141\-gbe4494b+\fR\&.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
Expand Down Expand Up @@ -1828,6 +1828,13 @@ Symbolic alleles are not supported\&. They will remain unannotated in the output
use this custom tag to store consequences rather than the default BCSQ tag
.RE
.PP
\fB\-b, \-\-brief\-predictions\fR
.RS 4
annotate with abbreviated protein\-changing predictions\&. That is, instead of writing the whole modified protein sequence with potentially hundreds of aminoacids, only an abbreviated version such as
\fI25E\&.\&.329>25G\&.\&.94\fR
will be written
.RE
.PP
\fB\-e, \-\-exclude\fR \fIEXPRESSION\fR
.RS 4
exclude sites for which
Expand Down
10 changes: 8 additions & 2 deletions doc/bcftools.html
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25035840"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25163040"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
Call Format (VCF) and its binary counterpart BCF. All commands work
transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
automatically even when streaming from a pipe. Indexed VCF and BCF
will work in all situations. Un-indexed VCF and BCF and streams will
work in most, but not all situations. In general, whenever multiple VCFs are
read simultaneously, they must be indexed and therefore also compressed.</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2019-04-05 09:10 BST</strong></span> and refers to bcftools git version <span class="strong"><strong>1.9-129-g6027d36+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2019-04-15 17:40 BST</strong></span> and refers to bcftools git version <span class="strong"><strong>1.9-141-gbe4494b+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions &lt;= 0.1.19 to convert to VCF, which can then be read by
Expand Down Expand Up @@ -1043,6 +1043,12 @@
</span></dt><dd>
use this custom tag to store consequences rather than the default BCSQ tag
</dd><dt><span class="term">
<span class="strong"><strong>-b, --brief-predictions</strong></span>
</span></dt><dd>
annotate with abbreviated protein-changing predictions. That is, instead of writing
the whole modified protein sequence with potentially hundreds of aminoacids, only an
abbreviated version such as <span class="emphasis"><em>25E..329&gt;25G..94</em></span> will be written
</dd><dt><span class="term">
<span class="strong"><strong>-e, --exclude</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
</span></dt><dd>
exclude sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
Expand Down
5 changes: 5 additions & 0 deletions doc/bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,11 @@ output VCF and are ignored for the prediction analysis.
*-c, --custom-tag* 'STRING'::
use this custom tag to store consequences rather than the default BCSQ tag

*-b, --brief-predictions*::
annotate with abbreviated protein-changing predictions. That is, instead of writing
the whole modified protein sequence with potentially hundreds of aminoacids, only an
abbreviated version such as '25E..329>25G..94' will be written

*-e, --exclude* 'EXPRESSION'::
exclude sites for which 'EXPRESSION' is true. For valid expressions see
*<<expressions,EXPRESSIONS>>*.
Expand Down
1 change: 1 addition & 0 deletions test/csq/ENST00000410009/frameshift.1.cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{bin}/bcftools csq -b -f ENST00000410009.fa -g ENST00000410009.gff frameshift.vcf | ../sort-csq | {bin}/bcftools query -f'%POS\t%REF\t%ALT\t%BCSQ\n'
1 change: 1 addition & 0 deletions test/csq/ENST00000410009/frameshift.1.cmd.out
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
5507 G GC frameshift|CD207|ENST00000410009|protein_coding|-|25E..329>25G..94|5507G>GC,splice_region|CD207|ENST00000410009|protein_coding
1 change: 1 addition & 0 deletions test/csq/ENST00000410009/frameshift.2.cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{bin}/bcftools csq -lb -f ENST00000410009.fa -g ENST00000410009.gff frameshift.vcf | ../sort-csq | {bin}/bcftools query -f'%POS\t%REF\t%ALT\t%BCSQ\n'
1 change: 1 addition & 0 deletions test/csq/ENST00000410009/frameshift.2.cmd.out
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
5507 G GC frameshift|CD207|ENST00000410009|protein_coding|-|25E..329>25G..94|5507G>GC,splice_region|CD207|ENST00000410009|protein_coding

0 comments on commit 64a5e1d

Please sign in to comment.