Skip to content

Commit

Permalink
[NEWS] Revamp of bcftools call -G
Browse files Browse the repository at this point in the history
Sample grouping by population was not truly independent and could still be influenced
by the presence of other sample groups. With this commit, -G should work as expected.
Other changes are:

- Optional addition of INFO/PV4 annotation with `call -a INFO/PV4`
- Remove generation of useless HOB and ICB annotation; use +fill-tags -- -t HWE,ExcHet` instead
- The `call -f` option was renamed to `-a` to (1) make it consistent with `mpileup` and (2) to indicate
  that it includes both INFO and FORMAT annotations, not just FORMAT as previously
- Any sensible Number=R,Type=Integer annotation can be used with -G, such as AD or QS
- Don't trim QUAL; although usefuleness of this change is questionable for true probabilistic
  interpretation (such high precision is unrealistic), using QUAL as a score rather than probability
  *is* helpful and permits more fine-grained filtering
- Fix a suspected bug in `call -F` in the worst case, for certain improve readability
- Note that `call -C trio` is temporarily disabled by this commit

Fixes samtools#1332
  • Loading branch information
pd3 committed Dec 7, 2020
1 parent 07c4308 commit 0f4aed2
Show file tree
Hide file tree
Showing 43 changed files with 852 additions and 711 deletions.
34 changes: 14 additions & 20 deletions call.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ THE SOFTWARE. */
#define CALL_CONSTR_TRIO (1<<2)
#define CALL_CONSTR_ALLELES (1<<3)
//
//
#define CALL_FMT_PV4 (1<<5)
#define CALL_FMT_GQ (1<<6)
#define CALL_FMT_GP (1<<7)

Expand All @@ -52,18 +52,13 @@ family_t;
// For the single-sample and grouped -G calling
typedef struct
{
double ref_lk, max_lk, lk_sum;
float *qsum; // QS(quality sum) values
int nqsum, dp;
double fa,fb,fc,fa2,fb2,fc2,fab,fac,fbc;
}
grp1_t;
typedef struct
{
grp1_t *grp;
int ngrp;
int *smpl2grp;
int nqsum;
uint32_t *smpl, nsmpl;
uint32_t nals, als;
}
grp_t;
smpl_grp_t;

// For the `-C alleles -i` constrained calling
typedef struct
Expand All @@ -82,6 +77,7 @@ typedef struct
int *pl_map, npl_map; // same as above for PLs, but reverse (new -> old)
char **als; // array to hold the trimmed set of alleles to appear on output
int nals; // size of the als array
int als_new, nals_new; // bitmask with final alleles and their number
family_t *fams; // list of families and samples for trio calling
int nfams, mfams;
int ntrio[5][5]; // possible trio genotype combinations and their counts; first idx:
Expand All @@ -96,18 +92,16 @@ typedef struct
int32_t *ugts, *cgts; // unconstraind and constrained GTs
uint32_t output_tags;
char *prior_AN, *prior_AC; // reference panel AF tags (AF=AC/AN)
tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES
char *sample_groups; // for single-sample or grouped calling with -G
grp_t smpl_grp;
float *qsum;
int nqsum;
tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES
char *sample_groups; // for single-sample or grouped calling with -G
char *sample_groups_tag; // for -G [AD|QS:]
smpl_grp_t *smpl_grp;
int nsmpl_grp;

// ccall only
double indel_frac, min_perm_p, min_lrt;
double prior_type, pref;
double ref_lk, lk_sum;
int ngrp1_samples, n_perm;
int nhets, ndiploid;
char *prior_file;
ccall_t *cdat;

Expand Down Expand Up @@ -149,7 +143,7 @@ void qcall_destroy(call_t *call);
void call_init_pl2p(call_t *call);
uint32_t *call_trio_prep(int is_x, int is_son);

void init_allele_trimming_maps(call_t *call, int als, int nals);
void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als);
void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out);
void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new);

#endif
4 changes: 2 additions & 2 deletions ccall.c
Original file line number Diff line number Diff line change
Expand Up @@ -303,8 +303,8 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double
// trim Number=R tags
int out_als = 0;
for (i=0; i<nals; i++) out_als |= 1<<i;
init_allele_trimming_maps(call, out_als, nals_ori);
mcall_trim_numberR(call, rec, nals_ori, nals, out_als);
init_allele_trimming_maps(call, nals_ori, out_als);
mcall_trim_and_update_numberR(call, rec, nals_ori, nals);

return is_var;
}
Expand Down
38 changes: 26 additions & 12 deletions doc/bcftools.1
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 2020-09-22
.\" Date: 2020-11-25 16:08 GMT
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "BCFTOOLS" "1" "2020\-09\-22" "\ \&" "\ \&"
.TH "BCFTOOLS" "1" "2020\-11\-25 16:08 GMT" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
Expand Down Expand Up @@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
.SS "VERSION"
.sp
This manual page was last updated \fB2020\-09\-22\fR and refers to bcftools git version \fB1\&.11\fR\&.
This manual page was last updated \fB2020\-11\-25 16:08 GMT\fR and refers to bcftools git version \fB1\&.11\-24\-g9718479+\fR\&.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
Expand Down Expand Up @@ -664,6 +664,15 @@ See also the
option\&.
.RE
.PP
\fB\-C, \-\-columns\-file\fR \fIfile\fR
.RS 4
Read the list of columns from a file (normally given via the
\fB\-c, \-\-columns\fR
option)\&. "\-" to skip a column of the annotation file\&. One column name per row, an additional space\- or tab\-separated field can be present to indicate the merge logic (normally given via the
\fB\-l, \-\-merge\-logic\fR
option)\&. This is useful when many annotations are added at once\&.
.RE
.PP
\fB\-e, \-\-exclude\fR \fIEXPRESSION\fR
.RS 4
exclude sites for which
Expand Down Expand Up @@ -730,11 +739,11 @@ and
expressions instead of discarding them
.RE
.PP
\fB\-l, \-\-merge\-logic\fR \fItag\fR:\*(Aqfirst\*(Aq|\fIappend\fR|\fIunique\fR|\fIsum\fR|\fIavg\fR|\fImin\fR|\fImax\fR[,\&...]
\fB\-l, \-\-merge\-logic\fR \fItag\fR:\*(Aqfirst\*(Aq|\fIappend\fR|\fIappend\-missing\fR|\fIunique\fR|\fIsum\fR|\fIavg\fR|\fImin\fR|\fImax\fR[,\&...]
.RS 4
if multiple regions overlap a single record, the option defines how to treat multiple annotation values when setting
\fItag\fR
in the destination file: use the first encountered value ignoring the rest (\fIfirst\fR); append allowing duplicates (\fIappend\fR); append discarding duplicate values (\fIunique\fR); sum the values (\fIsum\fR, numeric fields only); average the values (\fIavg\fR); use the minimum value (\fImin\fR) or the maximum (\fImax\fR)\&.
in the destination file: use the first encountered value ignoring the rest (\fIfirst\fR); append allowing duplicates (\fIappend\fR); append even if the appended value is missing, i\&.e\&. is a dot (\fIappend\-missing\fR); append discarding duplicate values (\fIunique\fR); sum the values (\fIsum\fR, numeric fields only); average the values (\fIavg\fR); use the minimum value (\fImin\fR) or the maximum (\fImax\fR)\&.

Note that this option is intended for use with BED or TAB\-delimited annotation files only\&. Moreover, it is effective only when either
\fIREF\fR
Expand Down Expand Up @@ -787,6 +796,12 @@ see
\fBCommon Options\fR
.RE
.PP
\fB\-\-rename\-annots\fR \fIfile\fR
.RS 4
rename annotations according to the map in
\fIfile\fR, with "old_name new_name\en" pairs separated by whitespaces, each on a separate line\&. The old name must be prefixed with the annotation type: INFO, FORMAT, or FILTER\&.
.RE
.PP
\fB\-\-rename\-chrs\fR \fIfile\fR
.RS 4
rename chromosomes according to the map in
Expand Down Expand Up @@ -976,7 +991,7 @@ output all alternate alleles present in the alignments even if they do not appea
.PP
\fB\-f, \-\-format\-fields\fR \fIlist\fR
.RS 4
comma\-separated list of FORMAT fields to output for each sample\&. Currently GQ and GP fields are supported\&. For convenience, the fields can be given as lower case letters\&.
comma\-separated list of FORMAT fields to output for each sample\&. Currently GQ and GP fields are supported\&. For convenience, the fields can be given as lower case letters\&. Prefixed with "^" indicates a request for tag removal of auxiliary tags useful only for calling\&.
.RE
.PP
\fB\-F, \-\-prior\-freqs\fR \fIAN\fR,\fIAC\fR
Expand Down Expand Up @@ -1015,11 +1030,10 @@ is a tab\-delimited text file with sample names in the first column and group na
\fI\-\fR
is given instead, no HWE assumption is made at all and single\-sample calling is performed\&. (Note that in low coverage data this inflates the rate of false positives\&.) The
\fB\-G\fR
option requires the presence of FORMAT/AD generated at the
\fBbcftools mpileup\fR
step by providing the
\fB\-a FMT/AD\fR
option\&.
option requires the presence of per\-sample FORMAT/QS or FORMAT/AD tag generated with
\fBbcftools mpileup \-a QS\fR
(or
\fB\-a AD\fR)\&.
.RE
.PP
\fB\-g, \-\-gvcf\fR \fIINT\fR
Expand Down Expand Up @@ -2279,7 +2293,7 @@ Homozygous genotypes only, useful with low coverage data (requires
.PP
\fB\-\-n\-matches\fR \fIINT\fR
.RS 4
Print only top INT matches for each sample, 0 for unlimited\&. Use negative value to sort by HWE probability rather than the number of discordant sites\&.
Print only top INT matches for each sample, 0 for unlimited\&. Use negative value to sort by HWE probability rather than the number of discordant sites\&. Note that average score is used to determine the top matches, not absolute values\&.
.RE
.PP
\fB\-\-no\-HWE\-prob\fR
Expand Down
34 changes: 26 additions & 8 deletions doc/bcftools.html
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp144160"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25199360"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
Call Format (VCF) and its binary counterpart BCF. All commands work
transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
automatically even when streaming from a pipe. Indexed VCF and BCF
Expand All @@ -10,7 +10,7 @@
(Note that files with non-standard index names can be accessed as e.g.
"<code class="literal">bcftools view -r X:2928329 file.vcf.gz##idx##non-standard-index-name</code>".)</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2020-09-22</strong></span> and refers to bcftools git version <span class="strong"><strong>1.11</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2020-11-25 16:08 GMT</strong></span> and refers to bcftools git version <span class="strong"><strong>1.11-24-g9718479+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions &lt;= 0.1.19 to convert to VCF, which can then be read by
Expand Down Expand Up @@ -286,6 +286,14 @@

See also the <span class="strong"><strong>-l, --merge-logic</strong></span> option.
</dd><dt><span class="term">
<span class="strong"><strong>-C, --columns-file</strong></span> <span class="emphasis"><em>file</em></span>
</span></dt><dd>
Read the list of columns from a file (normally given via the <span class="strong"><strong>-c, --columns</strong></span> option).
"-" to skip a column of the annotation file.
One column name per row, an additional space- or tab-separated field can
be present to indicate the merge logic (normally given via the <span class="strong"><strong>-l, --merge-logic</strong></span> option).
This is useful when many annotations are added at once.
</dd><dt><span class="term">
<span class="strong"><strong>-e, --exclude</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
</span></dt><dd>
exclude sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
Expand Down Expand Up @@ -318,11 +326,12 @@
</span></dt><dd>
keep sites which do not pass <span class="strong"><strong>-i</strong></span> and <span class="strong"><strong>-e</strong></span> expressions instead of discarding them
</dd><dt><span class="term">
<span class="strong"><strong>-l, --merge-logic</strong></span> <span class="emphasis"><em>tag</em></span>:'first'|<span class="emphasis"><em>append</em></span>|<span class="emphasis"><em>unique</em></span>|<span class="emphasis"><em>sum</em></span>|<span class="emphasis"><em>avg</em></span>|<span class="emphasis"><em>min</em></span>|<span class="emphasis"><em>max</em></span>[,…]
<span class="strong"><strong>-l, --merge-logic</strong></span> <span class="emphasis"><em>tag</em></span>:'first'|<span class="emphasis"><em>append</em></span>|<span class="emphasis"><em>append-missing</em></span>|<span class="emphasis"><em>unique</em></span>|<span class="emphasis"><em>sum</em></span>|<span class="emphasis"><em>avg</em></span>|<span class="emphasis"><em>min</em></span>|<span class="emphasis"><em>max</em></span>[,…]
</span></dt><dd>
if multiple regions overlap a single record, the option defines how to treat multiple
annotation values when setting <span class="emphasis"><em>tag</em></span> in the destination file: use the first encountered value ignoring
the rest (<span class="emphasis"><em>first</em></span>); append allowing duplicates (<span class="emphasis"><em>append</em></span>); append discarding duplicate values (<span class="emphasis"><em>unique</em></span>);
the rest (<span class="emphasis"><em>first</em></span>); append allowing duplicates (<span class="emphasis"><em>append</em></span>); append even if the appended value is missing,
i.e. is a dot (<span class="emphasis"><em>append-missing</em></span>); append discarding duplicate values (<span class="emphasis"><em>unique</em></span>);
sum the values (<span class="emphasis"><em>sum</em></span>, numeric fields only); average the values (<span class="emphasis"><em>avg</em></span>); use the minimum value (<span class="emphasis"><em>min</em></span>) or
the maximum (<span class="emphasis"><em>max</em></span>).

Expand Down Expand Up @@ -356,6 +365,13 @@
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
</dd><dt><span class="term">
<span class="strong"><strong>--rename-annots</strong></span> <span class="emphasis"><em>file</em></span>
</span></dt><dd>
rename annotations according to the map in <span class="emphasis"><em>file</em></span>, with
"old_name new_name\n" pairs separated by whitespaces, each on a separate
line. The old name must be prefixed with the annotation type:
INFO, FORMAT, or FILTER.
</dd><dt><span class="term">
<span class="strong"><strong>--rename-chrs</strong></span> <span class="emphasis"><em>file</em></span>
</span></dt><dd>
rename chromosomes according to the map in <span class="emphasis"><em>file</em></span>, with
Expand Down Expand Up @@ -490,7 +506,8 @@
</span></dt><dd>
comma-separated list of FORMAT fields to output for each sample. Currently
GQ and GP fields are supported. For convenience, the fields can be given
as lower case letters.
as lower case letters. Prefixed with "^" indicates a request for tag
removal of auxiliary tags useful only for calling.
</dd><dt><span class="term">
<span class="strong"><strong>-F, --prior-freqs</strong></span> <span class="emphasis"><em>AN</em></span>,<span class="emphasis"><em>AC</em></span>
</span></dt><dd>
Expand All @@ -510,14 +527,14 @@

# Now before calling, stream the raw mpileup output through `bcftools annotate` to add the frequencies
bcftools mpileup [...] -Ou | bcftools annotate -a AFs.tab.gz -h AFs.hdr -c CHROM,POS,REF,ALT,REF_AN,REF_AC -Ou | bcftools call -mv -F REF_AN,REF_AC [...]</pre><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>-G, --group-samples</strong></span> <span class="emphasis"><em>FILE</em></span>|<span class="emphasis"><em>-</em></span>
<span class="strong"><strong>-G, --group-samples</strong></span> <span class="emphasis"><em><span class="TAG:">FILE</span></em></span>|<span class="emphasis"><em>-</em></span>
</span></dt><dd>
by default, all samples are assumed to come from a single population. This option allows to group samples
into populations and apply the HWE assumption within but not across the populations. <span class="emphasis"><em>FILE</em></span> is a tab-delimited
text file with sample names in the first column and group names in the second column. If <span class="emphasis"><em>-</em></span> is
given instead, no HWE assumption is made at all and single-sample calling is performed. (Note that
in low coverage data this inflates the rate of false positives.) The <span class="strong"><strong>-G</strong></span> option requires the presence of
FORMAT/AD generated at the <span class="strong"><strong>bcftools mpileup</strong></span> step by providing the <span class="strong"><strong>-a FMT/AD</strong></span> option.
per-sample FORMAT/QS or FORMAT/AD tag generated with <span class="strong"><strong>bcftools mpileup -a QS</strong></span> (or <span class="strong"><strong>-a AD</strong></span>).
</dd><dt><span class="term">
<span class="strong"><strong>-g, --gvcf</strong></span> <span class="emphasis"><em>INT</em></span>
</span></dt><dd>
Expand Down Expand Up @@ -1361,7 +1378,8 @@
<span class="strong"><strong>--n-matches</strong></span> <span class="emphasis"><em>INT</em></span>
</span></dt><dd>
Print only top INT matches for each sample, 0 for unlimited. Use negative value
to sort by HWE probability rather than the number of discordant sites.
to sort by HWE probability rather than the number of discordant sites. Note
that average score is used to determine the top matches, not absolute values.
</dd><dt><span class="term">
<span class="strong"><strong>--no-HWE-prob</strong></span>
</span></dt><dd>
Expand Down
Loading

0 comments on commit 0f4aed2

Please sign in to comment.