[NEWS] Revamp of bcftools call -G

Sample grouping by population was not truly independent and could still be influenced by the presence of other sample groups. With this commit, -G should work as expected. Other changes are: - Optional addition of INFO/PV4 annotation with `call -a INFO/PV4` - Remove generation of useless HOB and ICB annotation; use +fill-tags -- -t HWE,ExcHet` instead - The `call -f` option was renamed to `-a` to (1) make it consistent with `mpileup` and (2) to indicate that it includes both INFO and FORMAT annotations, not just FORMAT as previously - Any sensible Number=R,Type=Integer annotation can be used with -G, such as AD or QS - Don't trim QUAL; although usefuleness of this change is questionable for true probabilistic interpretation (such high precision is unrealistic), using QUAL as a score rather than probability *is* helpful and permits more fine-grained filtering - Fix a suspected bug in `call -F` in the worst case, for certain improve readability - Note that `call -C trio` is temporarily disabled by this commit Fixes samtools#1332
overerd · Dec 7, 2020 · 0f4aed2 · 0f4aed2
1 parent 07c4308
commit 0f4aed2
Show file tree

Hide file tree

Showing 43 changed files with 852 additions and 711 deletions.
diff --git a/call.h b/call.h
@@ -34,7 +34,7 @@ THE SOFTWARE.  */
 #define CALL_CONSTR_TRIO    (1<<2)
 #define CALL_CONSTR_ALLELES (1<<3)
 //
-//
+#define CALL_FMT_PV4        (1<<5)
 #define CALL_FMT_GQ         (1<<6)
 #define CALL_FMT_GP         (1<<7)
 
@@ -52,18 +52,13 @@ family_t;
 // For the single-sample and grouped -G calling
 typedef struct
 {
+    double ref_lk, max_lk, lk_sum;
     float *qsum;    // QS(quality sum) values
-    int nqsum, dp;
-    double fa,fb,fc,fa2,fb2,fc2,fab,fac,fbc;
-}
-grp1_t;
-typedef struct
-{
-    grp1_t *grp;
-    int ngrp;
-    int *smpl2grp;
+    int nqsum;
+    uint32_t *smpl, nsmpl;
+    uint32_t nals, als;
 }
-grp_t;
+smpl_grp_t;
 
 // For the `-C alleles -i` constrained calling
 typedef struct
@@ -82,6 +77,7 @@ typedef struct
     int *pl_map, npl_map;   // same as above for PLs, but reverse (new -> old)
     char **als;             // array to hold the trimmed set of alleles to appear on output
     int nals;               // size of the als array
+    int als_new, nals_new;  // bitmask with final alleles and their number
     family_t *fams;         // list of families and samples for trio calling
     int nfams, mfams;
     int ntrio[5][5];        // possible trio genotype combinations and their counts; first idx:
@@ -96,18 +92,16 @@ typedef struct
     int32_t *ugts, *cgts;   // unconstraind and constrained GTs
     uint32_t output_tags;
     char *prior_AN, *prior_AC;  // reference panel AF tags (AF=AC/AN)
-    tgt_als_t *tgt_als;     // for CALL_CONSTR_ALLELES
-    char *sample_groups;    // for single-sample or grouped calling with -G
-    grp_t smpl_grp;
-    float *qsum;
-    int nqsum;
+    tgt_als_t *tgt_als;         // for CALL_CONSTR_ALLELES
+    char *sample_groups;        // for single-sample or grouped calling with -G
+    char *sample_groups_tag;    // for -G [AD|QS:]
+    smpl_grp_t *smpl_grp;
+    int nsmpl_grp;
 
     // ccall only
     double indel_frac, min_perm_p, min_lrt;
     double prior_type, pref;
-    double ref_lk, lk_sum;
     int ngrp1_samples, n_perm;
-    int nhets, ndiploid;
     char *prior_file;
     ccall_t *cdat;
 
@@ -149,7 +143,7 @@ void qcall_destroy(call_t *call);
 void call_init_pl2p(call_t *call);
 uint32_t *call_trio_prep(int is_x, int is_son);
 
-void init_allele_trimming_maps(call_t *call, int als, int nals);
-void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als);
+void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out);
+void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new);
 
 #endif
diff --git a/ccall.c b/ccall.c
@@ -303,8 +303,8 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double
     // trim Number=R tags
     int out_als = 0;
     for (i=0; i<nals; i++) out_als |= 1<<i;
-    init_allele_trimming_maps(call, out_als, nals_ori);
-    mcall_trim_numberR(call, rec, nals_ori, nals, out_als);
+    init_allele_trimming_maps(call, nals_ori, out_als);
+    mcall_trim_and_update_numberR(call, rec, nals_ori, nals);
 
     return is_var;
 }

diff --git a/doc/bcftools.1 b/doc/bcftools.1
@@ -2,12 +2,12 @@
 .\"     Title: bcftools
 .\"    Author: [see the "AUTHORS" section]
 .\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
-.\"      Date: 2020-09-22
+.\"      Date: 2020-11-25 16:08 GMT
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "BCFTOOLS" "1" "2020\-09\-22" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "2020\-11\-25 16:08 GMT" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Define some portability stuff
 .\" -----------------------------------------------------------------
@@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
 BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
 .SS "VERSION"
 .sp
-This manual page was last updated \fB2020\-09\-22\fR and refers to bcftools git version \fB1\&.11\fR\&.
+This manual page was last updated \fB2020\-11\-25 16:08 GMT\fR and refers to bcftools git version \fB1\&.11\-24\-g9718479+\fR\&.
 .SS "BCF1"
 .sp
 The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
@@ -664,6 +664,15 @@ See also the
 option\&.
 .RE
 .PP
+\fB\-C, \-\-columns\-file\fR \fIfile\fR
+.RS 4
+Read the list of columns from a file (normally given via the
+\fB\-c, \-\-columns\fR
+option)\&. "\-" to skip a column of the annotation file\&. One column name per row, an additional space\- or tab\-separated field can be present to indicate the merge logic (normally given via the
+\fB\-l, \-\-merge\-logic\fR
+option)\&. This is useful when many annotations are added at once\&.
+.RE
+.PP
 \fB\-e, \-\-exclude\fR \fIEXPRESSION\fR
 .RS 4
 exclude sites for which
@@ -730,11 +739,11 @@ and
 expressions instead of discarding them
 .RE
 .PP
-\fB\-l, \-\-merge\-logic\fR \fItag\fR:\*(Aqfirst\*(Aq|\fIappend\fR|\fIunique\fR|\fIsum\fR|\fIavg\fR|\fImin\fR|\fImax\fR[,\&...]
+\fB\-l, \-\-merge\-logic\fR \fItag\fR:\*(Aqfirst\*(Aq|\fIappend\fR|\fIappend\-missing\fR|\fIunique\fR|\fIsum\fR|\fIavg\fR|\fImin\fR|\fImax\fR[,\&...]
 .RS 4
 if multiple regions overlap a single record, the option defines how to treat multiple annotation values when setting
 \fItag\fR
-in the destination file: use the first encountered value ignoring the rest (\fIfirst\fR); append allowing duplicates (\fIappend\fR); append discarding duplicate values (\fIunique\fR); sum the values (\fIsum\fR, numeric fields only); average the values (\fIavg\fR); use the minimum value (\fImin\fR) or the maximum (\fImax\fR)\&.
+in the destination file: use the first encountered value ignoring the rest (\fIfirst\fR); append allowing duplicates (\fIappend\fR); append even if the appended value is missing, i\&.e\&. is a dot (\fIappend\-missing\fR); append discarding duplicate values (\fIunique\fR); sum the values (\fIsum\fR, numeric fields only); average the values (\fIavg\fR); use the minimum value (\fImin\fR) or the maximum (\fImax\fR)\&.
 
 Note that this option is intended for use with BED or TAB\-delimited annotation files only\&. Moreover, it is effective only when either
 \fIREF\fR
@@ -787,6 +796,12 @@ see
 \fBCommon Options\fR
 .RE
 .PP
+\fB\-\-rename\-annots\fR \fIfile\fR
+.RS 4
+rename annotations according to the map in
+\fIfile\fR, with "old_name new_name\en" pairs separated by whitespaces, each on a separate line\&. The old name must be prefixed with the annotation type: INFO, FORMAT, or FILTER\&.
+.RE
+.PP
 \fB\-\-rename\-chrs\fR \fIfile\fR
 .RS 4
 rename chromosomes according to the map in
@@ -976,7 +991,7 @@ output all alternate alleles present in the alignments even if they do not appea
 .PP
 \fB\-f, \-\-format\-fields\fR \fIlist\fR
 .RS 4
-comma\-separated list of FORMAT fields to output for each sample\&. Currently GQ and GP fields are supported\&. For convenience, the fields can be given as lower case letters\&.
+comma\-separated list of FORMAT fields to output for each sample\&. Currently GQ and GP fields are supported\&. For convenience, the fields can be given as lower case letters\&. Prefixed with "^" indicates a request for tag removal of auxiliary tags useful only for calling\&.
 .RE
 .PP
 \fB\-F, \-\-prior\-freqs\fR \fIAN\fR,\fIAC\fR
@@ -1015,11 +1030,10 @@ is a tab\-delimited text file with sample names in the first column and group na
 \fI\-\fR
 is given instead, no HWE assumption is made at all and single\-sample calling is performed\&. (Note that in low coverage data this inflates the rate of false positives\&.) The
 \fB\-G\fR
-option requires the presence of FORMAT/AD generated at the
-\fBbcftools mpileup\fR
-step by providing the
-\fB\-a FMT/AD\fR
-option\&.
+option requires the presence of per\-sample FORMAT/QS or FORMAT/AD tag generated with
+\fBbcftools mpileup \-a QS\fR
+(or
+\fB\-a AD\fR)\&.
 .RE
 .PP
 \fB\-g, \-\-gvcf\fR \fIINT\fR
@@ -2279,7 +2293,7 @@ Homozygous genotypes only, useful with low coverage data (requires
 .PP
 \fB\-\-n\-matches\fR \fIINT\fR
 .RS 4
-Print only top INT matches for each sample, 0 for unlimited\&. Use negative value to sort by HWE probability rather than the number of discordant sites\&.
+Print only top INT matches for each sample, 0 for unlimited\&. Use negative value to sort by HWE probability rather than the number of discordant sites\&. Note that average score is used to determine the top matches, not absolute values\&.
 .RE
 .PP
 \fB\-\-no\-HWE\-prob\fR

diff --git a/doc/bcftools.html b/doc/bcftools.html
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp144160"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools  is  a set of utilities that manipulate variant calls in the Variant
+<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25199360"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools  is  a set of utilities that manipulate variant calls in the Variant
 Call Format (VCF) and its binary counterpart BCF. All commands work
 transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
 automatically even when streaming from a pipe. Indexed VCF and BCF
@@ -10,7 +10,7 @@
 (Note that files with non-standard index names can be accessed as e.g.
 "<code class="literal">bcftools view -r X:2928329 file.vcf.gz##idx##non-standard-index-name</code>".)</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
 standard input (stdin) and outputs to the standard output (stdout). Several
-commands can thus be  combined  with  Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2020-09-22</strong></span> and refers to bcftools git version <span class="strong"><strong>1.11</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
+commands can thus be  combined  with  Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2020-11-25 16:08 GMT</strong></span> and refers to bcftools git version <span class="strong"><strong>1.11-24-g9718479+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
 compatible with this version of bcftools. To read BCF1 files one can use
 the view command from old versions of bcftools packaged with samtools
 versions &lt;= 0.1.19 to convert to VCF, which can then be read by
@@ -286,6 +286,14 @@
 
     See also the <span class="strong"><strong>-l, --merge-logic</strong></span> option.
 </dd><dt><span class="term">
+<span class="strong"><strong>-C, --columns-file</strong></span> <span class="emphasis"><em>file</em></span>
+</span></dt><dd>
+    Read the list of columns from a file (normally given via the <span class="strong"><strong>-c, --columns</strong></span> option).
+    "-" to skip a column of the annotation file.
+    One column name per row, an additional space- or tab-separated field can
+    be present to indicate the merge logic (normally given via the <span class="strong"><strong>-l, --merge-logic</strong></span> option).
+    This is useful when many annotations are added at once.
+</dd><dt><span class="term">
 <span class="strong"><strong>-e, --exclude</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
 </span></dt><dd>
     exclude sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
@@ -318,11 +326,12 @@
 </span></dt><dd>
     keep sites which do not pass <span class="strong"><strong>-i</strong></span> and <span class="strong"><strong>-e</strong></span> expressions instead of discarding them
 </dd><dt><span class="term">
-<span class="strong"><strong>-l, --merge-logic</strong></span> <span class="emphasis"><em>tag</em></span>:'first'|<span class="emphasis"><em>append</em></span>|<span class="emphasis"><em>unique</em></span>|<span class="emphasis"><em>sum</em></span>|<span class="emphasis"><em>avg</em></span>|<span class="emphasis"><em>min</em></span>|<span class="emphasis"><em>max</em></span>[,…]
+<span class="strong"><strong>-l, --merge-logic</strong></span> <span class="emphasis"><em>tag</em></span>:'first'|<span class="emphasis"><em>append</em></span>|<span class="emphasis"><em>append-missing</em></span>|<span class="emphasis"><em>unique</em></span>|<span class="emphasis"><em>sum</em></span>|<span class="emphasis"><em>avg</em></span>|<span class="emphasis"><em>min</em></span>|<span class="emphasis"><em>max</em></span>[,…]
 </span></dt><dd>
     if multiple regions overlap a single record, the option defines how to treat multiple
     annotation values when setting <span class="emphasis"><em>tag</em></span> in the destination file: use the first encountered value ignoring
-    the rest (<span class="emphasis"><em>first</em></span>); append allowing duplicates (<span class="emphasis"><em>append</em></span>); append discarding duplicate values (<span class="emphasis"><em>unique</em></span>);
+    the rest (<span class="emphasis"><em>first</em></span>); append allowing duplicates (<span class="emphasis"><em>append</em></span>); append even if the appended value is missing,
+    i.e. is a dot (<span class="emphasis"><em>append-missing</em></span>); append discarding duplicate values (<span class="emphasis"><em>unique</em></span>);
     sum the values (<span class="emphasis"><em>sum</em></span>, numeric fields only); average the values (<span class="emphasis"><em>avg</em></span>); use the minimum value (<span class="emphasis"><em>min</em></span>) or
     the maximum (<span class="emphasis"><em>max</em></span>).
 
@@ -356,6 +365,13 @@
 </span></dt><dd>
     see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
 </dd><dt><span class="term">
+<span class="strong"><strong>--rename-annots</strong></span> <span class="emphasis"><em>file</em></span>
+</span></dt><dd>
+    rename annotations according to the map in <span class="emphasis"><em>file</em></span>, with
+    "old_name new_name\n" pairs separated by whitespaces, each on a separate
+    line. The old name must be prefixed with the annotation type:
+    INFO, FORMAT, or FILTER.
+</dd><dt><span class="term">
 <span class="strong"><strong>--rename-chrs</strong></span> <span class="emphasis"><em>file</em></span>
 </span></dt><dd>
     rename chromosomes according to the map in <span class="emphasis"><em>file</em></span>, with
@@ -490,7 +506,8 @@
 </span></dt><dd>
     comma-separated list of FORMAT fields to output for each sample. Currently
     GQ and GP fields are supported. For convenience, the fields can be given
-    as lower case letters.
+    as lower case letters. Prefixed with "^" indicates a request for tag
+    removal of auxiliary tags useful only for calling.
 </dd><dt><span class="term">
 <span class="strong"><strong>-F, --prior-freqs</strong></span> <span class="emphasis"><em>AN</em></span>,<span class="emphasis"><em>AC</em></span>
 </span></dt><dd>
@@ -510,14 +527,14 @@
 
     # Now before calling, stream the raw mpileup output through `bcftools annotate` to add the frequencies
     bcftools mpileup [...] -Ou | bcftools annotate -a AFs.tab.gz -h AFs.hdr -c CHROM,POS,REF,ALT,REF_AN,REF_AC -Ou | bcftools call -mv -F REF_AN,REF_AC [...]</pre><div class="variablelist"><dl><dt><span class="term">
-<span class="strong"><strong>-G, --group-samples</strong></span> <span class="emphasis"><em>FILE</em></span>|<span class="emphasis"><em>-</em></span>
+<span class="strong"><strong>-G, --group-samples</strong></span> <span class="emphasis"><em><span class="TAG:">FILE</span></em></span>|<span class="emphasis"><em>-</em></span>
 </span></dt><dd>
     by default, all samples are assumed to come from a single population. This option allows to group samples
     into populations and apply the HWE assumption within but not across the populations. <span class="emphasis"><em>FILE</em></span> is a tab-delimited
     text file with sample names in the first column and group names in the second column. If <span class="emphasis"><em>-</em></span> is
     given instead, no HWE assumption is made at all and single-sample calling is performed. (Note that
     in low coverage data this inflates the rate of false positives.) The <span class="strong"><strong>-G</strong></span> option requires the presence of
-    FORMAT/AD generated at the <span class="strong"><strong>bcftools mpileup</strong></span> step by providing the <span class="strong"><strong>-a FMT/AD</strong></span> option.
+    per-sample FORMAT/QS or FORMAT/AD tag generated with <span class="strong"><strong>bcftools mpileup -a QS</strong></span> (or <span class="strong"><strong>-a AD</strong></span>).
 </dd><dt><span class="term">
 <span class="strong"><strong>-g, --gvcf</strong></span> <span class="emphasis"><em>INT</em></span>
 </span></dt><dd>
@@ -1361,7 +1378,8 @@
 <span class="strong"><strong>--n-matches</strong></span> <span class="emphasis"><em>INT</em></span>
 </span></dt><dd>
     Print only top INT matches for each sample, 0 for unlimited. Use negative value
-    to sort by HWE probability rather than the number of discordant sites.
+    to sort by HWE probability rather than the number of discordant sites. Note
+    that average score is used to determine the top matches, not absolute values.
 </dd><dt><span class="term">
 <span class="strong"><strong>--no-HWE-prob</strong></span>
 </span></dt><dd>