Resolved merged conflicts

dlaehnemann · Sep 2, 2014 · 34f212f · 34f212f
2 parents b0ac420 + 6ad8058
commit 34f212f
Show file tree

Hide file tree

Showing 27 changed files with 679 additions and 44 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -1,6 +1,13 @@
-Petr Danecek <petr.danecek@sanger.ac.uk>
-Heng Li <lh3@live.co.uk>
-Shane McCarthy <sm15@sanger.ac.uk>
-Nicholas Clarke <nc6@sanger.ac.uk>
-John Marshall <jm18@sanger.ac.uk>
+BCFtools package is currently maintained by
+Petr Danecek, Shane McCarthy and John Marshall.
+
+Alphabetical list of people who have made contributions:
+
+    Nicholas Clarke
+    Petr Danecek <petr.danecek@sanger.ac.uk>
+    Heng Li
+    Shane McCarthy <sm15@sanger.ac.uk>
+    John Marshall <jm18@sanger.ac.uk>
+    Joel Martin
+    Stephan Schiffels
 
diff --git a/Makefile b/Makefile
@@ -41,7 +41,7 @@ DFLAGS   =
 OBJS     = main.o vcfindex.o tabix.o \
            vcfstats.o vcfisec.o vcfmerge.o vcfquery.o vcffilter.o filter.o vcfsom.o \
            vcfnorm.o vcfgtcheck.o vcfview.o vcfannotate.o vcfroh.o vcfconcat.o \
-           vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o HMM.o \
+           vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o HMM.o tsv2vcf.o \
            ccall.o em.o prob1.o kmin.o # the original samtools calling
 INCLUDES = -I. -I$(HTSDIR)
 
@@ -92,14 +92,15 @@ plugins: $(PLUGINS)
 bcftools_h = bcftools.h $(htslib_vcf_h)
 call_h = call.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) vcmp.h
 convert_h = convert.h $(htslib_vcf_h)
+tsv2vcf_h = tsv2vcf.h $(htslib_vcf_h)
 filter_h = filter.h $(htslib_vcf_h)
 prob1_h = prob1.h $(htslib_vcf_h) $(call_h)
 
 main.o: main.c $(htslib_hts_h) version.h $(bcftools_h)
 vcfannotate.o: vcfannotate.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(HTSDIR)/htslib/kseq.h $(bcftools_h) vcmp.h $(filter_h)
 vcfcall.o: vcfcall.c $(htslib_vcf_h) $(HTSDIR)/htslib/kfunc.h $(htslib_synced_bcf_reader_h) $(bcftools_h) $(call_h) $(prob1_h)
 vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(HTSDIR)/htslib/kseq.h $(bcftools_h)
-vcfconvert.o: vcfconvert.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h)
+vcfconvert.o: vcfconvert.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h)
 vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h
 vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
 vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h)
@@ -115,6 +116,7 @@ reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(HTSDIR)/htslib/kseq.h
 tabix.o: tabix.c $(htslib_bgzf_h) $(htslib_tbx_h)
 ccall.o: ccall.c $(HTSDIR)/htslib/kfunc.h $(call_h) kmin.h $(prob1_h)
 convert.o: convert.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(convert_h)
+tsv2vcf.o: tsv2vcf.c $(tsv2vcf_h)
 em.o: em.c $(htslib_vcf_h) kmin.h $(call_h)
 filter.o: filter.c $(HTSDIR)/htslib/khash_str2int.h $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h)
 gvcf.o: gvcf.c $(call_h)

diff --git a/bcftools.1 b/bcftools.1
@@ -2,12 +2,12 @@
 .\"     Title: bcftools
 .\"    Author: [see the "AUTHORS" section]
 .\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
-.\"      Date: 08/26/2014
+.\"      Date: 09/02/2014
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "BCFTOOLS" "1" "08/26/2014" "\ \&" "\ \&"
+.TH "BCFTOOLS" "1" "09/02/2014" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Define some portability stuff
 .\" -----------------------------------------------------------------
@@ -881,6 +881,27 @@ see
 .nr an-break-flag 1
 .br
 .ps +1
+\fBVCF output options:\fR
+.RS 4
+.PP
+\fB\-o, \-\-output\fR \fIFILE\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.PP
+\fB\-O, \-\-output\-type\fR \fIb\fR|\fIu\fR|\fIz\fR|\fIv\fR
+.RS 4
+see
+\fBCommon Options\fR
+.RE
+.RE
+.sp
+.it 1 an-trap
+.nr an-no-space-flag 1
+.nr an-break-flag 1
+.br
+.ps +1
 \fBgen/sample options:\fR
 .RS 4
 .PP
@@ -914,6 +935,35 @@ convert from VCF to gen/sample format used by IMPUTE2 and SHAPEIT\&. The columns
 tag to take values for \&.gen file: GT,PL,GL,GP
 .RE
 .RE
+.sp
+.it 1 an-trap
+.nr an-no-space-flag 1
+.nr an-break-flag 1
+.br
+.ps +1
+\fBtsv options:\fR
+.RS 4
+.PP
+\fB\-\-tsv2vcf\fR \fIfile\fR
+.RS 4
+convert from TSV (tab\-separated values) format to VCF\&. The input file fields can be tab\- or space\- delimited
+.RE
+.PP
+\fB\-c, \-\-columns\fR \fIlist\fR
+.RS 4
+comma\-separated list of fields in the input file\&. In the current version, the fields CHROM, POS, ID, and AA are expected and can appear in arbitrary order\&. The AA field lists alleles on the forward reference strand, for example "CC" or "CT" for diploid genotypes or "C" for haploid genotypes (sex chromosomes)\&. Insertions and deletions are not supported yet\&. Columns which should be ignored in the input file can be indicated by "\-"\&.
+.RE
+.PP
+\fB\-f, \-\-ref\fR \fIfile\fR
+.RS 4
+reference sequence in fasta format
+.RE
+.PP
+\fB\-s, \-\-sample\fR \fIstring\fR
+.RS 4
+sample name
+.RE
+.RE
 .SS "bcftools filter \fI[OPTIONS]\fR \fIFILE\fR"
 .sp
 Apply fixed\-threshold filters\&.
@@ -1572,6 +1622,7 @@ process multiple VCFs listed in the file
 []              The brackets loop over all samples
 %GT             Genotype (e\&.g\&. 0/1)
 %TGT            Translated genotype (e\&.g\&. C/A)
+%IUPACGT        Genotype translated to IUPAC ambiguity codes (e\&.g\&. M instead of C/A)
 %LINE           Prints the whole line
 %SAMPLE         Sample name
 .fi

diff --git a/bcftools.txt b/bcftools.txt
@@ -497,6 +497,13 @@ the *-a, --allow-overlaps* option is specified.
 *-T, --targets-file* 'FILE'::
     see *<<common_options,Common Options>>*
 
+==== VCF output options:
+
+*-o, --output* 'FILE'::
+    see *<<common_options,Common Options>>*
+
+*-O, --output-type* 'b'|'u'|'z'|'v'::
+    see *<<common_options,Common Options>>*
 
 ==== gen/sample options:
 *-g, --gensample* 'prefix' or 'gen-file','sample-file'::
@@ -522,6 +529,27 @@ the *-a, --allow-overlaps* option is specified.
 *--tag* 'STRING'::
     tag to take values for .gen file: GT,PL,GL,GP
 
+==== tsv options:
+*--tsv2vcf* 'file'::
+    convert from TSV (tab-separated values) format to VCF. The 
+    input file fields can be tab- or space- delimited
+
+*-c, --columns* 'list'::
+    comma-separated list of fields in the input file. In the current
+    version, the fields CHROM, POS, ID, and AA are expected and 
+    can appear in arbitrary order.
+    The AA field lists alleles on the forward reference strand,
+    for example "CC" or "CT" for diploid genotypes or "C"
+    for haploid genotypes (sex chromosomes). Insertions and deletions
+    are not supported yet.
+    Columns which should be ignored in the input file can be indicated by "-".
+
+*-f, --ref* 'file'::
+    reference sequence in fasta format
+
+*-s, --sample* 'string'::
+    sample name
+
 
 [[filter]]
 === bcftools filter '[OPTIONS]' 'FILE'
@@ -915,6 +943,7 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format.
     []              The brackets loop over all samples
     %GT             Genotype (e.g. 0/1)
     %TGT            Translated genotype (e.g. C/A)
+    %IUPACGT        Genotype translated to IUPAC ambiguity codes (e.g. M instead of C/A)
     %LINE           Prints the whole line
     %SAMPLE         Sample name
 

diff --git a/convert.c b/convert.c
@@ -58,6 +58,7 @@ THE SOFTWARE.  */
 #define T_GT_TO_PROB3  19   // not publicly advertised
 #define T_PL_TO_PROB3  20   // not publicly advertised
 #define T_FIRST_ALT    21   // not publicly advertised
+#define T_IUPAC_GT     22 
 
 typedef struct _fmt_t
 {
@@ -156,9 +157,15 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
 
     if ( info->len == 1 )
     {
-        if ( info->type == BCF_BT_FLOAT ) ksprintf(str, "%g", info->v1.f);
-        else if ( info->type != BCF_BT_CHAR ) kputw(info->v1.i, str);
-        else kputc(info->v1.i, str);
+        switch (info->type)
+        {
+            case BCF_BT_INT8:  if ( info->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
+            case BCF_BT_INT16: if ( info->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
+            case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
+            case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break;
+            case BCF_BT_CHAR:  kputc(info->v1.i, str); break;
+            default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
+        }
     }
     else if ( fmt->subscript >=0 )
     {
@@ -167,9 +174,20 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
             kputc('.', str);
             return;
         }
-        if ( info->type == BCF_BT_FLOAT ) ksprintf(str, "%g", ((float*)info->vptr)[fmt->subscript]);
-        else if ( info->type != BCF_BT_CHAR ) kputw(bcf_array_ivalue(info->vptr,info->type,fmt->subscript), str);
-        else error("TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type);
+        #define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
+            type_t val = ((type_t *) info->vptr)[fmt->subscript]; \
+            if ( is_missing || is_vector_end ) kputc('.',str); \
+            else kprint; \
+        }
+        switch (info->type)
+        {
+            case BCF_BT_INT8:  BRANCH(int8_t,  val==bcf_int8_missing,  val==bcf_int8_vector_end,  kputw(val, str)); break;
+            case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
+            case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
+            case BCF_BT_FLOAT: BRANCH(float,   bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break;
+            default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
+        }
+        #undef BRANCH
     }
     else
         bcf_fmt_array(str, info->len, info->type, info->vptr);
@@ -248,6 +266,78 @@ static void process_tgt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isampl
     }
     if (l == 0) kputc('.', str);
 }
+static void init_format_iupac(convert_t *convert, bcf1_t *line, fmt_t *fmt)
+{
+    init_format(convert, line, fmt);
+    if ( fmt->fmt==NULL ) return;
+
+    // Init mapping between alleles and IUPAC table
+    hts_expand(uint8_t, line->n_allele, convert->ndat, convert->dat);
+    int8_t *dat = (int8_t*)convert->dat;
+    int i;
+    for (i=0; i<line->n_allele; i++)
+    {
+        if ( line->d.allele[i][1] ) dat[i] = -1;
+        else
+        {
+            switch (line->d.allele[i][0])
+            {
+                case 'A': dat[i] = 0; break;
+                case 'C': dat[i] = 1; break;
+                case 'G': dat[i] = 2; break;
+                case 'T': dat[i] = 3; break;
+                case 'a': dat[i] = 0; break;
+                case 'c': dat[i] = 1; break;
+                case 'g': dat[i] = 2; break;
+                case 't': dat[i] = 3; break;
+                default: dat[i] = -1;
+            }
+        }
+    }
+}
+static void process_iupac_gt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+    if ( !fmt->ready )
+        init_format_iupac(convert, line, fmt);
+
+    if ( fmt->fmt==NULL )
+    {
+        kputc('.', str);
+        return;
+    }
+
+    assert( fmt->fmt->type==BCF_BT_INT8 );
+
+    static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} };
+    int8_t *dat = (int8_t*)convert->dat;
+
+    int8_t *x = (int8_t*)(fmt->fmt->p + isample*fmt->fmt->size); // FIXME: does not work with n_alt >= 64
+    int l = 0;
+    while ( l<fmt->fmt->n && x[l]!=bcf_int8_vector_end && x[l]!=bcf_int8_missing ) l++;
+
+    if ( l==2 )
+    {
+        // diploid
+        int ia = (x[0]>>1) - 1, ib = (x[1]>>1) - 1;
+        if ( ia>=0 && ia<line->n_allele && ib>=0 && ib<line->n_allele && dat[ia]>=0 && dat[ib]>=0 )
+        {
+            kputc(iupac[dat[ia]][dat[ib]], str);
+            return;
+        }
+    }
+    for (l = 0; l < fmt->fmt->n && x[l] != bcf_int8_vector_end; ++l)
+    {
+        if (l) kputc("/|"[x[l]&1], str);
+        if (x[l]>>1)
+        {
+            int ial = (x[l]>>1) - 1;
+            kputs(line->d.allele[ial], str);
+        }
+        else
+            kputc('.', str);
+    }
+    if (l == 0) kputc('.', str);
+}
 static void process_sample(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
 {
     kputs(convert->header->samples[isample], str);
@@ -447,6 +537,7 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
         case T_MASK: fmt->handler = NULL; break;
         case T_GT: fmt->handler = &process_gt; convert->max_unpack |= BCF_UN_FMT; break;
         case T_TGT: fmt->handler = &process_tgt; convert->max_unpack |= BCF_UN_FMT; break;
+        case T_IUPAC_GT: fmt->handler = &process_iupac_gt; convert->max_unpack |= BCF_UN_FMT; break;
         case T_LINE: fmt->handler = &process_line; break;
         default: error("TODO: handler for type %d\n", fmt->type);
     }
@@ -485,6 +576,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
         if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf);
         else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf);
         else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf);
+        else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf);
         else
         {
             fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf);

diff --git a/filter.c b/filter.c
@@ -985,8 +985,9 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic)    // log
             while ( b<bend && *b ) b++;
             if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
             else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
-            if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
-            if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+            if ( logic!=TOK_EQ )
+                atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
+            pass_site |= atok->pass_samples[i];
         }
         if ( !atok->nsamples ) atok->nsamples = btok->nsamples;
     }
@@ -1035,8 +1036,9 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic)    // log
             while ( y<yend && *y ) y++;
             if ( x-xstr != y-ystr ) atok->pass_samples[i] = 0;
             else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0;
-            if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
-            if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
+            if ( logic!=TOK_EQ )
+                atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
+            pass_site |= atok->pass_samples[i];
         }
         if ( !atok->nsamples )
             atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set

diff --git a/test/check.chk b/test/check.chk
@@ -4,6 +4,7 @@
 # SN, Summary numbers:
 # SN	[2]id	[3]key	[4]value
 SN	0	number of samples:	2
+SN	0	number of records:	18
 SN	0	number of SNPs:	5
 SN	0	number of MNPs:	1
 SN	0	number of indels:	9

diff --git a/test/mpileup.1.out b/test/mpileup.1.out
@@ -14,8 +14,6 @@
 ##INFO=<ID=MQSB,Number=1,Type=Float,Description="Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)">
 ##INFO=<ID=SGB,Number=1,Type=Float,Description="Segregation based metric.">
 ##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
-##INFO=<ID=I16,Number=16,Type=Float,Description="Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h">
-##INFO=<ID=QS,Number=R,Type=Float,Description="Auxiliary tag used for calling">
 ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
 ##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of high-quality non-reference bases">

diff --git a/test/mpileup.2.out b/test/mpileup.2.out
@@ -14,8 +14,6 @@
 ##INFO=<ID=MQSB,Number=1,Type=Float,Description="Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)">
 ##INFO=<ID=SGB,Number=1,Type=Float,Description="Segregation based metric.">
 ##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
-##INFO=<ID=I16,Number=16,Type=Float,Description="Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h">
-##INFO=<ID=QS,Number=R,Type=Float,Description="Auxiliary tag used for calling">
 ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
 ##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of high-quality non-reference bases">

diff --git a/test/mpileup.cAls.out b/test/mpileup.cAls.out
@@ -14,8 +14,6 @@
 ##INFO=<ID=MQSB,Number=1,Type=Float,Description="Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)">
 ##INFO=<ID=SGB,Number=1,Type=Float,Description="Segregation based metric.">
 ##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
-##INFO=<ID=I16,Number=16,Type=Float,Description="Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h">
-##INFO=<ID=QS,Number=R,Type=Float,Description="Auxiliary tag used for calling">
 ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
 ##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of high-quality non-reference bases">