Skip to content

Commit

Permalink
Resolved merged conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Sep 2, 2014
2 parents b0ac420 + 6ad8058 commit 34f212f
Show file tree
Hide file tree
Showing 27 changed files with 679 additions and 44 deletions.
17 changes: 12 additions & 5 deletions AUTHORS
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Petr Danecek <petr.danecek@sanger.ac.uk>
Heng Li <lh3@live.co.uk>
Shane McCarthy <sm15@sanger.ac.uk>
Nicholas Clarke <nc6@sanger.ac.uk>
John Marshall <jm18@sanger.ac.uk>
BCFtools package is currently maintained by
Petr Danecek, Shane McCarthy and John Marshall.

Alphabetical list of people who have made contributions:

Nicholas Clarke
Petr Danecek <petr.danecek@sanger.ac.uk>
Heng Li
Shane McCarthy <sm15@sanger.ac.uk>
John Marshall <jm18@sanger.ac.uk>
Joel Martin
Stephan Schiffels

6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ DFLAGS =
OBJS = main.o vcfindex.o tabix.o \
vcfstats.o vcfisec.o vcfmerge.o vcfquery.o vcffilter.o filter.o vcfsom.o \
vcfnorm.o vcfgtcheck.o vcfview.o vcfannotate.o vcfroh.o vcfconcat.o \
vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o HMM.o \
vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o HMM.o tsv2vcf.o \
ccall.o em.o prob1.o kmin.o # the original samtools calling
INCLUDES = -I. -I$(HTSDIR)

Expand Down Expand Up @@ -92,14 +92,15 @@ plugins: $(PLUGINS)
bcftools_h = bcftools.h $(htslib_vcf_h)
call_h = call.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) vcmp.h
convert_h = convert.h $(htslib_vcf_h)
tsv2vcf_h = tsv2vcf.h $(htslib_vcf_h)
filter_h = filter.h $(htslib_vcf_h)
prob1_h = prob1.h $(htslib_vcf_h) $(call_h)

main.o: main.c $(htslib_hts_h) version.h $(bcftools_h)
vcfannotate.o: vcfannotate.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(HTSDIR)/htslib/kseq.h $(bcftools_h) vcmp.h $(filter_h)
vcfcall.o: vcfcall.c $(htslib_vcf_h) $(HTSDIR)/htslib/kfunc.h $(htslib_synced_bcf_reader_h) $(bcftools_h) $(call_h) $(prob1_h)
vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(HTSDIR)/htslib/kseq.h $(bcftools_h)
vcfconvert.o: vcfconvert.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h)
vcfconvert.o: vcfconvert.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h)
vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h
vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h)
Expand All @@ -115,6 +116,7 @@ reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(HTSDIR)/htslib/kseq.h
tabix.o: tabix.c $(htslib_bgzf_h) $(htslib_tbx_h)
ccall.o: ccall.c $(HTSDIR)/htslib/kfunc.h $(call_h) kmin.h $(prob1_h)
convert.o: convert.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(convert_h)
tsv2vcf.o: tsv2vcf.c $(tsv2vcf_h)
em.o: em.c $(htslib_vcf_h) kmin.h $(call_h)
filter.o: filter.c $(HTSDIR)/htslib/khash_str2int.h $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h)
gvcf.o: gvcf.c $(call_h)
Expand Down
55 changes: 53 additions & 2 deletions bcftools.1
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 08/26/2014
.\" Date: 09/02/2014
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "BCFTOOLS" "1" "08/26/2014" "\ \&" "\ \&"
.TH "BCFTOOLS" "1" "09/02/2014" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
Expand Down Expand Up @@ -881,6 +881,27 @@ see
.nr an-break-flag 1
.br
.ps +1
\fBVCF output options:\fR
.RS 4
.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
see
\fBCommon Options\fR
.RE
.PP
\fB\-O, \-\-output\-type\fR \fIb\fR|\fIu\fR|\fIz\fR|\fIv\fR
.RS 4
see
\fBCommon Options\fR
.RE
.RE
.sp
.it 1 an-trap
.nr an-no-space-flag 1
.nr an-break-flag 1
.br
.ps +1
\fBgen/sample options:\fR
.RS 4
.PP
Expand Down Expand Up @@ -914,6 +935,35 @@ convert from VCF to gen/sample format used by IMPUTE2 and SHAPEIT\&. The columns
tag to take values for \&.gen file: GT,PL,GL,GP
.RE
.RE
.sp
.it 1 an-trap
.nr an-no-space-flag 1
.nr an-break-flag 1
.br
.ps +1
\fBtsv options:\fR
.RS 4
.PP
\fB\-\-tsv2vcf\fR \fIfile\fR
.RS 4
convert from TSV (tab\-separated values) format to VCF\&. The input file fields can be tab\- or space\- delimited
.RE
.PP
\fB\-c, \-\-columns\fR \fIlist\fR
.RS 4
comma\-separated list of fields in the input file\&. In the current version, the fields CHROM, POS, ID, and AA are expected and can appear in arbitrary order\&. The AA field lists alleles on the forward reference strand, for example "CC" or "CT" for diploid genotypes or "C" for haploid genotypes (sex chromosomes)\&. Insertions and deletions are not supported yet\&. Columns which should be ignored in the input file can be indicated by "\-"\&.
.RE
.PP
\fB\-f, \-\-ref\fR \fIfile\fR
.RS 4
reference sequence in fasta format
.RE
.PP
\fB\-s, \-\-sample\fR \fIstring\fR
.RS 4
sample name
.RE
.RE
.SS "bcftools filter \fI[OPTIONS]\fR \fIFILE\fR"
.sp
Apply fixed\-threshold filters\&.
Expand Down Expand Up @@ -1572,6 +1622,7 @@ process multiple VCFs listed in the file
[] The brackets loop over all samples
%GT Genotype (e\&.g\&. 0/1)
%TGT Translated genotype (e\&.g\&. C/A)
%IUPACGT Genotype translated to IUPAC ambiguity codes (e\&.g\&. M instead of C/A)
%LINE Prints the whole line
%SAMPLE Sample name
.fi
Expand Down
29 changes: 29 additions & 0 deletions bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,13 @@ the *-a, --allow-overlaps* option is specified.
*-T, --targets-file* 'FILE'::
see *<<common_options,Common Options>>*

==== VCF output options:

*-o, --output* 'FILE'::
see *<<common_options,Common Options>>*

*-O, --output-type* 'b'|'u'|'z'|'v'::
see *<<common_options,Common Options>>*

==== gen/sample options:
*-g, --gensample* 'prefix' or 'gen-file','sample-file'::
Expand All @@ -522,6 +529,27 @@ the *-a, --allow-overlaps* option is specified.
*--tag* 'STRING'::
tag to take values for .gen file: GT,PL,GL,GP

==== tsv options:
*--tsv2vcf* 'file'::
convert from TSV (tab-separated values) format to VCF. The
input file fields can be tab- or space- delimited

*-c, --columns* 'list'::
comma-separated list of fields in the input file. In the current
version, the fields CHROM, POS, ID, and AA are expected and
can appear in arbitrary order.
The AA field lists alleles on the forward reference strand,
for example "CC" or "CT" for diploid genotypes or "C"
for haploid genotypes (sex chromosomes). Insertions and deletions
are not supported yet.
Columns which should be ignored in the input file can be indicated by "-".

*-f, --ref* 'file'::
reference sequence in fasta format

*-s, --sample* 'string'::
sample name


[[filter]]
=== bcftools filter '[OPTIONS]' 'FILE'
Expand Down Expand Up @@ -915,6 +943,7 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format.
[] The brackets loop over all samples
%GT Genotype (e.g. 0/1)
%TGT Translated genotype (e.g. C/A)
%IUPACGT Genotype translated to IUPAC ambiguity codes (e.g. M instead of C/A)
%LINE Prints the whole line
%SAMPLE Sample name

Expand Down
104 changes: 98 additions & 6 deletions convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ THE SOFTWARE. */
#define T_GT_TO_PROB3 19 // not publicly advertised
#define T_PL_TO_PROB3 20 // not publicly advertised
#define T_FIRST_ALT 21 // not publicly advertised
#define T_IUPAC_GT 22

typedef struct _fmt_t
{
Expand Down Expand Up @@ -156,9 +157,15 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp

if ( info->len == 1 )
{
if ( info->type == BCF_BT_FLOAT ) ksprintf(str, "%g", info->v1.f);
else if ( info->type != BCF_BT_CHAR ) kputw(info->v1.i, str);
else kputc(info->v1.i, str);
switch (info->type)
{
case BCF_BT_INT8: if ( info->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_INT16: if ( info->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break;
case BCF_BT_CHAR: kputc(info->v1.i, str); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
}
else if ( fmt->subscript >=0 )
{
Expand All @@ -167,9 +174,20 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
kputc('.', str);
return;
}
if ( info->type == BCF_BT_FLOAT ) ksprintf(str, "%g", ((float*)info->vptr)[fmt->subscript]);
else if ( info->type != BCF_BT_CHAR ) kputw(bcf_array_ivalue(info->vptr,info->type,fmt->subscript), str);
else error("TODO: %s:%d .. info->type=%d\n", __FILE__,__LINE__, info->type);
#define BRANCH(type_t, is_missing, is_vector_end, kprint) { \
type_t val = ((type_t *) info->vptr)[fmt->subscript]; \
if ( is_missing || is_vector_end ) kputc('.',str); \
else kprint; \
}
switch (info->type)
{
case BCF_BT_INT8: BRANCH(int8_t, val==bcf_int8_missing, val==bcf_int8_vector_end, kputw(val, str)); break;
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
}
else
bcf_fmt_array(str, info->len, info->type, info->vptr);
Expand Down Expand Up @@ -248,6 +266,78 @@ static void process_tgt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isampl
}
if (l == 0) kputc('.', str);
}
static void init_format_iupac(convert_t *convert, bcf1_t *line, fmt_t *fmt)
{
init_format(convert, line, fmt);
if ( fmt->fmt==NULL ) return;

// Init mapping between alleles and IUPAC table
hts_expand(uint8_t, line->n_allele, convert->ndat, convert->dat);
int8_t *dat = (int8_t*)convert->dat;
int i;
for (i=0; i<line->n_allele; i++)
{
if ( line->d.allele[i][1] ) dat[i] = -1;
else
{
switch (line->d.allele[i][0])
{
case 'A': dat[i] = 0; break;
case 'C': dat[i] = 1; break;
case 'G': dat[i] = 2; break;
case 'T': dat[i] = 3; break;
case 'a': dat[i] = 0; break;
case 'c': dat[i] = 1; break;
case 'g': dat[i] = 2; break;
case 't': dat[i] = 3; break;
default: dat[i] = -1;
}
}
}
}
static void process_iupac_gt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( !fmt->ready )
init_format_iupac(convert, line, fmt);

if ( fmt->fmt==NULL )
{
kputc('.', str);
return;
}

assert( fmt->fmt->type==BCF_BT_INT8 );

static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} };
int8_t *dat = (int8_t*)convert->dat;

int8_t *x = (int8_t*)(fmt->fmt->p + isample*fmt->fmt->size); // FIXME: does not work with n_alt >= 64
int l = 0;
while ( l<fmt->fmt->n && x[l]!=bcf_int8_vector_end && x[l]!=bcf_int8_missing ) l++;

if ( l==2 )
{
// diploid
int ia = (x[0]>>1) - 1, ib = (x[1]>>1) - 1;
if ( ia>=0 && ia<line->n_allele && ib>=0 && ib<line->n_allele && dat[ia]>=0 && dat[ib]>=0 )
{
kputc(iupac[dat[ia]][dat[ib]], str);
return;
}
}
for (l = 0; l < fmt->fmt->n && x[l] != bcf_int8_vector_end; ++l)
{
if (l) kputc("/|"[x[l]&1], str);
if (x[l]>>1)
{
int ial = (x[l]>>1) - 1;
kputs(line->d.allele[ial], str);
}
else
kputc('.', str);
}
if (l == 0) kputc('.', str);
}
static void process_sample(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
kputs(convert->header->samples[isample], str);
Expand Down Expand Up @@ -447,6 +537,7 @@ static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
case T_MASK: fmt->handler = NULL; break;
case T_GT: fmt->handler = &process_gt; convert->max_unpack |= BCF_UN_FMT; break;
case T_TGT: fmt->handler = &process_tgt; convert->max_unpack |= BCF_UN_FMT; break;
case T_IUPAC_GT: fmt->handler = &process_iupac_gt; convert->max_unpack |= BCF_UN_FMT; break;
case T_LINE: fmt->handler = &process_line; break;
default: error("TODO: handler for type %d\n", fmt->type);
}
Expand Down Expand Up @@ -485,6 +576,7 @@ static char *parse_tag(convert_t *convert, char *p, int is_gtf)
if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf);
else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf);
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf);
else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf);
else
{
fmt_t *fmt = register_tag(convert, T_FORMAT, str.s, is_gtf);
Expand Down
10 changes: 6 additions & 4 deletions filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -985,8 +985,9 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
while ( b<bend && *b ) b++;
if ( a-astr != b-bstr ) atok->pass_samples[i] = 0;
else atok->pass_samples[i] = strncmp(astr,bstr,a-astr)==0 ? 1 : 0;
if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
if ( logic!=TOK_EQ )
atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
pass_site |= atok->pass_samples[i];
}
if ( !atok->nsamples ) atok->nsamples = btok->nsamples;
}
Expand Down Expand Up @@ -1035,8 +1036,9 @@ static int cmp_vector_strings(token_t *atok, token_t *btok, int logic) // log
while ( y<yend && *y ) y++;
if ( x-xstr != y-ystr ) atok->pass_samples[i] = 0;
else atok->pass_samples[i] = strncmp(xstr,ystr,x-xstr)==0 ? 1 : 0;
if ( logic!=TOK_EQ ) pass_site = pass_site ? 0 : 1;
if ( !pass_site && atok->pass_samples[i] ) pass_site = 1;
if ( logic!=TOK_EQ )
atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
pass_site |= atok->pass_samples[i];
}
if ( !atok->nsamples )
atok->nvalues = atok->nsamples = btok->nsamples; // is it a bug? not sure if atok->nvalues should be set
Expand Down
1 change: 1 addition & 0 deletions test/check.chk
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# SN, Summary numbers:
# SN [2]id [3]key [4]value
SN 0 number of samples: 2
SN 0 number of records: 18
SN 0 number of SNPs: 5
SN 0 number of MNPs: 1
SN 0 number of indels: 9
Expand Down
2 changes: 0 additions & 2 deletions test/mpileup.1.out
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
##INFO=<ID=MQSB,Number=1,Type=Float,Description="Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)">
##INFO=<ID=SGB,Number=1,Type=Float,Description="Segregation based metric.">
##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
##INFO=<ID=I16,Number=16,Type=Float,Description="Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h">
##INFO=<ID=QS,Number=R,Type=Float,Description="Auxiliary tag used for calling">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of high-quality non-reference bases">
Expand Down
2 changes: 0 additions & 2 deletions test/mpileup.2.out
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
##INFO=<ID=MQSB,Number=1,Type=Float,Description="Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)">
##INFO=<ID=SGB,Number=1,Type=Float,Description="Segregation based metric.">
##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
##INFO=<ID=I16,Number=16,Type=Float,Description="Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h">
##INFO=<ID=QS,Number=R,Type=Float,Description="Auxiliary tag used for calling">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of high-quality non-reference bases">
Expand Down
2 changes: 0 additions & 2 deletions test/mpileup.cAls.out
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
##INFO=<ID=MQSB,Number=1,Type=Float,Description="Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)">
##INFO=<ID=SGB,Number=1,Type=Float,Description="Segregation based metric.">
##INFO=<ID=MQ0F,Number=1,Type=Float,Description="Fraction of MQ0 reads (smaller is better)">
##INFO=<ID=I16,Number=16,Type=Float,Description="Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h">
##INFO=<ID=QS,Number=R,Type=Float,Description="Auxiliary tag used for calling">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of high-quality bases">
##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of high-quality non-reference bases">
Expand Down
Loading

0 comments on commit 34f212f

Please sign in to comment.