Skip to content

Commit

Permalink
stats,call: Use the new --sample convention
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Feb 13, 2014
1 parent 914b1b7 commit e050878
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 112 deletions.
23 changes: 12 additions & 11 deletions bcftools.1
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 02/12/2014
.\" Date: 02/13/2014
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "BCFTOOLS" "1" "02/12/2014" "\ \&" "\ \&"
.TH "BCFTOOLS" "1" "02/13/2014" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
Expand Down Expand Up @@ -279,9 +279,12 @@ Output compressed BCF (\fIb\fR), uncompressed BCF (\fIu\fR), compressed VCF (\fI
Regions can be specified either on command line or in a VCF, BED, or tab\-delimited file (the default)\&. The columns of the tab\-delimited file are: CHROM, POS, and, optionally, POS_TO, where positions are 1\-based and inclusive\&. Uncompressed files are stored in memory, while bgzip\-compressed and tabix\-indexed region files are streamed\&. Note that sequence names must match exactly, "chr20" is not the same as "20"\&. This option requires indexed VCF/BCF files\&.
.RE
.PP
\fB\-s, \-\-samples\fR \fIFILE\fR|\fILIST\fR
\fB\-s, \-\-samples\fR \fILIST\fR|\fI:FILE\fR
.RS 4
Comma\-separated list of samples to include or a file with one sample per line\&. The command
Comma\-separated list of samples to include or, when prefixed with colon
\fI:\fR, read sample names from
\fIFILE\fR
with one sample per line\&. The command
\fBbcftools call\fR
accepts an optional second column indicating ploidy (0, 1 or 2) and can parse also PED files\&. With
\fBbcftools call\fR\fB \-C\fR
Expand Down Expand Up @@ -966,9 +969,8 @@ see
.PP
\fB\-s, \-\-samples\fR \fILIST\fR|\fI:FILE\fR
.RS 4
comma\-separated list of samples or, when prefixed with colon
\fI:\fR, read sample names from
\fIFILE\fR
see
\fBCommon Options\fR
.RE
.PP
\fB\-t, \-\-targets\fR \fIfile\fR|\fIchr\fR|\fIchr:pos\fR|\fIchr:from\-to\fR|\fIchr:from\-\fR[,\&...]
Expand Down Expand Up @@ -1135,9 +1137,8 @@ see
.PP
\fB\-s, \-\-samples\fR \fILIST\fR|\fI:FILE\fR
.RS 4
comma\-separated list of samples or, when prefixed with colon
\fI:\fR, read sample names from
\fIFILE\fR
see
\fBCommon Options\fR
.RE
.PP
\fB\-t, \-\-targets\fR \fIfile\fR|\fIchr\fR|\fIchr:pos\fR|\fIchr:from\-to\fR|\fIchr:from\-\fR[,\&...]
Expand Down Expand Up @@ -1232,7 +1233,7 @@ see
\fBCommon Options\fR
.RE
.PP
\fB\-s, \-\-samples\fR \fIFILE\fR|\fILIST\fR
\fB\-s, \-\-samples\fR \fILIST\fR|\fI:FILE\fR
.RS 4
see
\fBCommon Options\fR
Expand Down
15 changes: 7 additions & 8 deletions bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,10 @@ OPTIONS
sequence names must match exactly, "chr20" is not the same as "20". This
option requires indexed VCF/BCF files.

*-s, --samples* 'FILE'|'LIST'::
Comma-separated list of samples to include or a file with one sample
per line. The command *<<call,bcftools call>>* accepts an optional second
*-s, --samples* 'LIST'|':FILE'::
Comma-separated list of samples to include or, when prefixed with colon
':', read sample names from 'FILE' with one sample per line.
The command *<<call,bcftools call>>* accepts an optional second
column indicating ploidy (0, 1 or 2) and can parse also PED files.
With *<<call,bcftools call>> -C* 'trio', PED file is expected.

Expand Down Expand Up @@ -528,8 +529,7 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format.
see *<<common_options,Common Options>>*

*-s, --samples* 'LIST'|':FILE'::
comma-separated list of samples or, when prefixed with colon ':', read sample
names from 'FILE'
see *<<common_options,Common Options>>*

*-t, --targets* 'file'|'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]::
see *<<common_options,Common Options>>*
Expand Down Expand Up @@ -624,8 +624,7 @@ Transition probabilities:
see *<<common_options,Common Options>>*

*-s, --samples* 'LIST'|':FILE'::
comma-separated list of samples or, when prefixed with colon ':', read sample
names from 'FILE'
see *<<common_options,Common Options>>*

*-t, --targets* 'file'|'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]::
see *<<common_options,Common Options>>*
Expand Down Expand Up @@ -682,7 +681,7 @@ the program generates separate stats for intersection and the complements.
*-r, --regions* 'file'|'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]::
see *<<common_options,Common Options>>*

*-s, --samples* 'FILE'|'LIST'::
*-s, --samples* 'LIST'|':FILE'::
see *<<common_options,Common Options>>*

*-t, --targets* 'file'|'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]::
Expand Down
160 changes: 70 additions & 90 deletions vcfcall.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include <stdarg.h>
#include <htslib/kfunc.h>
#include <htslib/synced_bcf_reader.h>
#include <sys/stat.h>
#include <ctype.h>
#include "bcftools.h"
#include "call.h"
#include "prob1.h"
Expand All @@ -20,9 +20,6 @@ void error(const char *format, ...);
#define lrand48() rand()
#endif

#include <htslib/kseq.h>
KSTREAM_INIT(gzFile, gzread, 16384)

#define CF_NO_GENO 1
// (1<<1)
#define CF_CCALL (1<<2)
Expand Down Expand Up @@ -98,40 +95,40 @@ static char **add_sample(char **sam, int *n, int *m, char *name, int ploidy, int
return sam;
}

static char **read_ped_samples(call_t *call, const char *fn, int *_n)
static char **parse_ped_samples(call_t *call, char **vals, int _n)
{
char **sam = 0;
int dret, n = 0, max = 0, i;
kstream_t *ks;
kstring_t s = {0,0,0};
gzFile fp;
fp = gzopen(fn, "r");
if ( !fp ) error("Could not read the file: %s\n", fn);
ks = ks_init(fp);
while (ks_getuntil(ks, KS_SEP_LINE, &s, &dret) >= 0)
int i, j, max = 0, n = 0;
kstring_t str = {0,0,0};
char **sam = NULL;
for (i=0; i<_n; i++)
{
char *col_ends[5], *tmp = s.s;
i = 0;
while ( *tmp && i<5 )
str.l = 0;
kputs(vals[i], &str);
char *col_ends[5], *tmp = str.s;
j = 0;
while ( *tmp && j<5 )
{
if ( *tmp=='\t' || *tmp==' ' )
{
col_ends[i] = tmp;
if ( isspace(*tmp) )
{
*tmp = 0;
i++;
++tmp;
while ( isspace(*tmp) ) tmp++; // allow multiple spaces
col_ends[j] = tmp-1;
j++;
continue;
}
tmp++;
}
if ( i!=5 ) break;
if ( j!=5 ) break;

family_t *fam = get_family(call->fams, call->nfams, s.s);
family_t *fam = get_family(call->fams, call->nfams, str.s);
if ( !fam )
{
call->nfams++;
hts_expand(family_t, call->nfams, call->mfams, call->fams);
fam = &call->fams[call->nfams-1];
fam->name = strdup(s.s);
for (i=0; i<3; i++) fam->sample[i] = -1;
fam->name = strdup(str.s);
for (j=0; j<3; j++) fam->sample[j] = -1;
}

int ploidy = 2;
Expand All @@ -141,27 +138,31 @@ static char **read_ped_samples(call_t *call, const char *fn, int *_n)
else
ploidy = call->flag & CALL_CHR_X ? 2 : 0; // female: two chrX copies, no chrY
}
sam = add_sample(sam, &n, &max, col_ends[0]+1, ploidy, &i);
if ( col_ends[2]-col_ends[1]!=2 || col_ends[1][1]!='0' ) // father
sam = add_sample(sam, &n, &max, col_ends[0]+1, ploidy, &j);
if ( strcmp(col_ends[1]+1,"0") ) // father
{
if ( fam->sample[CHILD]>=0 ) error("Multiple childs in %s\n", s.s);
fam->sample[CHILD] = i;
if ( fam->sample[FATHER]>=0 ) error("Two fathers in %s?\n", s.s);
if ( fam->sample[CHILD]>=0 ) error("Multiple childs in %s [%s,%s]\n", str.s, sam[j],sam[fam->sample[CHILD]]);
fam->sample[CHILD] = j;
if ( fam->sample[FATHER]>=0 ) error("Two fathers in %s?\n", str.s);
sam = add_sample(sam, &n, &max, col_ends[1]+1, call->flag & (CALL_CHR_X|CALL_CHR_Y) ? 1 : 2, &fam->sample[FATHER]);
}
if ( col_ends[3]-col_ends[2]!=2 || col_ends[2][1]!='0' ) // mother
if ( strcmp(col_ends[2]+1,"0") ) // mother
{
if ( fam->sample[MOTHER]>=0 ) error("Two mothers in %s?\n", s.s);
if ( fam->sample[MOTHER]>=0 ) error("Two mothers in %s?\n", str.s);
sam = add_sample(sam, &n, &max, col_ends[2]+1, call->flag & CALL_CHR_Y ? 0 : 2, &fam->sample[MOTHER]);
}
}
free(str.s);

if ( i!=_n ) // not a ped file
{
if ( i>0 ) error("Could not parse the samples, thought it was PED format, some rows have 5 columns?!\n");
return NULL;
}
assert( n==_n );
for (i=0; i<call->nfams; i++)
assert( call->fams[i].sample[0]>=0 && call->fams[i].sample[1]>=0 && call->fams[i].sample[2]>=0 ); //todo
assert( call->fams[i].sample[0]>=0 && call->fams[i].sample[1]>=0 && call->fams[i].sample[2]>=0 ); // multiple childs, not a trio

ks_destroy(ks);
gzclose(fp);
free(s.s);
*_n = n;
return sam;
}

Expand All @@ -177,64 +178,38 @@ static char **read_ped_samples(call_t *call, const char *fn, int *_n)
*/
static char **read_samples(call_t *call, const char *fn, int *_n)
{
int dret, n = 0, max = 0;
char **sam = 0;
*_n = 0;
int i, n;
char **vals = hts_readlist(fn, &n);
if ( !vals ) error("Could not read the file: %s\n", fn);

struct stat sbuf;
if ( stat(fn, &sbuf) != 0 )
char **smpls = parse_ped_samples(call, vals, n);
if ( !smpls )
{
// it is not a file, interpret as list of sample names
const char *t = fn, *p = t;
while (*t)
smpls = (char**) malloc(sizeof(char*)*n);
for (i=0; i<n; i++)
{
t++;
if ( *t==',' || !*t )
{
sam = (char**) realloc(sam, sizeof(char*)*(n+1));
sam[n] = (char*) malloc(sizeof(char)*(t-p+2));
memcpy(sam[n], p, t-p);
sam[n][t-p] = 0;
sam[n][t-p+1] = 2; // assume diploid
p = t+1;
n++;
char *s = vals[i];
while ( *s && !isspace(*s) ) s++;
int len = s-vals[i];
smpls[i] = (char*) malloc(len+2);
strncpy(smpls[i],vals[i],len);
smpls[i][len] = 0;
while ( *s && isspace(*s) ) s++;
int x = 2;
if ( *s )
{
x = (int)s[0] - '0'; // Convert ASCII digit to decimal
if (x != 0 && x != 1 && x != 2) error("Ploidy can only be 0, 1 or 2: %s\n", vals[i]);
}
smpls[i][len+1] = x;
}
*_n = n;
return sam;
}

sam = read_ped_samples(call, fn, _n);
if ( sam ) return sam;
for (i=0; i<n; i++) free(vals[i]);
free(vals);

kstream_t *ks;
kstring_t s = {0,0,0};
gzFile fp;
fp = gzopen(fn, "r");
if ( !fp ) error("Could not read the file: %s\n", fn);
ks = ks_init(fp);
while (ks_getuntil(ks, KS_SEP_SPACE, &s, &dret) >= 0)
{
hts_expand(char*,(n+1),max,sam);
int l = s.l;
sam[n] = (char*) malloc(sizeof(char)*(s.l+2));
strcpy(sam[n], s.s);
sam[n][l+1] = 2; // by default, diploid
if (dret != '\n') {
if (ks_getuntil(ks, KS_SEP_SPACE, &s, &dret) >= 0) { // read ploidy, 1 or 2
int x = (int)s.s[0] - '0'; // Convert ASCII digit to decimal
if (x == 0 || x == 1 || x == 2) sam[n][l+1] = x;
else fprintf(stderr, "(%s) ploidy can only be 0, 1 or 2; assuming diploid\n", __func__);
}
if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
}
++n;
}
ks_destroy(ks);
gzclose(fp);
free(s.s);
*_n = n;
return sam;
*_n = n;
return smpls;
}


Expand Down Expand Up @@ -262,11 +237,16 @@ static void init_data(args_t *args)
args->samples_map = (int *) malloc(sizeof(int)*args->nsamples);
args->aux.hdr = bcf_hdr_subset(args->aux.srs->readers[0].header, args->nsamples, args->samples, args->samples_map);
for (i=0; i<args->nsamples; i++)
if ( args->samples_map[i]<0 ) fprintf(stderr,"Warning: no such sample: \"%s\"\n", args->samples[i]);
if ( args->samples_map[i]<0 ) error("No such sample \"%s\", please prefix with ':' to indicate file name\n", args->samples[i]);
if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
}
else
{
args->aux.hdr = bcf_hdr_dup(args->aux.srs->readers[0].header);
for (i=0; i<args->nsamples; i++)
if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
error("No such sample \"%s\", please prefix with ':' to indicate file name\n", args->samples[i]);
}

// Reorder ploidy and family indexes to match mpileup's output and exclude samples which are not available
if ( args->aux.ploidy )
Expand All @@ -277,7 +257,7 @@ static void init_data(args_t *args)
for (j=0; j<3; j++)
{
int k = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[ args->aux.fams[i].sample[j] ]);
if ( k<0 ) error("No such sample: %s\n", args->samples[ args->aux.fams[i].sample[j] ]);
if ( k<0 ) error("No such sample \"%s\", please prefix with ':' to indicate file name\n", args->samples[ args->aux.fams[i].sample[j] ]);
args->aux.fams[i].sample[j] = k;
}
}
Expand Down Expand Up @@ -366,7 +346,7 @@ static void usage(args_t *args)
fprintf(stderr, "File format options:\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n");
fprintf(stderr, " -r, --regions <reg|file> restrict to comma-separated list of regions or regions listed in a file, see man page for details\n");
fprintf(stderr, " -s, --samples <list|file> sample list, PED file or a file with optional second column for ploidy (0, 1 or 2) [all samples]\n");
fprintf(stderr, " -s, --samples <list|:file> sample list, PED file or a file with optional second column for ploidy (0, 1 or 2) [all samples]\n");
fprintf(stderr, " -t, --targets <reg|file> similar to -r but streams rather than index-jumps, see man page for details\n");
fprintf(stderr, "\nInput/output options:\n");
fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
Expand Down
2 changes: 1 addition & 1 deletion vcfquery.c
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ static void usage(void)
fprintf(stderr, " -l, --list-samples print the list of samples and exit \n");
fprintf(stderr, " -r, --regions <reg|file> restrict to comma-separated list of regions or regions listed in a file, see man page for details\n");
fprintf(stderr, " -t, --targets <reg|file> similar to -r but streams rather than index-jumps, see man page for details\n");
fprintf(stderr, " -s, --samples <list|file> samples to include: comma-separated list or one name per line in a file\n");
fprintf(stderr, " -s, --samples <list|:file> comma-separated list of samples to include or one name per line in a file\n");
fprintf(stderr, " -v, --vcf-list <file> process multiple VCFs listed in the file\n");
fprintf(stderr, "Expressions:\n");
fprintf(stderr, "\t%%CHROM The CHROM column (similarly also other columns, such as POS, ID, QUAL, etc.)\n");
Expand Down
10 changes: 8 additions & 2 deletions vcfstats.c
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,13 @@ static void init_stats(args_t *args)

if ( args->samples_fname )
{
if ( !bcf_sr_set_samples(args->files,args->samples_fname) ) error("Could not initialize samples: %s\n", args->samples_fname);
if ( !bcf_sr_set_samples(args->files,args->samples_fname) )
{
if ( args->samples_fname[0]!=':' )
error("No such sample(s), please prefix with ':' to indicate file name: \"%s\"\n", args->samples_fname);
else
error("Unable to parse the file: \"%s\"\n", args->samples_fname);
}
args->af_gts_snps = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
Expand Down Expand Up @@ -1218,7 +1224,7 @@ static void usage(void)
fprintf(stderr, " -F, --fasta-ref <file> faidx indexed reference sequence file to determine INDEL context\n");
fprintf(stderr, " -i, --split-by-ID collect stats for sites with ID separately (known vs novel)\n");
fprintf(stderr, " -r, --regions <reg|file> restrict to comma-separated list of regions or regions listed in a file, see man page for details\n");
fprintf(stderr, " -s, --samples <list|file> produce sample stats, \"-\" to include all samples\n");
fprintf(stderr, " -s, --samples <list|:file> produce sample stats, \"-\" to include all samples\n");
fprintf(stderr, " -t, --targets <reg|file> similar to -r but streams rather than index-jumps, see man page for details\n");
fprintf(stderr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1,100]\n");
fprintf(stderr, "\n");
Expand Down

0 comments on commit e050878

Please sign in to comment.