Skip to content

Commit

Permalink
New concat --naive option for fast operations on large BCFs
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Nov 20, 2015
1 parent 87a55fa commit f0aff23
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 6 deletions.
12 changes: 11 additions & 1 deletion doc/bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,10 @@ columns appearing in the same order. Can be used, for example, to
concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel
VCF into one. The input files must be sorted by chr and position. The files
must be given in the correct order to produce sorted VCF on output unless
the *-a, --allow-overlaps* option is specified.
the *-a, --allow-overlaps* option is specified. With the --naive option, the files
are concatenated without being recompressed, which is very fast but dangerous
if the BCF headers differ.


*-a, --allow-overlaps*::
First coordinate of the next file can precede last record of the current file.
Expand All @@ -626,6 +629,13 @@ the *-a, --allow-overlaps* option is specified.
*-l, --ligate*::
Ligate phased VCFs by matching phase at overlapping haplotypes

*-n, --naive*::
Concatenate BCF files without recompression. This is very fast but requires
that all files have the same headers. This is because all tags and
chromosome names in the BCF body rely on the implicit order of the contig
and tag definitions in the header. Currently no sanity checks
are in place and only works for compressed BCF files. Dangerous, use with caution.

*-o, --output* 'FILE'::
see *<<common_options,Common Options>>*

Expand Down
136 changes: 131 additions & 5 deletions vcfconcat.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ THE SOFTWARE. */
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
#include <htslib/bgzf.h>
#include "bcftools.h"

typedef struct _args_t
Expand All @@ -50,7 +51,7 @@ typedef struct _args_t

char **argv, *output_fname, *file_list, **fnames, *remove_dups, *regions_list;
int argc, nfnames, allow_overlaps, phased_concat, regions_is_file;
int compact_PS, phase_set_changed;
int compact_PS, phase_set_changed, naive_concat;
}
args_t;

Expand Down Expand Up @@ -175,8 +176,11 @@ static void destroy_data(args_t *args)
for (i=0; i<args->nfnames; i++) free(args->fnames[i]);
free(args->fnames);
if ( args->files ) bcf_sr_destroy(args->files);
if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
bcf_hdr_destroy(args->out_hdr);
if ( args->out_fh )
{
if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n");
}
if ( args->out_hdr ) bcf_hdr_destroy(args->out_hdr);
free(args->seen_seq);
free(args->start_pos);
free(args->swap_phase);
Expand Down Expand Up @@ -549,6 +553,114 @@ static void concat(args_t *args)
}
}

BGZF *hts_get_bgzfp(htsFile *fp);
static void naive_concat(args_t *args)
{
// only compressed BCF atm

BGZF *bgzf_out;
if ( strcmp("-",args->output_fname) )
bgzf_out = bgzf_open(args->output_fname,"w");
else
bgzf_out = bgzf_dopen(fileno(stdout), "w");

int page_size = getpagesize();
char *buf = (char*) valloc(page_size);
kstring_t tmp = {0,0,0};
int i;
for (i=0; i<args->nfnames; i++)
{
htsFile *hts_fp = hts_open(args->fnames[i],"r");
if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
htsFormat type = *hts_get_format(hts_fp);

if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");

BGZF *fp = hts_get_bgzfp(hts_fp);
if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));

uint8_t magic[5];
if ( bgzf_read(fp, magic, 5)<0 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);

bgzf_read(fp, &tmp.l, 4);
hts_expand(char,tmp.l,tmp.m,tmp.s);
bgzf_read(fp, tmp.s, tmp.l);

// write only the first header
if ( i==0 )
{
if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname);
}

// Output all non-header data that were read together with the header block
int nskip = fp->block_offset;
if ( fp->block_length - nskip > 0 )
{
if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
}
if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);


// Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
ssize_t nread, ncached = 0, nwr;
const int neof = 28;
char cached[neof];
while (1)
{
nread = bgzf_raw_read(fp, buf, page_size);

// page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
if ( nread<=0 ) break;
if ( nread<=neof ) // last block
{
if ( ncached )
{
// flush the part of the cache that won't be needed
nwr = bgzf_raw_write(bgzf_out, cached, nread);
if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);

// make space in the cache so that we can append to the end
if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
}

// fill the cache and check for eof outside this loop
memcpy(cached+neof-nread,buf,nread);
break;
}

// not the last block, flush the cache if full
if ( ncached )
{
nwr = bgzf_raw_write(bgzf_out, cached, ncached);
if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached);
ncached = 0;
}

// fill the cache
nread -= neof;
memcpy(cached,buf+nread,neof);
ncached = neof;

nwr = bgzf_raw_write(bgzf_out, buf, nread);
if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
}
if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
{
nwr = bgzf_raw_write(bgzf_out, cached, neof);
if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof);
}
if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
}
free(buf);
free(tmp.s);
if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
}

static void usage(args_t *args)
{
fprintf(stderr, "\n");
Expand All @@ -557,7 +669,9 @@ static void usage(args_t *args)
fprintf(stderr, " concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel\n");
fprintf(stderr, " VCF into one. The input files must be sorted by chr and position. The files\n");
fprintf(stderr, " must be given in the correct order to produce sorted VCF on output unless\n");
fprintf(stderr, " the -a, --allow-overlaps option is specified.\n");
fprintf(stderr, " the -a, --allow-overlaps option is specified. With the --naive option, the files\n");
fprintf(stderr, " are concatenated without being recompressed, which is very fast but dangerous\n");
fprintf(stderr, " if the BCF headers differ.\n");
fprintf(stderr, "Usage: bcftools concat [options] <A.vcf.gz> [<B.vcf.gz> [...]]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
Expand All @@ -567,6 +681,7 @@ static void usage(args_t *args)
fprintf(stderr, " -D, --remove-duplicates Alias for -d none\n");
fprintf(stderr, " -f, --file-list <file> Read the list of files from a file.\n");
fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
fprintf(stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n");
fprintf(stderr, " -o, --output <file> Write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
Expand All @@ -587,6 +702,7 @@ int main_vcfconcat(int argc, char *argv[])

static struct option loptions[] =
{
{"naive",0,0,'n'},
{"compact-PS",0,0,'c'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
Expand All @@ -601,7 +717,7 @@ int main_vcfconcat(int argc, char *argv[])
{0,0,0,0}
};
char *tmp;
while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:c",loptions,NULL)) >= 0)
while ((c = getopt_long(argc, argv, "h:?o:O:f:alq:Dd:r:R:cn",loptions,NULL)) >= 0)
{
switch (c) {
case 'c': args->compact_PS = 1; break;
Expand All @@ -613,6 +729,7 @@ int main_vcfconcat(int argc, char *argv[])
args->min_PQ = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
break;
case 'n': args->naive_concat = 1; break;
case 'a': args->allow_overlaps = 1; break;
case 'l': args->phased_concat = 1; break;
case 'f': args->file_list = optarg; break;
Expand Down Expand Up @@ -649,6 +766,15 @@ int main_vcfconcat(int argc, char *argv[])
if ( !args->nfnames ) usage(args);
if ( args->remove_dups && !args->allow_overlaps ) error("The -D option is supported only with -a\n");
if ( args->regions_list && !args->allow_overlaps ) error("The -r/-R option is supported only with -a\n");
if ( args->naive_concat )
{
if ( args->allow_overlaps ) error("The option --naive cannot be combined with --allow-overlaps\n");
if ( args->phased_concat ) error("The option --naive cannot be combined with --ligate\n");
naive_concat(args);
destroy_data(args);
free(args);
return 0;
}
init_data(args);
concat(args);
destroy_data(args);
Expand Down

0 comments on commit f0aff23

Please sign in to comment.