Skip to content

Commit

Permalink
New -c/--chain option in bcftools-consensus
Browse files Browse the repository at this point in the history
This option saves the transformation of the reference to the
consensus sequence in a chain file that can be used with the
liftOver tool.
  • Loading branch information
jherrero authored and pd3 committed Dec 15, 2014
1 parent de6bbe9 commit 6a9449c
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 11 deletions.
180 changes: 170 additions & 10 deletions consensus.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,27 @@
#include "bcftools.h"
#include "rbuf.h"

typedef struct
{
int num; // number of ungapped blocks in this chain
int *block_lengths; // length of the ungapped blocks in this chain
int *ref_gaps; // length of the gaps on the reference sequence between blocks
int *alt_gaps; // length of the gaps on the alternative sequence between blocks
int ori_pos;
int ref_last_block_ori; // start position on the reference sequence of the following ungapped block (0-based)
int alt_last_block_ori; // start position on the alternative sequence of the following ungapped block (0-based)
}
chain_t;


typedef struct
{
kstring_t fa_buf; // buffered reference sequence
int fa_ori_pos; // start position of the fa_buffer (wrt original sequence)
int fa_frz_pos; // protected position to avoid conflicting variants (last pos for SNPs/ins)
int fa_mod_off; // position difference of fa_frz_pos in the ori and modified sequence (ins positive)
int fa_end_pos; // region's end position in the original sequence
int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header)
int fa_case; // output upper case or lower case?

rbuf_t vcf_rbuf;
Expand All @@ -52,15 +66,118 @@ typedef struct

regidx_t *mask;

int chain_id; // chain_id, to provide a unique ID to each chain in the chain output
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
// Note that the chain is re-initialised for each chromosome/seq_region

bcf_srs_t *files;
bcf_hdr_t *hdr;
FILE *fp_out;
FILE *fp_chain;
char **argv;
int argc, output_iupac, haplotype, isample;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
}
args_t;

static chain_t* init_chain(chain_t *chain, int ref_ori_pos)
{
// fprintf(stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos);
chain = (chain_t*) calloc(1,sizeof(chain_t));
chain->num = 0;
chain->block_lengths = NULL;
chain->ref_gaps = NULL;
chain->alt_gaps = NULL;
chain->ori_pos = ref_ori_pos;
chain->ref_last_block_ori = ref_ori_pos;
chain->alt_last_block_ori = ref_ori_pos;
return chain;
}

static void destroy_chain(args_t *args)
{
chain_t *chain = args->chain;
free(chain->ref_gaps);
free(chain->alt_gaps);
free(chain->block_lengths);
free(chain);
chain = NULL;
}

static void print_chain(args_t *args)
{
/*
Example chain format (see: https://genome.ucsc.edu/goldenPath/help/chain.html):
chain 1 500 + 480 500 1 501 + 480 501 1
12 3 1
1 0 3
484
chain line is:
- chain
- score (sum of the length of ungapped block in this case)
- ref_seqname (from the fasta header, parsed by htslib)
- ref_seqlength (from the fasta header)
- ref_strand (+ or -; always + for bcf-consensus)
- ref_start (as defined in the fasta header)
- ref_end (as defined in the fasta header)
- alt_seqname (same as ref_seqname as bcf-consensus only considers SNPs and indels)
- alt_seqlength (adjusted to match the length of the alt sequence)
- alt_strand (+ or -; always + for bcf-consensus)
- alt_start (same as ref_start, as no edits are recorded/applied before that position)
- alt_end (adjusted to match the length of the alt sequence)
- chain_num (just an auto-increment id)
the other (sorted) lines are:
- length of the ungapped alignment block
- gap on the ref sequence between this and the next block (all but the last line)
- gap on the alt sequence between this and the next block (all but the last line)
*/
chain_t *chain = args->chain;
int n = chain->num;
int ref_end_pos = args->fa_length + chain->ori_pos;
int last_block_size = ref_end_pos - chain->ref_last_block_ori;
int alt_end_pos = chain->alt_last_block_ori + last_block_size;
int score = 0;
for (n=0; n<chain->num; n++) {
score += chain->block_lengths[n];
}
score += last_block_size;
fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
for (n=0; n<chain->num; n++) {
fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
}
fprintf(args->fp_chain, "%d\n\n", last_block_size);
}

static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len)
{
// fprintf(stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
int num = chain->num;

if (ref_start <= chain->ref_last_block_ori) {
// In case this variant is back-to-back with the previous one
chain->ref_last_block_ori = ref_start + ref_len;
chain->alt_last_block_ori = alt_start + alt_len;
chain->ref_gaps[num-1] += ref_len;
chain->alt_gaps[num-1] += alt_len;

} else {
// Extend the ungapped blocks, store the gap length
chain->block_lengths = realloc(chain->block_lengths, (num + 1) * sizeof(int));
chain->ref_gaps = realloc(chain->ref_gaps, (num + 1) * sizeof(int));
chain->alt_gaps = realloc(chain->alt_gaps, (num + 1) * sizeof(int));
chain->block_lengths[num] = ref_start - chain->ref_last_block_ori;
chain->ref_gaps[num] = ref_len;
chain->alt_gaps[num] = alt_len;
// Update the start positions of the next block
chain->ref_last_block_ori = ref_start + ref_len;
chain->alt_last_block_ori = alt_start + alt_len;
// Increment the number of ungapped blocks
chain->num++;
}
}

static void init_data(args_t *args)
{
args->files = bcf_sr_init();
Expand All @@ -83,6 +200,12 @@ static void init_data(args_t *args)
args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
if ( !args->mask ) error("Failed to initialize mask regions\n");
}
// In case we want to store the chains
if ( args->chain_fname )
{
args->fp_chain = fopen(args->chain_fname,"w");
args->chain_id = 0;
}
rbuf_init(&args->vcf_rbuf, 100);
args->vcf_buf = (bcf1_t**) calloc(args->vcf_rbuf.m, sizeof(bcf1_t*));
args->fp_out = args->output_fname ? fopen(args->output_fname,"w") : stdout;
Expand All @@ -97,6 +220,8 @@ static void destroy_data(args_t *args)
free(args->vcf_buf);
free(args->fa_buf.s);
if ( args->mask ) regidx_destroy(args->mask);
if ( args->chain_fname )
if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
}

Expand All @@ -117,13 +242,14 @@ static void init_region(args_t *args, char *line)
from--;
ss = ++se;
to = strtol(ss,&se,10);
if ( ss==se || (*se && !isspace(*se)) ) { from = 0; to = 0x7fffffff; }
if ( ss==se || (*se && !isspace(*se)) ) { from = 0; to = 0; }
else to--;
}
}
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) error("Sequence \"%s\" not in %s\n", line,args->fname);
// if ( args->rid<0 ) error("Sequence \"%s\" not in %s\n", line,args->fname);
args->fa_buf.l = 0;
args->fa_length = 0;
args->fa_end_pos = to;
args->fa_ori_pos = from;
args->fa_mod_off = 0;
Expand All @@ -133,6 +259,12 @@ static void init_region(args_t *args, char *line)
bcf_sr_seek(args->files,line,args->fa_ori_pos);
if ( tmp_ptr ) *tmp_ptr = tmp;
fprintf(args->fp_out,">%s\n",line);
if (args->chain_fname )
{
args->chain = init_chain(args->chain, args->fa_ori_pos);
} else {
args->chain = NULL;
}
}

static bcf1_t **next_vcf_line(args_t *args)
Expand Down Expand Up @@ -279,15 +411,15 @@ static void apply_variant(args_t *args, bcf1_t *rec)
error(
"The fasta sequence does not match the REF allele at %s:%d:\n"
" .vcf: [%s]\n"
" .vcf: [%s] <- (ALT)\n"
" .fa: [%s]%c%s\n",
bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], args->fa_buf.s+idx,
bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
);
}
if ( !strcasecmp(rec->d.allele[ialt], "<DEL>") ) {
rec->d.allele[ialt] = "\0";
}

int alen = strlen(rec->d.allele[ialt]);
if ( args->fa_case )
for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
Expand All @@ -311,10 +443,26 @@ static void apply_variant(args_t *args, bcf1_t *rec)
for (i=0; i<alen; i++)
args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
}
if (args->chain && len_diff != 0)
{
// If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
{
// ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
}
else
{
// otherwise, just the coordinates of the variant as given
push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
}
}
args->fa_buf.l += len_diff;
args->fa_mod_off += len_diff;
args->fa_frz_pos = rec->pos + rec->rlen - 1;
}


static void mask_region(args_t *args, char *seq, int len)
{
char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
Expand Down Expand Up @@ -348,29 +496,34 @@ static void consensus(args_t *args)
{
while ( args->vcf_rbuf.n )
{
if (args->chain) {
print_chain(args);
destroy_chain(args);
}
bcf1_t *rec = args->vcf_buf[args->vcf_rbuf.f];
if ( rec->rid!=args->rid || rec->pos > args->fa_end_pos ) break;
if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) ) break;
int i = rbuf_shift(&args->vcf_rbuf);
apply_variant(args, args->vcf_buf[i]);
}
flush_fa_buffer(args, 0);
init_region(args, str.s+1);
continue;
}
args->fa_length += str.l;

// determine if uppercase or lowercase is used in this fasta file
if ( args->fa_case==-1 ) args->fa_case = toupper(str.s[0])==str.s[0] ? 1 : 0;

if ( args->mask ) mask_region(args, str.s, str.l);
if ( args->mask && args->rid>=0) mask_region(args, str.s, str.l);
kputs(str.s, &args->fa_buf);

bcf1_t **rec_ptr = NULL;
while ( (rec_ptr = next_vcf_line(args)) )
while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
{
bcf1_t *rec = *rec_ptr;

// still the same chr and the same region? if not, fasta buf can be flushed
if ( rec->rid!=args->rid || rec->pos > args->fa_end_pos )
if ( rec->rid!=args->rid || ( args->fa_end_pos && rec->pos > args->fa_end_pos ) )
{
// save the vcf record until next time and flush
unread_vcf_line(args, rec_ptr);
Expand All @@ -396,6 +549,10 @@ static void consensus(args_t *args)
}
if ( !rec_ptr ) flush_fa_buffer(args, 60);
}
if (args->chain) {
print_chain(args);
destroy_chain(args);
}
flush_fa_buffer(args, 0);
hts_close(fasta);
free(str.s);
Expand All @@ -413,6 +570,7 @@ static void usage(args_t *args)
fprintf(stderr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
fprintf(stderr, " -m, --mask <file> replace regions with N\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
fprintf(stderr, " -s, --sample <name> apply variants of the given sample\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
Expand All @@ -435,10 +593,11 @@ int main_consensus(int argc, char *argv[])
{"output",1,0,'o'},
{"fasta-ref",1,0,'f'},
{"mask",1,0,'m'},
{"chain",1,0,'c'},
{0,0,0,0}
};
char c;
while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:",loptions,NULL)) >= 0)
while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0)
{
switch (c)
{
Expand All @@ -447,6 +606,7 @@ int main_consensus(int argc, char *argv[])
case 'i': args->output_iupac = 1; break;
case 'f': args->ref_fname = optarg; break;
case 'm': args->mask_fname = optarg; break;
case 'c': args->chain_fname = optarg; break;
case 'H':
args->haplotype = optarg[0] - '0';
if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
Expand Down
11 changes: 11 additions & 0 deletions test/consensus.1.chain
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
chain 497 1 501 + 1 501 1 502 + 1 502 1
11 3 1
1 0 3
485

chain 495 2 501 + 0 501 2 505 + 0 505 2
61 3 1
54 3 1
58 0 8
322

8 changes: 8 additions & 0 deletions test/consensus.2.chain
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
chain 500 1 501 + 1 501 1 504 + 1 504 1
15 0 3
485

chain 490 2 501 + 0 501 2 490 + 0 490 2
199 11 0
291

11 changes: 11 additions & 0 deletions test/consensus.3.chain
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
chain 497 1 501 + 1 501 1 502 + 1 502 1
11 3 1
1 0 3
485

chain 495 2 501 + 0 501 2 505 + 0 505 2
61 3 1
54 3 1
58 0 8
322

8 changes: 8 additions & 0 deletions test/consensus.4.chain
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
chain 500 1 501 + 1 501 1 504 + 1 504 1
15 0 3
485

chain 490 2 501 + 0 501 2 490 + 0 490 2
199 11 0
291

15 changes: 14 additions & 1 deletion test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,13 @@
test_vcf_convert($opts,in=>'convert',out=>'convert.hls.samples',args=>'-h .,.,-');
test_vcf_convert_tsv2vcf($opts,in=>'convert.23andme',out=>'convert.23andme.vcf',args=>'-c ID,CHROM,POS,AA -s SAMPLE1',fai=>'23andme');
test_vcf_consensus($opts,in=>'consensus',out=>'consensus.1.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'');
test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.1.chain',chain=>'consensus.1.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'');
test_vcf_consensus($opts,in=>'consensus',out=>'consensus.2.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1');
test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.2.chain',chain=>'consensus.2.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1');
test_vcf_consensus($opts,in=>'consensus',out=>'consensus.3.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-i');
test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.3.chain',chain=>'consensus.3.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-i');
test_vcf_consensus($opts,in=>'consensus',out=>'consensus.4.out',fa=>'consensus.fa',args=>'-H 1');
test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.4.chain',chain=>'consensus.4.chain',fa=>'consensus.fa',args=>'-H 1');

print "\nNumber of tests:\n";
printf " total .. %d\n", $$opts{nok}+$$opts{nfailed};
Expand Down Expand Up @@ -729,6 +733,15 @@ sub test_vcf_consensus
my ($opts,%args) = @_;
bgzip_tabix_vcf($opts,$args{in});
my $mask = $args{mask} ? "-m $$opts{path}/$args{mask}" : '';
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.vcf.gz -f $$opts{path}/$args{fa} $args{args} $mask");
my $chain = $args{chain} ? "-c $$opts{path}/$args{chain}" : '';
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.vcf.gz -f $$opts{path}/$args{fa} $args{args} $mask $chain");
}
sub test_vcf_consensus_chain
{
my ($opts,%args) = @_;
bgzip_tabix_vcf($opts,$args{in});
my $mask = $args{mask} ? "-m $$opts{path}/$args{mask}" : '';
my $chain = $args{chain} ? "-c $$opts{path}/$args{chain}.new" : '';
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.vcf.gz -f $$opts{path}/$args{fa} $args{args} $mask $chain > /dev/null; cat $$opts{path}/$args{chain}.new");
}

0 comments on commit 6a9449c

Please sign in to comment.