Skip to content

Commit

Permalink
Make consensus skip duplicate insertions. Fixes samtools#888
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Dec 21, 2018
1 parent bf526a0 commit 27c9e5d
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 5 deletions.
22 changes: 17 additions & 5 deletions consensus.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ typedef struct
int fa_src_pos; // last genomic coordinate read from the input fasta (0-based)
char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778
int prev_base_pos; // the position of prev_base
int prev_is_insert;

rbuf_t vcf_rbuf;
bcf1_t **vcf_buf;
Expand Down Expand Up @@ -531,11 +532,18 @@ static void apply_variant(args_t *args, bcf1_t *rec)
ialt = 1;
}

// Overlapping variant? Can be still OK iff this is an insertion
if ( rec->pos <= args->fa_frz_pos && (rec->pos!=args->fa_frz_pos || rec->d.allele[0][0]!=rec->d.allele[ialt][0]) )
// Overlapping variant?
if ( rec->pos <= args->fa_frz_pos )
{
fprintf(stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
return;
// Can be still OK iff this is an insertion (and which does not follow another insertion, see #888).
// This still may not be enough for more complicated cases with multiple duplicate positions
// and other types in between. In such case let the user normalize the VCF and remove duplicates.
if ( !(bcf_get_variant_type(rec,ialt) & VCF_INDEL) || rec->d.var[ialt].n <= 0 || args->prev_is_insert )
{
fprintf(stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
return;
}

}

int len_diff = 0, alen = 0;
Expand Down Expand Up @@ -592,7 +600,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
error(
"The fasta sequence does not match the REF allele at %s:%d:\n"
" .vcf: [%s]\n"
" .vcf: [%s] <- (REF)\n"
" .vcf: [%s] <- (ALT)\n"
" .fa: [%s]%c%s\n",
bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
Expand Down Expand Up @@ -624,9 +632,13 @@ static void apply_variant(args_t *args, bcf1_t *rec)

args->prev_base = rec->d.allele[0][rec->rlen - 1];
args->prev_base_pos = rec->pos + rec->rlen - 1;
args->prev_is_insert = 0;
}
else
{
args->prev_is_insert = 1;
args->prev_base_pos = rec->pos;

// insertion
ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
Expand Down
2 changes: 2 additions & 0 deletions test/consensus5.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>4:86698932-86698942
GTGAATGAATG
2 changes: 2 additions & 0 deletions test/consensus5.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>4:86698932-86698942
GTGAATGAATGAATG
7 changes: 7 additions & 0 deletions test/consensus5.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.1
##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
##contig=<ID=4,assembly=b37,length=191154276>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00119
4 86698932 rs34791496 G GTGAA 100 PASS . GT 0|1
4 86698932 rs538589807;rs569662225 GTGAA GTGAATGAA,G 100 PASS . GT 0|1
1 change: 1 addition & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@
test_vcf_consensus($opts,in=>'consensus3',out=>'consensus3.out',fa=>'consensus2.fa',args=>'-H 2 -M "?"');
test_vcf_consensus($opts,in=>'consensus3',out=>'consensus3.2.out',fa=>'consensus2.fa',args=>'-H 2 -M "?" -p xx_');
test_vcf_consensus($opts,in=>'consensus4',out=>'consensus4.out',fa=>'consensus2.fa',args=>'');
test_vcf_consensus($opts,in=>'consensus5',out=>'consensus5.out',fa=>'consensus5.fa',args=>'--haplotype LA');
test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1);
test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite
test_mpileup($opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite
Expand Down

0 comments on commit 27c9e5d

Please sign in to comment.