Skip to content

Commit

Permalink
New norm --multi-overlaps option. Resolves samtools#1802 and samtoo…
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Oct 12, 2022
1 parent 79c48e2 commit d984ce9
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 4 deletions.
5 changes: 5 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ Changes affecting specific commands:
- Fix a bug where too many alleles passed to `-C alleles` via `-T` caused memory
corruption (#1790)

* bcftools norm

- New --multi-overlaps option allows to set overlapping alleles either to the
ref allele (the current default) or to a missing allele (#1764 and #1802)

* bcftools query

- Fix a rare bug where the printing of SAMPLE field with `query` was incorrectly
Expand Down
4 changes: 4 additions & 0 deletions doc/bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2366,6 +2366,10 @@ the *<<fasta_ref,--fasta-ref>>* option is supplied.
'both'; if SNPs and indels should be merged into a single record, specify
'any'.

*--multi-overlaps* '0'|'.'::
use the reference ('0') or missing ('.') allele for overlapping alleles after
splitting multiallelic sites

*--no-version*::
see *<<common_options,Common Options>>*

Expand Down
9 changes: 9 additions & 0 deletions test/norm.5.1.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##reference=ref.fasta
##contig=<ID=1,length=51304566>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B
1 511 . CAGCA CAG . . . GT 1|0 0|1
1 511 . CAGCA CAA . . . GT 0|0 1|0
1 511 . CAGCA CCA . . . GT 0|0 0|0
9 changes: 9 additions & 0 deletions test/norm.5.2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##reference=ref.fasta
##contig=<ID=1,length=51304566>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B
1 511 . CAGCA CAG . . . GT 1|0 .|1
1 511 . CAGCA CAA . . . GT .|. 1|.
1 511 . CAGCA CCA . . . GT .|. .|.
6 changes: 6 additions & 0 deletions test/norm.5.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##fileformat=VCFv4.2
##reference=ref.fasta
##contig=<ID=1,length=51304566>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B
1 511 . CAGCA CAG,CAA,CCA . . . GT 1|0 2|1
2 changes: 2 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@
run_test(\&test_vcf_norm,$opts,in=>'atomize.split.3',out=>'atomize.split.3.1.out',args=>'--atomize --atom-overlaps .');
run_test(\&test_vcf_norm,$opts,in=>'norm.4',out=>'norm.4.1.out',args=>'-m +both');
run_test(\&test_vcf_norm,$opts,in=>'norm.4',out=>'norm.4.2.out',args=>'-m +any');
run_test(\&test_vcf_norm,$opts,in=>'norm.5',out=>'norm.5.1.out',args=>'-m - --multi-overlaps 0');
run_test(\&test_vcf_norm,$opts,in=>'norm.5',out=>'norm.5.2.out',args=>'-m - --multi-overlaps .');
run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.1.out',args=>'-aUc1 -C1 -s NA00002 -v snps',reg=>'');
run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.2.out',args=>'-f PASS -Xks NA00003',reg=>'-r20,Y');
run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.3.out',args=>'-xs NA00003',reg=>'');
Expand Down
19 changes: 15 additions & 4 deletions vcfnorm.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* vcfnorm.c -- Left-align and normalize indels.
Copyright (C) 2013-2021 Genome Research Ltd.
Copyright (C) 2013-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Expand Down Expand Up @@ -102,7 +102,7 @@ typedef struct
int record_cmd_line, force, force_warned, keep_sum_ad;
abuf_t *abuf;
abuf_opt_t atomize;
int use_star_allele;
int use_star_allele, ma_use_ref_allele;
char *old_rec_tag;
htsFile *out;
}
Expand Down Expand Up @@ -711,11 +711,14 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int
for (j=0; j<ngts; j++)
{
if ( gt[j]==bcf_int32_vector_end ) break;
if ( bcf_gt_is_missing(gt[j]) || bcf_gt_allele(gt[j])==0 ) continue; // missing allele or ref: leave as is
if ( bcf_gt_is_missing(gt[j]) ) continue; // missing allele: leave as is
if ( (ialt==0 || args->ma_use_ref_allele) && bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is
if ( bcf_gt_allele(gt[j])==ialt+1 )
gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT
else
else if ( args->ma_use_ref_allele )
gt[j] = bcf_gt_unphased(0) | bcf_gt_is_phased(gt[j]); // set to REF
else
gt[j] = bcf_gt_missing | bcf_gt_is_phased(gt[j]); // set to missing
}
gt += ngts;
}
Expand Down Expand Up @@ -2087,6 +2090,7 @@ static void usage(void)
fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
fprintf(stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
fprintf(stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n");
fprintf(stderr, " --no-version Do not append version and command line to the header\n");
fprintf(stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n");
fprintf(stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n");
Expand Down Expand Up @@ -2126,6 +2130,7 @@ int main_vcfnorm(int argc, char *argv[])
args->buf_win = 1000;
args->mrows_collapse = COLLAPSE_BOTH;
args->do_indels = 1;
args->ma_use_ref_allele = 1;
args->clevel = -1;
int region_is_file = 0;
int targets_is_file = 0;
Expand All @@ -2144,6 +2149,7 @@ int main_vcfnorm(int argc, char *argv[])
{"fasta-ref",required_argument,NULL,'f'},
{"do-not-normalize",no_argument,NULL,'N'},
{"multiallelics",required_argument,NULL,'m'},
{"multi-overlaps",required_argument,NULL,13},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
{"regions-overlap",required_argument,NULL,1},
Expand Down Expand Up @@ -2177,6 +2183,11 @@ int main_vcfnorm(int argc, char *argv[])
else error("Invalid argument to --atom-overlaps. Perhaps you wanted: \"--atom-overlaps '*'\"?\n");
break;
case 12 : args->old_rec_tag = optarg; break;
case 13 :
if ( optarg[0]=='0' ) args->ma_use_ref_allele = 1;
else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0;
else error("Invalid argument to --multi-overlaps\n");
break;
case 'N': args->do_indels = 0; break;
case 'd':
if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
Expand Down

0 comments on commit d984ce9

Please sign in to comment.