diff --git a/NEWS b/NEWS index fdb2238c5..bbcfbdcc4 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,11 @@ Changes affecting specific commands: - Fix a bug where too many alleles passed to `-C alleles` via `-T` caused memory corruption (#1790) +* bcftools norm + + - New --multi-overlaps option allows to set overlapping alleles either to the + ref allele (the current default) or to a missing allele (#1764 and #1802) + * bcftools query - Fix a rare bug where the printing of SAMPLE field with `query` was incorrectly diff --git a/doc/bcftools.txt b/doc/bcftools.txt index eb6b6843a..457bde783 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -2366,6 +2366,10 @@ the *<>* option is supplied. 'both'; if SNPs and indels should be merged into a single record, specify 'any'. +*--multi-overlaps* '0'|'.':: + use the reference ('0') or missing ('.') allele for overlapping alleles after + splitting multiallelic sites + *--no-version*:: see *<>* diff --git a/test/norm.5.1.out b/test/norm.5.1.out new file mode 100644 index 000000000..180509fe2 --- /dev/null +++ b/test/norm.5.1.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=ref.fasta +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 511 . CAGCA CAG . . . GT 1|0 0|1 +1 511 . CAGCA CAA . . . GT 0|0 1|0 +1 511 . CAGCA CCA . . . GT 0|0 0|0 diff --git a/test/norm.5.2.out b/test/norm.5.2.out new file mode 100644 index 000000000..fd57823a1 --- /dev/null +++ b/test/norm.5.2.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=ref.fasta +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 511 . CAGCA CAG . . . GT 1|0 .|1 +1 511 . CAGCA CAA . . . GT .|. 1|. +1 511 . CAGCA CCA . . . GT .|. .|. diff --git a/test/norm.5.vcf b/test/norm.5.vcf new file mode 100644 index 000000000..4832f9c78 --- /dev/null +++ b/test/norm.5.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##reference=ref.fasta +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 511 . CAGCA CAG,CAA,CCA . . . GT 1|0 2|1 diff --git a/test/test.pl b/test/test.pl index ad9fb1e1d..f15eda3d7 100755 --- a/test/test.pl +++ b/test/test.pl @@ -264,6 +264,8 @@ run_test(\&test_vcf_norm,$opts,in=>'atomize.split.3',out=>'atomize.split.3.1.out',args=>'--atomize --atom-overlaps .'); run_test(\&test_vcf_norm,$opts,in=>'norm.4',out=>'norm.4.1.out',args=>'-m +both'); run_test(\&test_vcf_norm,$opts,in=>'norm.4',out=>'norm.4.2.out',args=>'-m +any'); +run_test(\&test_vcf_norm,$opts,in=>'norm.5',out=>'norm.5.1.out',args=>'-m - --multi-overlaps 0'); +run_test(\&test_vcf_norm,$opts,in=>'norm.5',out=>'norm.5.2.out',args=>'-m - --multi-overlaps .'); run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.1.out',args=>'-aUc1 -C1 -s NA00002 -v snps',reg=>''); run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.2.out',args=>'-f PASS -Xks NA00003',reg=>'-r20,Y'); run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.3.out',args=>'-xs NA00003',reg=>''); diff --git a/vcfnorm.c b/vcfnorm.c index 38c5de4e5..73ef2071d 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -102,7 +102,7 @@ typedef struct int record_cmd_line, force, force_warned, keep_sum_ad; abuf_t *abuf; abuf_opt_t atomize; - int use_star_allele; + int use_star_allele, ma_use_ref_allele; char *old_rec_tag; htsFile *out; } @@ -711,11 +711,14 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int for (j=0; jma_use_ref_allele) && bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is if ( bcf_gt_allele(gt[j])==ialt+1 ) gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT - else + else if ( args->ma_use_ref_allele ) gt[j] = bcf_gt_unphased(0) | bcf_gt_is_phased(gt[j]); // set to REF + else + gt[j] = bcf_gt_missing | bcf_gt_is_phased(gt[j]); // set to missing } gt += ngts; } @@ -2087,6 +2090,7 @@ static void usage(void) fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); fprintf(stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n"); fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); + fprintf(stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n"); fprintf(stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n"); @@ -2126,6 +2130,7 @@ int main_vcfnorm(int argc, char *argv[]) args->buf_win = 1000; args->mrows_collapse = COLLAPSE_BOTH; args->do_indels = 1; + args->ma_use_ref_allele = 1; args->clevel = -1; int region_is_file = 0; int targets_is_file = 0; @@ -2144,6 +2149,7 @@ int main_vcfnorm(int argc, char *argv[]) {"fasta-ref",required_argument,NULL,'f'}, {"do-not-normalize",no_argument,NULL,'N'}, {"multiallelics",required_argument,NULL,'m'}, + {"multi-overlaps",required_argument,NULL,13}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"regions-overlap",required_argument,NULL,1}, @@ -2177,6 +2183,11 @@ int main_vcfnorm(int argc, char *argv[]) else error("Invalid argument to --atom-overlaps. Perhaps you wanted: \"--atom-overlaps '*'\"?\n"); break; case 12 : args->old_rec_tag = optarg; break; + case 13 : + if ( optarg[0]=='0' ) args->ma_use_ref_allele = 1; + else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0; + else error("Invalid argument to --multi-overlaps\n"); + break; case 'N': args->do_indels = 0; break; case 'd': if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;