Skip to content

Commit

Permalink
Allow multiallelic sites in the -i, --use-id reference.
Browse files Browse the repository at this point in the history
Also flip genotypes, not just REF/ALT!
  • Loading branch information
pd3 committed May 25, 2017
1 parent 7eb67ba commit 519f310
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 12 deletions.
17 changes: 6 additions & 11 deletions plugins/fixref.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
typedef struct
{
uint32_t pos;
uint8_t ref:4, alt:4;
uint8_t ref;
}
marker_t;

Expand Down Expand Up @@ -129,7 +129,7 @@ const char *usage(void)
"About: This tool helps to determine and fix strand orientation.\n"
" Currently the following modes are recognised:\n"
" flip .. flips non-ambiguous SNPs and ignores the rest\n"
" id .. swap REF/ALT using the ID column to determine the REF allele\n"
" id .. swap REF/ALT and GTs using the ID column to determine the REF allele\n"
" stats .. collect and print stats\n"
" top .. converts from Illumina TOP strand to fwd\n"
"\n"
Expand Down Expand Up @@ -292,12 +292,10 @@ static void dbsnp_init(args_t *args, const char *chr)
while ( bcf_sr_next_line(sr) )
{
bcf1_t *rec = bcf_sr_get_line(sr, 0);
if ( rec->n_allele!=2 ) continue; // skip multiallelic markers
if ( rec->d.allele[0][1]!=0 || rec->d.allele[1][1]!=0 ) continue; // skip non-snps

int ref = nt2int(rec->d.allele[0][0]);
int alt = nt2int(rec->d.allele[1][0]);
if ( ref<0 || alt<0 ) continue; // non-[ACGT] base
if ( ref<0 ) continue; // non-[ACGT] base

uint32_t id = parse_rsid(rec->d.id);
if ( !id ) continue;
Expand All @@ -308,15 +306,14 @@ static void dbsnp_init(args_t *args, const char *chr)
if ( ret==0 ) continue; // skip ambiguous id
kh_val(args->i2m, k).pos = (uint32_t)rec->pos;
kh_val(args->i2m, k).ref = ref;
kh_val(args->i2m, k).alt = alt;
}
done:
bcf_sr_destroy(sr);
}

static bcf1_t *dbsnp_check(args_t *args, bcf1_t *rec, int ir, int ia, int ib)
{
int k, ref,alt,pos;
int k, ref,pos;
uint32_t id = parse_rsid(rec->d.id);
if ( !id ) goto no_info;

Expand All @@ -332,13 +329,11 @@ static bcf1_t *dbsnp_check(args_t *args, bcf1_t *rec, int ir, int ia, int ib)
}

ref = kh_val(args->i2m, k).ref;
alt = kh_val(args->i2m, k).alt;

if ( ref!=ir )
error("Reference base mismatch at %s:%d .. %c vs %c\n",bcf_seqname(args->hdr,rec),rec->pos+1,int2nt(ref),int2nt(ir));

if ( ia==ref && ib==alt ) return rec;
if ( ia==alt && ib==ref ) { args->nswap++; return set_ref_alt(args,rec,int2nt(ib),int2nt(ia),0); }
if ( ia==ref ) return rec;
if ( ib==ref ) { args->nswap++; return set_ref_alt(args,rec,int2nt(ib),int2nt(ia),1); }

no_info:
args->nunresolved++;
Expand Down
11 changes: 11 additions & 0 deletions test/fixref.2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=1,length=2147483647>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT XY00001
1 1 rs1 C T . . . GT 0/1
1 2 rs2 T A . . . GT 0/1
1 3 rs3 A T . . . GT 0/1
1 4 rs4 A T . . . GT 0/1
1 5 rs5 C T . . . GT 0/1
1 6 rs6 C T . . . GT 0/1
10 changes: 10 additions & 0 deletions test/fixref.2a.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=1,length=2147483647>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT XY00001
1 1 rs1 C T . . . GT 0/1
1 2 rs2 A T . . . GT 1/0
1 3 rs3 A T . . . GT 0/1
1 4 rs4 T A . . . GT 1/0
1 5 rs5 C T . . . GT 0/1
1 6 rs6 T C . . . GT 1/0
10 changes: 10 additions & 0 deletions test/fixref.2b.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=1,length=2147483647>
#CHROM POS ID REF ALT QUAL FILTER INFO
1 1 rs1 C T . . .
1 2 rs2 T G . . .
1 3 rs3 A T,C . . .
1 4 rs4 A T,C . . .
1 5 rs5 C A,T . . .
1 6 rs6 C A,T . . .
8 changes: 7 additions & 1 deletion test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@
test_vcf_plugin($opts,in=>'trio',out=>'trio.out',cmd=>'+trio-switch-rate',args=>'-- -p {PATH}/trio.ped | grep -v bcftools');
test_vcf_plugin($opts,in=>'ad-bias',out=>'ad-bias.out',cmd=>'+ad-bias',args=>'-- -s {PATH}/ad-bias.samples | grep -v bcftools');
test_vcf_plugin($opts,in=>'af-dist',out=>'af-dist.out',cmd=>'+af-dist',args=>' | grep -v bcftools');
test_vcf_plugin($opts,in=>'fixref',out=>'fixref.1.out',cmd=>'+fixref',args=>'-- -f {PATH}/norm.fa -m top');
test_vcf_plugin($opts,in=>'fixref.2a',out=>'fixref.2.out',index=>['fixref.2b'],cmd=>'+fixref',args=>'-- -f {PATH}/norm.fa -i {TMP}/fixref.2b.vcf.gz');
test_vcf_plugin($opts,in=>'aa',out=>'aa.out',cmd=>'+fill-from-fasta',args=>'-- -f {PATH}/aa.fa -c AA -h {PATH}/aa.hdr -i \'TYPE="snp"\'');
test_vcf_plugin($opts,in=>'ref',out=>'ref.out',cmd=>'+fill-from-fasta',args=>'-- -f {PATH}/norm.fa -c REF');
test_vcf_plugin($opts,in=>'view',out=>'view.GTsubset.NA1.out',cmd=>'+GTsubset --no-version',args=>'-- -s NA00001');
Expand Down Expand Up @@ -864,7 +864,13 @@ sub test_vcf_plugin
if ( !exists($args{args}) ) { $args{args} = ''; }
$args{args} =~ s/{PATH}/$$opts{path}/g;
$args{cmd} =~ s/{PATH}/$$opts{path}/g;
$args{args} =~ s/{TMP}/$$opts{tmp}/g;
$args{cmd} =~ s/{TMP}/$$opts{tmp}/g;
bgzip_tabix_vcf($opts,"$args{in}");
if ( exists($args{index}) )
{
for my $file (@{$args{index}}) { bgzip_tabix_vcf($opts,$file); }
}
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools $args{cmd} $$opts{tmp}/$args{in}.vcf.gz $args{args} 2>/dev/null | grep -v ^##bcftools_");
cmd("$$opts{bin}/bcftools view -Ob $$opts{tmp}/$args{in}.vcf.gz > $$opts{tmp}/$args{in}.bcf");
Expand Down

0 comments on commit 519f310

Please sign in to comment.