Skip to content

Commit

Permalink
Add new -i, --use-id option to +fixref for swapping ALT/REF by ID
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Mar 14, 2017
1 parent 77a5697 commit 425b8d4
Showing 1 changed file with 164 additions and 22 deletions.
186 changes: 164 additions & 22 deletions plugins/fixref.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,20 +81,36 @@
#include <htslib/kseq.h>
#include <htslib/kfunc.h>
#include <htslib/faidx.h>
#include <htslib/khash.h>
#include <htslib/synced_bcf_reader.h>
#include "bcftools.h"

#define MODE_STATS 1
#define MODE_TOP2FWD 2
#define MODE_FLIP2FWD 3
#define MODE_USE_ID 4

typedef struct
{
uint32_t pos;
uint8_t ref:4, alt:4;
}
marker_t;

KHASH_MAP_INIT_INT(i2m, marker_t)
typedef khash_t(i2m) i2m_t;

typedef struct
{
char *dbsnp_fname;
int mode, discard;
bcf_hdr_t *hdr;
faidx_t *fai;
int32_t *gts, ngts;
uint32_t nsite,nok,nflip,nerr,nswap,nflip_swap,nonSNP,nonACGT,nonbiallelic;
uint32_t count[4][4];
int rid, skip_rid;
i2m_t *i2m;
int32_t *gts, ngts, pos;
uint32_t nsite,nok,nflip,nunresolved,nswap,nflip_swap,nonSNP,nonACGT,nonbiallelic;
uint32_t count[4][4], npos_err, unsorted;
}
args_t;

Expand All @@ -112,6 +128,7 @@ const char *usage(void)
"About: This tool helps to determine and fix strand orientation.\n"
" Currently the following modes are recognised:\n"
" flip .. flips non-ambiguous SNPs and ignores the rest\n"
" id .. swap REF/ALT using the ID column to determine the REF allele\n"
" stats .. collect and print stats\n"
" top .. converts from Illumina TOP strand to fwd\n"
"\n"
Expand All @@ -127,7 +144,10 @@ const char *usage(void)
"Plugin options:\n"
" -d, --discard Discard sites which could not be resolved\n"
" -f, --fasta-ref <file.fa> Reference sequence\n"
" -m, --mode <string> Collect stats (\"stats\") or convert (\"top\", \"flip\") [stats]\n"
" -i, --use-id <file.vcf> Swap REF/ALT using the ID column to determine the REF allele, implies -m id.\n"
" Download the dbSNP file from\n"
" https://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf\n"
" -m, --mode <string> Collect stats (\"stats\") or convert (\"flip\", \"id\", \"top\") [stats]\n"
"\n"
"Examples:\n"
" # run stats\n"
Expand All @@ -136,6 +156,9 @@ const char *usage(void)
" # convert from TOP to fwd\n"
" bcftools +fixref file.bcf -Ob -o out.bcf -- -f ref.fa -m top\n"
"\n"
" # match the REF/ALT alleles based on the ID column, discard unknown sites\n"
" bcftools +fixref file.bcf -Ob -o out.bcf -- -d -f ref.fa -i All_20151104.vcf.gz\n"
"\n"
" # assuming the reference build is correct, just flip to fwd, discarding the rest\n"
" bcftools +fixref file.bcf -Ob -o out.bcf -- -d -f ref.fa -m flip\n"
"\n";
Expand All @@ -144,6 +167,7 @@ const char *usage(void)
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
memset(&args,0,sizeof(args_t));
args.skip_rid = -1;
args.hdr = in;
args.mode = MODE_STATS;
char *ref_fname = NULL;
Expand All @@ -152,19 +176,22 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{"mode",required_argument,NULL,'m'},
{"discard",no_argument,NULL,'d'},
{"fasta-ref",required_argument,NULL,'f'},
{"use-id",required_argument,NULL,'i'},
{NULL,0,NULL,0}
};
int c;
while ((c = getopt_long(argc, argv, "?hf:m:d",loptions,NULL)) >= 0)
while ((c = getopt_long(argc, argv, "?hf:m:di:",loptions,NULL)) >= 0)
{
switch (c)
{
case 'm':
if ( !strcasecmp(optarg,"top") ) args.mode = MODE_TOP2FWD;
else if ( !strcasecmp(optarg,"flip") ) args.mode = MODE_FLIP2FWD;
else if ( !strcasecmp(optarg,"id") ) args.mode = MODE_USE_ID;
else if ( !strcasecmp(optarg,"stats") ) args.mode = MODE_STATS;
else error("The source strand convention not recognised: %s\n", optarg);
break;
case 'i': args.dbsnp_fname = optarg; args.mode = MODE_USE_ID; break;
case 'd': args.discard = 1; break;
case 'f': ref_fname = optarg; break;
case 'h':
Expand Down Expand Up @@ -219,8 +246,108 @@ static inline int nt2int(char nt)
#define int2nt(x) "ACGT"[x]
#define revint(x) ("3210"[x]-'0')

inline uint32_t parse_rsid(char *name)
{
if ( name[0]!='r' || name[1]!='s' )
{
name = strstr(name, "rs");
if ( !name ) return 0;
}
char *tmp;
name += 2;
uint64_t id = strtol(name, &tmp, 10);
if ( tmp==name || *tmp ) return 0;
if ( id > UINT32_MAX ) error("FIXME: the ID is too big for uint32_t: %s\n", name-2);
return id;
}

int fetch_ref(args_t *args, bcf1_t *rec)
{
// Get the reference allele
int len;
char *ref = faidx_fetch_seq(args->fai, (char*)bcf_seqname(args->hdr,rec), rec->pos, rec->pos, &len);
if ( !ref )
{
if ( faidx_has_seq(args->fai, bcf_seqname(args->hdr,rec))==0 )
{
fprintf(stderr,"Ignoring sequence \"%s\"\n", bcf_seqname(args->hdr,rec));
args->skip_rid = rec->rid;
return -2;
}
error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
}
int ir = nt2int(*ref);
free(ref);
return ir;
}

void dbsnp_init(args_t *args, const char *chr)
{
if ( args->i2m ) kh_destroy(i2m, args->i2m);
args->i2m = kh_init(i2m);
bcf_srs_t *sr = bcf_sr_init();
if ( bcf_sr_set_regions(sr, chr, 0) != 0 ) goto done;
if ( !bcf_sr_add_reader(sr,args->dbsnp_fname) ) error("Failed to open %s: %s\n", args->dbsnp_fname,bcf_sr_strerror(sr->errnum));
while ( bcf_sr_next_line(sr) )
{
bcf1_t *rec = bcf_sr_get_line(sr, 0);
if ( rec->n_allele!=2 ) continue; // skip multiallelic markers
if ( rec->d.allele[0][1]!=0 || rec->d.allele[1][1]!=0 ) continue; // skip non-snps

int ref = nt2int(rec->d.allele[0][0]);
int alt = nt2int(rec->d.allele[1][0]);
if ( ref<0 || alt<0 ) continue; // non-[ACGT] base

uint32_t id = parse_rsid(rec->d.id);
if ( !id ) continue;

int ret, k;
k = kh_put(i2m, args->i2m, id, &ret);
if ( ret<0 ) error("An error occurred while inserting the key %u\n", id);
if ( ret==0 ) continue; // skip ambiguous id
kh_val(args->i2m, k).pos = (uint32_t)rec->pos;
kh_val(args->i2m, k).ref = ref;
kh_val(args->i2m, k).alt = alt;
}
done:
bcf_sr_destroy(sr);
}

bcf1_t *dbsnp_check(args_t *args, bcf1_t *rec, int ir, int ia, int ib)
{
int k, ref,alt,pos;
uint32_t id = parse_rsid(rec->d.id);
if ( !id ) goto no_info;

k = kh_get(i2m, args->i2m, id);
if ( k==kh_end(args->i2m) ) goto no_info;

pos = (int)kh_val(args->i2m, k).pos;
if ( pos != rec->pos )
{
rec->pos = pos;
ir = fetch_ref(args, rec);
args->npos_err++;
}

ref = kh_val(args->i2m, k).ref;
alt = kh_val(args->i2m, k).alt;

if ( ref!=ir )
error("Reference base mismatch at %s:%d .. %c vs %c\n",bcf_seqname(args->hdr,rec),rec->pos+1,int2nt(ref),int2nt(ir));

if ( ia==ref && ib==alt ) return rec;
if ( ia==alt && ib==ref ) { args->nswap++; return set_ref_alt(args,rec,int2nt(ib),int2nt(ia),0); }

no_info:
args->nunresolved++;
return args->discard ? NULL : rec;
}

bcf1_t *process(bcf1_t *rec)
{
if ( rec->rid == args.skip_rid ) return NULL;

bcf1_t *ret = args.mode==MODE_STATS ? NULL : rec;
args.nsite++;

Expand All @@ -232,12 +359,9 @@ bcf1_t *process(bcf1_t *rec)
}

// Get the reference allele
int len;
char *ref = faidx_fetch_seq(args.fai, (char*)bcf_seqname(args.hdr,rec), rec->pos, rec->pos, &len);
if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);
int ir = nt2int(*ref);
free(ref);
if ( ir<0 )
int ir = fetch_ref(&args, rec);
if ( ir==-2 ) return NULL;
if ( ir==-1 )
{
args.nonACGT++;
return args.discard ? NULL : ret; // not A,C,G,T
Expand Down Expand Up @@ -276,12 +400,32 @@ bcf1_t *process(bcf1_t *rec)

if ( ir==ia ) args.nok++;

if ( args.mode==MODE_FLIP2FWD )
if ( args.mode==MODE_USE_ID )
{
if ( !args.i2m || args.rid!=rec->rid )
{
args.pos = 0;
args.rid = rec->rid;
dbsnp_init(&args,bcf_seqname(args.hdr,rec));
}
ret = dbsnp_check(&args, rec, ir,ia,ib);
if ( !args.unsorted && args.pos > rec->pos )
{
fprintf(stderr,
"Warning: corrected position(s) results in unsorted VCF, for example %s:%d comes after %s:%d\n"
" The standard unix `sort` or `vcf-sort` from vcftools can be used to fix the order.\n",
bcf_seqname(args.hdr,rec),rec->pos+1,bcf_seqname(args.hdr,rec),args.pos);
args.unsorted = 1;
}
args.pos = rec->pos;
return ret;
}
else if ( args.mode==MODE_FLIP2FWD )
{
int pair = 1 << ia | 1 << ib;
if ( pair==0x9 || pair==0x6 ) // skip ambiguous pairs: A/T or C/G
{
args.nerr++;
args.nunresolved++;
return args.discard ? NULL : ret;
}
if ( ir==ia ) return ret;
Expand All @@ -290,7 +434,7 @@ bcf1_t *process(bcf1_t *rec)
if ( ir==revint(ib) ) { args.nflip_swap++; return set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),1); }
error("FIXME: this should not happen %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);
}
if ( args.mode==MODE_TOP2FWD )
else if ( args.mode==MODE_TOP2FWD )
{
if ( ia==0 && (ib==1 || ib==2) ) // unambiguous pair: A/C or A/G
{
Expand All @@ -314,8 +458,8 @@ bcf1_t *process(bcf1_t *rec)
}
else // ambiguous pair, sequence walking must be performed
{
int win = rec->pos > 100 ? 100 : rec->pos, beg = rec->pos - win, end = rec->pos + win;
ref = faidx_fetch_seq(args.fai, (char*)bcf_seqname(args.hdr,rec), beg,end, &len);
int len, win = rec->pos > 100 ? 100 : rec->pos, beg = rec->pos - win, end = rec->pos + win;
char *ref = faidx_fetch_seq(args.fai, (char*)bcf_seqname(args.hdr,rec), beg,end, &len);
if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);
if ( end - beg + 1 != len ) error("FIXME: check win=%d,len=%d at %s:%d (%d %d %d)\n", win,len, bcf_seqname(args.hdr,rec),rec->pos+1);

Expand Down Expand Up @@ -357,7 +501,7 @@ bcf1_t *process(bcf1_t *rec)
}
}

args.nerr++;
args.nunresolved++;
return args.discard ? NULL : ret;
}
}
Expand Down Expand Up @@ -417,17 +561,15 @@ void destroy(void)
fprintf(stderr,"NS\tflipped \t%u\t%.1f%%\n", args.nflip,100.*args.nflip/(args.nsite-nskip));
fprintf(stderr,"NS\tswapped \t%u\t%.1f%%\n", args.nswap,100.*args.nswap/(args.nsite-nskip));
fprintf(stderr,"NS\tflip+swap \t%u\t%.1f%%\n", args.nflip_swap,100.*args.nflip_swap/(args.nsite-nskip));
fprintf(stderr,"NS\tunresolved \t%u\t%.1f%%\n", args.nerr,100.*args.nerr/(args.nsite-nskip));
fprintf(stderr,"NS\tunresolved \t%u\t%.1f%%\n", args.nunresolved,100.*args.nunresolved/(args.nsite-nskip));
fprintf(stderr,"NS\tfixed pos \t%u\t%.1f%%\n", args.npos_err,100.*args.npos_err/(args.nsite-nskip));
}
fprintf(stderr,"NS\tskipped \t%u\n", nskip);
fprintf(stderr,"NS\tnon-ACGT \t%u\n", args.nonACGT);
fprintf(stderr,"NS\tnon-SNP \t%u\n", args.nonSNP);
fprintf(stderr,"NS\tnon-biallelic\t%u\n", args.nonbiallelic);

//fprintf(stderr,"Number of correct/swapped/flipped/unresolved sites:\t%u\t%u\t%u\t%u\n", args.nok,args.nswap,args.nflip,args.nerr);
//fprintf(stderr,"Percentage of correct/swapped/flipped/unresolved sites:\t%.1f\t%.1f\t%.1f\t%.1f\n",
// 100.*args.nok/args.ncmp,100.*args.nswap/args.ncmp,100.*args.nflip/args.ncmp,100.*args.nerr/args.ncmp);

free(args.gts);
if ( args.fai ) fai_destroy(args.fai);
if ( args.i2m ) kh_destroy(i2m, args.i2m);
}

0 comments on commit 425b8d4

Please sign in to comment.