Skip to content

Commit

Permalink
regidx: extend regidx API
Browse files Browse the repository at this point in the history
* added functions to loop over all regions
* lazy index build in case random access is not required
* support for chromosome names only, beg-end coordinates not mandatory
* set cap at maximum coordinate at 2147483647, hts_itr does not support larger
* tab and reg parsers will throw on finding a `0` to catch user
  error of using 0-based rather than 1-based coords
  • Loading branch information
mcshane committed Aug 5, 2016
1 parent 2aacec6 commit ef6773a
Show file tree
Hide file tree
Showing 4 changed files with 223 additions and 59 deletions.
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ OBJS = main.o vcfindex.o tabix.o \
vcfnorm.o vcfgtcheck.o vcfview.o vcfannotate.o vcfroh.o vcfconcat.o \
vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o tsv2vcf.o \
vcfcnv.o HMM.o vcfplugin.o consensus.o ploidy.o bin.o hclust.o version.o \
regidx.o \
mpileup.o bam2bcf.o bam2bcf_indel.o sample.o \
ccall.o em.o prob1.o kmin.o # the original samtools calling

Expand Down Expand Up @@ -178,8 +179,9 @@ ploidy.o: ploidy.c regidx.h $(HTSDIR)/htslib/khash_str2int.h $(HTSDIR)/htslib/ks
polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(bcftools_h) peakfit.h
peakfit.o: peakfit.c peakfit.h $(htslib_hts_h) $(HTSDIR)/htslib/kstring.h
bin.o: bin.c $(bin_h)
regidx.o: regidx.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) regidx.h
consensus.o: consensus.c $(htslib_hts_h) $(HTSDIR)/htslib/kseq.h rbuf.h $(bcftools_h) regidx.h
mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(htslib_regidx_h) $(bcftools_h) $(call_h) $(bam2bcf_h) $(sample_h)
mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) regidx.h $(bcftools_h) $(call_h) $(bam2bcf_h) $(sample_h)
sample.o: $(sample_h) $(htslib_hts_h) $(HTSDIR)/htslib/khash_str2int.h
version.o: version.h version.c
hclust.o: hclust.c hclust.h
Expand Down
2 changes: 1 addition & 1 deletion mpileup.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/faidx.h>
#include <htslib/kstring.h>
#include <htslib/khash_str2int.h>
#include <htslib/regidx.h>
#include <assert.h>
#include "regidx.h"
#include "bcftools.h"
#include "bam2bcf.h"
#include "sample.h"
Expand Down
223 changes: 173 additions & 50 deletions regidx.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (C) 2014 Genome Research Ltd.
Copyright (C) 2014-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Expand Down Expand Up @@ -28,6 +28,7 @@
#include <htslib/khash_str2int.h>
#include "regidx.h"

#define MAX_COOR 2147483647 // CSI and hts_itr_query limit
#define LIDX_SHIFT 13 // number of insignificant index bits

// List of regions for one chromosome
Expand All @@ -53,7 +54,8 @@ struct _regidx_t

// temporary data for index initialization
kstring_t str;
int rid_prev, start_prev, end_prev;
int rid_prev;
uint32_t start_prev, end_prev;
int payload_size;
void *payload;
};
Expand All @@ -78,50 +80,68 @@ char **regidx_seq_names(regidx_t *idx, int *n)
return idx->seq_names;
}

int _regidx_build_index(regidx_t *idx)
int _regidx_build_index(regidx_t *idx, reglist_t *list)
{
int iseq;
for (iseq=0; iseq<idx->nseq; iseq++)
int j,k, imax = 0; // max index bin
for (j=0; j<list->nregs; j++)
{
reglist_t *list = &idx->seq[iseq];
int j,k, imax = 0; // max index bin
for (j=0; j<list->nregs; j++)
int ibeg = list->regs[j].start >> LIDX_SHIFT;
int iend = list->regs[j].end >> LIDX_SHIFT;
if ( imax < iend + 1 )
{
int ibeg = list->regs[j].start >> LIDX_SHIFT;
int iend = list->regs[j].end >> LIDX_SHIFT;
if ( imax < iend + 1 )
{
int old_imax = imax;
imax = iend + 1;
kroundup32(imax);
list->idx = (int*) realloc(list->idx, imax*sizeof(int));
for (k=old_imax; k<imax; k++) list->idx[k] = -1;
}
if ( ibeg==iend )
{
if ( list->idx[ibeg]<0 ) list->idx[ibeg] = j;
}
else
{
for (k=ibeg; k<=iend; k++)
if ( list->idx[k]<0 ) list->idx[k] = j;
}
list->nidx = iend + 1;
int old_imax = imax;
imax = iend + 1;
kroundup32(imax);
list->idx = (int*) realloc(list->idx, imax*sizeof(int));
for (k=old_imax; k<imax; k++) list->idx[k] = -1;
}
if ( ibeg==iend )
{
if ( list->idx[ibeg]<0 ) list->idx[ibeg] = j;
}
else
{
for (k=ibeg; k<=iend; k++)
if ( list->idx[k]<0 ) list->idx[k] = j;
}
list->nidx = iend + 1;
}
return 0;
}

int regidx_insert_list(regidx_t *idx, char *line, char delim)
{
kstring_t tmp = {0,0,0};
char *ss = line;
while ( *ss )
{
char *se = ss;
while ( *se && *se!=delim ) se++;
tmp.l = 0;
kputsn(ss, se-ss, &tmp);
if ( regidx_insert(idx,tmp.s) < 0 )
{
free(tmp.s);
return -1;
}
if ( !*se ) break;
ss = se+1;
}
free(tmp.s);
return 0;
}

int regidx_insert(regidx_t *idx, char *line)
{
if ( !line )
return _regidx_build_index(idx);
if ( !line ) return 0;

char *chr_from, *chr_to;
reg_t reg;
int ret = idx->parse(line,&chr_from,&chr_to,&reg,idx->payload,idx->usr);
if ( ret==-2 ) return -1; // error
if ( ret==-1 ) return 0; // skip the line
if ( reg.start > MAX_COOR - 1 ) reg.start = MAX_COOR - 1;
if ( reg.end > MAX_COOR - 1 ) reg.end = MAX_COOR - 1;

int rid;
idx->str.l = 0;
Expand Down Expand Up @@ -151,7 +171,7 @@ int regidx_insert(regidx_t *idx, char *line)
{
if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) )
{
fprintf(stderr,"The regions are not sorted: %s:%d-%d is before %s:%d-%d\n",
fprintf(stderr,"The regions are not sorted: %s:%u-%u is before %s:%u-%u\n",
idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1);
return -1;
}
Expand Down Expand Up @@ -187,8 +207,8 @@ regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f fr
idx->usr = usr_dat;
idx->seq2regs = khash_str2int_init();
idx->rid_prev = -1;
idx->start_prev = -1;
idx->end_prev = -1;
idx->start_prev = 0;
idx->end_prev = 0;
idx->payload_size = payload_size;
if ( payload_size ) idx->payload = malloc(payload_size);

Expand Down Expand Up @@ -249,29 +269,42 @@ int regidx_overlap(regidx_t *idx, const char *chr, uint32_t from, uint32_t to, r
reglist_t *list = &idx->seq[iseq];
if ( !list->nregs ) return 0;

int i, ibeg = from>>LIDX_SHIFT;
int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ];
if ( ireg < 0 )
int i;
if ( list->nregs==1 )
{
// linear search; if slow, replace with binary search
if ( ibeg > list->nidx ) ibeg = list->nidx;
for (i=ibeg - 1; i>=0; i--)
if ( list->idx[i] >=0 ) break;
ireg = i>=0 ? list->idx[i] : 0;
// one region only, the index was not built to save memory
if ( idx->seq[iseq].regs[0].start > to ) return 0; // no match
if ( idx->seq[iseq].regs[0].end < from ) return 0;
i = 0;
}
for (i=ireg; i<list->nregs; i++)
else
{
if ( list->regs[i].start > to ) return 0; // no match
if ( list->regs[i].end >= from && list->regs[i].start <= to ) break; // found
}
if ( !list->idx ) _regidx_build_index(idx, list);

if ( i>=list->nregs ) return 0; // no match
int ibeg = from>>LIDX_SHIFT;
int ireg = ibeg < list->nidx ? list->idx[ibeg] : list->idx[ list->nidx - 1 ];
if ( ireg < 0 )
{
// linear search; if slow, replace with binary search
if ( ibeg > list->nidx ) ibeg = list->nidx;
for (i=ibeg - 1; i>=0; i--)
if ( list->idx[i] >=0 ) break;
ireg = i>=0 ? list->idx[i] : 0;
}
for (i=ireg; i<list->nregs; i++)
{
if ( list->regs[i].start > to ) return 0; // no match
if ( list->regs[i].end >= from ) break; // found
}

if ( i>=list->nregs ) return 0; // no match
}
if ( !itr ) return 1;

itr->i = 0;
itr->n = list->nregs - i;
itr->reg = &idx->seq[iseq].regs[i];
itr->seq = iseq;
if ( idx->payload_size )
itr->payload = idx->seq[iseq].payload + i*idx->payload_size;
else
Expand All @@ -289,11 +322,18 @@ int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, reg_t *re

char *se = ss;
while ( *se && !isspace(*se) ) se++;
if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }

*chr_beg = ss;
*chr_end = se-1;

if ( !*se )
{
// just the chromosome name
reg->start = 0;
reg->end = MAX_COOR - 1;
return 0;
}

ss = se+1;
reg->start = hts_parse_decimal(ss, &se, 0);
if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
Expand All @@ -314,14 +354,23 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *re

char *se = ss;
while ( *se && !isspace(*se) ) se++;
if ( !*se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }

*chr_beg = ss;
*chr_end = se-1;

if ( !*se )
{
// just the chromosome name
reg->start = 0;
reg->end = MAX_COOR - 1;
return 0;
}

ss = se+1;
reg->start = hts_parse_decimal(ss, &se, 0) - 1;
if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
reg->start = hts_parse_decimal(ss, &se, 0);
if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; }
if ( reg->start==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
reg->start--;

if ( !se[0] || !se[1] )
reg->end = reg->start;
Expand All @@ -330,9 +379,83 @@ int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, reg_t *re
ss = se+1;
reg->end = hts_parse_decimal(ss, &se, 0);
if ( ss==se ) reg->end = reg->start;
else if ( reg->end==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
else reg->end--;
}
return 0;
}

int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
{
char *ss = (char*) line;
while ( *ss && isspace(*ss) ) ss++;
if ( !*ss ) return -1; // skip blank lines
if ( *ss=='#' ) return -1; // skip comments

char *se = ss;
while ( *se && *se!=':' ) se++;

*chr_beg = ss;
*chr_end = se-1;

if ( !*se )
{
reg->start = 0;
reg->end = MAX_COOR - 1;
return 0;
}

ss = se+1;
reg->start = hts_parse_decimal(ss, &se, 0);
if ( ss==se ) { fprintf(stderr,"Could not parse reg line: %s\n", line); return -2; }
if ( reg->start==0 ) { fprintf(stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; }
reg->start--;

if ( !se[0] || !se[1] )
reg->end = se[0]=='-' ? MAX_COOR - 1 : reg->start;
else
{
ss = se+1;
reg->end = hts_parse_decimal(ss, &se, 0);
if ( ss==se ) reg->end = reg->start;
else if ( reg->end==0 ) { fprintf(stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; }
else reg->end--;
}
return 0;
}

int regidx_loop(regidx_t *idx, regitr_t *itr)
{
if ( !itr->reg )
{
if ( !idx->nseq ) return 0;
itr->i = itr->n = 0;
itr->seq = -1;
}

itr->i++;

if ( itr->i >= itr->n )
{
itr->seq++;
if ( itr->seq >= idx->nseq ) return 0;

reglist_t *list = &idx->seq[itr->seq];
itr->reg = &list->regs[0];
itr->i = 0;
itr->n = list->nregs;
}

if ( idx->payload_size )
itr->payload = idx->seq[itr->seq].payload + itr->i*idx->payload_size;
else
itr->payload = NULL;

return 1;
}

const char *regitr_seqname(regidx_t *idx, regitr_t *itr)
{
return (const char*)idx->seq_names[itr->seq];
}

Loading

0 comments on commit ef6773a

Please sign in to comment.