Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added Pitch6lcatest.lca.gz
Empty file.
Empty file added Pitch6lcatest.stat.gz
Empty file.
Binary file added acc2tax.filt.gz
Binary file not shown.
Binary file added metaDMG-cpp
Binary file not shown.
Binary file added misc/MAP
Binary file not shown.
Binary file added misc/a.out
Binary file not shown.
Binary file added misc/compressbam
Binary file not shown.
Binary file added misc/dfit_optim
Binary file not shown.
Binary file added misc/extract_reads
Binary file not shown.
5 changes: 3 additions & 2 deletions ngsLCA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,7 @@ int2node makeNodes(int2int &parent){
int main_lca(int argc, char **argv) {
htsFormat *dingding2 = (htsFormat *)calloc(1, sizeof(htsFormat));
if (argc == 1) {
fprintf(stderr, "\t-> ./metaDMG-cpp lca --names --nodes --acc2tax [-edit_dist_[min/max] --sim_score_[low/high] --min_mapq --bam --lca_rank --used_reads [0,1] --weight_type [0,1] #%s \n", METADAMAGE_VERSION);
fprintf(stderr, "\t-> ./metaDMG-cpp lca --names --nodes --acc2tax [-edit_dist_[min/max] --sim_score_[low/high] --min_mapq --bam --lca_rank --filtered_acc2tax --used_reads [0,1] --weight_type [0,1] #%s \n", METADAMAGE_VERSION);
return 0;
}
catchkill();
Expand All @@ -738,7 +738,8 @@ int main_lca(int argc, char **argv) {
int2int *i2i = NULL;
// fprintf(stderr,"p->header: %p\n",p->header);
if (p->header)
i2i = (int2int *)bamRefId2tax(p->header, p->acc2taxfile, p->htsfile, errmap, p->tempfolder,p->reallyDump);
i2i = bamRefId2tax(p->header, p->acc2taxfile, p->htsfile, errmap, p->tempfolder, p->reallyDump, p->filteredAcc2taxfile);


// map of taxid -> taxid
int2int parent;
Expand Down
5 changes: 5 additions & 0 deletions ngsLCA_cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pars *pars_init() {
pars *p = (pars *)calloc(1, sizeof(pars));
p->htsfile = strdup("CHL_155_12485.sort.bam");
p->acc2taxfile = strdup("nucl_gb.accession2taxid.gz");
p->filteredAcc2taxfile = strdup("acc2tax.filt.gz");
p->namesfile = strdup("names.dmp.gz");
p->nodesfile = strdup("nodes.dmp.gz");
p->hts = NULL;
Expand Down Expand Up @@ -205,6 +206,8 @@ pars *get_pars(int argc, char **argv) {
} else if (!strcasecmp("--acc2tax", key)) {
free(p->acc2taxfile);
p->acc2taxfile = strdup(val);
} else if (!strcasecmp("--filtered_acc2tax", key)) {
p->filteredAcc2taxfile = strdup(val);
} else if (!strcasecmp("--edit_dist_min", key))
p->editdistMin = atoi(val);
else if (!strcasecmp("--edit_dist_max", key))
Expand Down Expand Up @@ -313,6 +316,8 @@ void print_pars(FILE *fp, pars *p) {
fprintf(fp, "\t-> --weight_type\t%d\n", p->weighttype);
fprintf(fp, "\t-> --ignore_errors\t%d\n", p->ignore_errors);
fprintf(fp, "\t-> --temp\t%s\n", p->tempfolder);
fprintf(fp, "\t-> --filtered_acc2tax\t%s\n", p->filteredAcc2taxfile);

}

#ifdef __WITH_MAIN__
Expand Down
1 change: 1 addition & 0 deletions ngsLCA_cli.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ typedef struct {
char *nodesfile;
char *namesfile;
char *acc2taxfile;
char *filteredAcc2taxfile;
// hts strucutures
samFile *hts;
bam_hdr_t *header;
Expand Down
65 changes: 45 additions & 20 deletions shared.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,8 @@ void parse_nodes2(int2int &parent, int2intvec &child) {
// exit(0);
}

// bamfile is only used for making smaller filedump (using the filename)
int SIG_COND = 1;
int2int *bamRefId2tax(bam_hdr_t *hdr, char *acc2taxfile, char *bamfile, int2int &errmap, char *tempfolder,int reallyDump) {
int2int *bamRefId2tax(bam_hdr_t *hdr, char *acc2taxfile, char *bamfile, int2int &errmap, char *tempfolder, int reallyDump, char *filteredAcc2taxfile) {
fprintf(stderr, "\t-> Starting to extract (acc->taxid) from binary file: \'%s\'\n", acc2taxfile);
fflush(stderr);
int dodump = !fexists4(tempfolder, basename(acc2taxfile), basename(bamfile), ".bin");
Expand All @@ -239,49 +238,63 @@ int2int *bamRefId2tax(bam_hdr_t *hdr, char *acc2taxfile, char *bamfile, int2int

time_t t = time(NULL);
BGZF *fp = NULL;
if (dodump){
if(reallyDump)
fp = getbgzf4(tempfolder, basename(acc2taxfile), basename(bamfile), ".bin", "wb", 4);
}else
if (dodump) {
if (reallyDump)
fp = getbgzf4(tempfolder, basename(acc2taxfile), basename(bamfile), ".bin", "wb", 4);
} else {
fp = getbgzf4(tempfolder, basename(acc2taxfile), basename(bamfile), ".bin", "rb", 4);
// this contains refname(as int) -> taxid
}
// This contains refname(as int) -> taxid
int2int *am = new int2int;

// Open the filtered output file if specified
gzFile filteredFile = Z_NULL;
if (filteredAcc2taxfile != nullptr) {
filteredFile = gzopen(filteredAcc2taxfile, "wb");
if (!filteredFile) {
fprintf(stderr, "Error opening file '%s' for writing\n", filteredAcc2taxfile);
exit(1);
}
gzprintf(filteredFile, "BAM_Reference\tTaxID\n"); // Write header
}

if (dodump) {
char buf[4096];
int at = 0;
char buf2[4096];

kstring_t *kstr = (kstring_t *)malloc(sizeof(kstring_t));
kstr->l = kstr->m = 0;
kstr->s = NULL;
BGZF *fp2 = getbgzf(acc2taxfile, "rb", 2);
bgzf_getline(fp2, '\n', kstr); // skip header
bgzf_getline(fp2, '\n', kstr); // Skip header
kstr->l = 0;
while (SIG_COND && bgzf_getline(fp2, '\n', kstr)) {
if (kstr->l == 0)
break;
// fprintf(stderr,"at: %d = '%s\'\n",at,kstr->s);
if (!((at++ % 100000)))
if (isatty(fileno(stderr)))
fprintf(stderr, "\r\t-> At linenr: %d in \'%s\' ", at, acc2taxfile);
if (!((at++ % 100000)) && isatty(fileno(stderr)))
fprintf(stderr, "\r\t-> At linenr: %d in \'%s\' ", at, acc2taxfile);
char *tok = strtok(kstr->s, "\t\n ");
char *key = strtok(NULL, "\t\n ");
tok = strtok(NULL, "\t\n ");
int val = atoi(tok);
// fprintf(stderr,"key: %d val: %d\n",key,val);exit(0);
int valinbam = bam_name2id(hdr, key);

if (valinbam == -1)
continue;
if(fp!=NULL){
assert(bgzf_write(fp, &valinbam, sizeof(int)) == sizeof(int));
assert(bgzf_write(fp, &val, sizeof(int)) == sizeof(int));
}
// fprintf(stderr,"key: %s val: %d valinbam:%d\n",key,val,valinbam);

if (fp != NULL) {
assert(bgzf_write(fp, &valinbam, sizeof(int)) == sizeof(int));
assert(bgzf_write(fp, &val, sizeof(int)) == sizeof(int));
}

if (am->find(valinbam) != am->end())
fprintf(stderr, "\t-> Duplicate entries found \'%s\'\n", key);
(*am)[valinbam] = val;

// Write to the filtered file if specified
if (filteredFile != Z_NULL) {
gzprintf(filteredFile, "%s\t%d\n", key, val);
}
kstr->l = 0;
}
bgzf_close(fp2);
Expand All @@ -290,11 +303,23 @@ int2int *bamRefId2tax(bam_hdr_t *hdr, char *acc2taxfile, char *bamfile, int2int
while (bgzf_read(fp, &valinbam, sizeof(int))) {
assert(bgzf_read(fp, &val, sizeof(int)) == sizeof(int));
(*am)[valinbam] = val;

// Write to the filtered file if specified
if (filteredFile != Z_NULL) {
const char *refname = hdr->target_name[valinbam];
gzprintf(filteredFile, "%s\t%d\n", refname, val);
}
}
}

// Close the filtered file if it was opened
if (filteredFile != Z_NULL) {
gzclose(filteredFile);
fprintf(stderr, "\n\t-> Filtered acc2taxfile saved to '%s'\n", filteredAcc2taxfile);
}

bgzf_close(fp);
fprintf(stderr, "\t-> Number of entries to use from accesion to taxid: %lu, time taken: %.2f sec\n", am->size(), (float)(time(NULL) - t));
fprintf(stderr, "\t-> Number of entries to use from accession to taxid: %lu, time taken: %.2f sec\n", am->size(), (float)(time(NULL) - t));
return am;
}

Expand Down
3 changes: 2 additions & 1 deletion shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ int2char parse_names(const char *fname);
void parse_nodes(const char *fname, int2char &rank, int2int &parent);
void strip(char *line);
void parse_nodes(const char *fname, int2char &rank, int2int &parent, int2intvec &child, int dochild);
int2int *bamRefId2tax(bam_hdr_t *hdr, char *acc2taxfile, char *bamfile, int2int &errmap, char *,int reallyDump);
int2int *bamRefId2tax(bam_hdr_t *hdr, char *acc2taxfile, char *bamfile, int2int &errmap, char *tempfolder, int reallyDump, char *filteredAcc2taxfile = nullptr);

int fexists(const char *str);
1 change: 1 addition & 0 deletions version.h
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#define METADAMAGE_VERSION "v0.4-94-gc110946-dirty"