Skip to content

Commit

Permalink
First working bam version
Browse files Browse the repository at this point in the history
  • Loading branch information
pmelsted committed Feb 26, 2015
1 parent 6d7674b commit 4dc4271
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 9 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ ENDIF(CMAKE_BUILD_TYPE MATCHES Debug)
# add_compile_options(-Wdeprecated-register)

add_subdirectory(src)
include_directories(${EXT_PROJECTS_DIR})

if (BUILD_TESTING)
add_subdirectory(${EXT_PROJECTS_DIR}/catch)
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ file(GLOB headers *.h *.hpp)

list(REMOVE_ITEM sources main.cpp)

include_directories(../ext/)
add_compile_options(-DSEQAN_HAS_ZLIB)
add_library(kallisto_core ${sources} ${headers})
target_include_directories(kallisto_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

Expand Down
11 changes: 7 additions & 4 deletions src/MinCollector.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ struct MinCollector {



void collect(std::vector<std::pair<int,int>>& v) {
int collect(std::vector<std::pair<int,int>>& v) {
if (v.empty()) {
return;
return -1;
}
sort(v.begin(), v.end()); // sort by increasing order

Expand All @@ -37,7 +37,7 @@ struct MinCollector {
}
// if u is empty do nothing
if (u.empty()) {
return;
return -1;
}

// find the range of support
Expand All @@ -50,20 +50,23 @@ struct MinCollector {
}

if ((maxpos-minpos + k) < min_range) {
return;
return -1;
}

auto search = index.ecmapinv.find(u);
if (search != index.ecmapinv.end()) {
// ec class already exists, update count
++counts[search->second];
return search->second;
} else {
// new ec class, update the index and count
auto necs = counts.size();
index.ecmap.insert({necs,u});
index.ecmapinv.insert({u,necs});
counts.push_back(1);
return necs;
}

}

void write(std::ostream& o) {
Expand Down
200 changes: 196 additions & 4 deletions src/ProcessReads.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,21 @@
#include "kseq.h"
#include <string>
#include <vector>
#include <unordered_map>
#include <algorithm>

#include <iostream>
#include <fstream>

#include <seqan/bam_io.h>


#include "common.h"

using namespace seqan;

template<typename Index, typename TranscriptCollector>
TranscriptCollector ProcessReads(Index& index, const ProgramOptions& opt) {
void ProcessReads(Index& index, const ProgramOptions& opt, TranscriptCollector &tc) {

// need to receive an index map
std::ios_base::sync_with_stdio(false);
Expand All @@ -31,7 +35,6 @@ TranscriptCollector ProcessReads(Index& index, const ProgramOptions& opt) {
int l1,l2; // length of read
size_t nreads = 0;

TranscriptCollector tc(index, opt);

// for each file

Expand Down Expand Up @@ -72,7 +75,7 @@ TranscriptCollector ProcessReads(Index& index, const ProgramOptions& opt) {
}

// collect the transcript information
tc.collect(v);
int ec = tc.collect(v);
if (opt.verbose && nreads % 10000 == 0 ) {
std::cerr << "Processed " << nreads << std::endl;
}
Expand All @@ -93,8 +96,197 @@ TranscriptCollector ProcessReads(Index& index, const ProgramOptions& opt) {
of.open(outfile.c_str(), std::ios::out);
tc.write(of);
of.close();
}

template<typename Index, typename TranscriptCollector>
void ProcessBams(Index& index, const ProgramOptions& opt, TranscriptCollector& tc) {

// need to receive an index map
std::ios_base::sync_with_stdio(false);
std::vector<std::pair<int,int>> v;
v.reserve(1000);


std::vector<int> rIdToTrans;

int l1,l2; // length of read
size_t nreads = 0;

BamFileIn bamFileIn(opt.files[0].c_str());
/*
if (!open(bamFileIn, opt.files[0].c_str())) {
std::cerr << "ERROR: Could not open bamfile " << opt.files[0] << std::endl;
exit(1);
}*/

BamHeader header;
readHeader(header, bamFileIn);

//typedef FormattedFileContext<BamFileIn, void>::Type TBamContext;

///BamIOContext<StringSet<CharString>>
auto const & bamContext = context(bamFileIn);

{
std::unordered_map<std::string, int> inv;
for (int i = 0; i < index.target_names_.size(); i++) {
inv.insert({index.target_names_[i], i});
}
auto cnames = contigNames(bamContext);
for (int i = 0; i < length(cnames[i]); i++) {
std::string cname(toCString(cnames[i]));
auto search = inv.find(cname);
if (search == inv.end()) {
std::cerr << "Error: could not find transcript name " << cname << " from bam file " << opt.files[0] << std::endl;
} else {
rIdToTrans.push_back(search->second);
}
}
}




// for each read
BamAlignmentRecord record;
if (atEnd(bamFileIn)) {
std::cerr << "Warning: Empty bam file " << opt.files[0] << std::endl;
return;
}
readRecord(record, bamFileIn);
std::vector<int> p;
CharString s1,s2;
CharString lastName;
bool done = false;

int mismatches = 0;
int alignKnotB = 0;
int alignBnotK = 0;
int exactMatches = 0;
int alignNone = 0;

while (!done) {
bool paired = hasFlagMultiple(record);
p.clear();
clear(s1);
clear(s2);

// we haven't processed record yet
while (true) {
// collect sequence
if (hasFlagFirst(record) && empty(s1)) {
s1 = record.seq;
if (hasFlagRC(record)) {
reverseComplement(s1);
}
} else if (paired && hasFlagLast(record) && empty(s2)) {
s2 = record.seq;
if (hasFlagRC(record)) {
reverseComplement(s2);
}
} else {
if (empty(s1) && empty(s2)) {
std::cerr << "Warning: weird sequence in bam file " << opt.files[0] << std::endl;
}
}


// gather alignment info
if (!hasFlagUnmapped(record)) {
p.push_back(rIdToTrans[record.rID]);
}

// are there more reads?
if (atEnd(bamFileIn)) {
done = true;
break;
}

// look at next read
lastName = record.qName;
readRecord(record, bamFileIn);
if (lastName != record.qName) {
break; // process the record next time
}
}

if (empty(s1) || empty(s2)) {
std::cerr << "Warning: only one read is present " << std::endl << s1 << std::endl << s2 << std::endl;
}
nreads++;



v.clear();
// process read
index.match(toCString(s1), length(s1), v);
if (paired) {
int vl = v.size();
index.match(toCString(s2), length(s2), v);
// fix k-mer positions, assuming an average
// fragment length distribution of fld.
int leftpos = ((int) opt.fld)-opt.k;
for (int i = vl; i < v.size(); i++) {
v[i].second = std::max(leftpos-v[i].second, 0);
}
}

// collect the transcript information
int ec = tc.collect(v);

if (opt.verbose && nreads % 10000 == 0 ) {
std::cerr << "Processed " << nreads << std::endl;
}

sort(p.begin(), p.end());
std::vector<int> lp;
if (!p.empty()) {
lp.push_back(p[0]);
for (int i = 1; i < p.size(); i++) {
if (p[i-1] != p[i]) {
lp.push_back(p[i]);
}
}
}

if (!lp.empty()) {
auto find = index.ecmapinv.find(lp);
if (find != index.ecmapinv.end()) {
if (find->second != ec) {
// mismatch
if (ec == -1) {
alignBnotK++;
} else {
mismatches++;
}
} else {
exactMatches++;
}
}
} else {
if (ec >= 0) {
alignKnotB++;
} else {
alignNone++;
}
}
}

std::cout << "Aligned " << nreads << std::endl
<< "exact matches = " << exactMatches << std::endl
<< "Kallisto mapped, not BAM = " << alignKnotB << std::endl
<< "Bam mapped, not Kallisto = " << alignBnotK << std::endl
<< "Both mapped, mismatches = " << mismatches << std::endl
<< "Neither mapped = " << alignNone << std::endl;

// writeoutput to outdir
std::string outfile = opt.output + "/counts.txt"; // figure out filenaming scheme
std::ofstream of;
of.open(outfile.c_str(), std::ios::out);
tc.write(of);
of.close();

return tc;
return;
}


Expand Down
8 changes: 7 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,13 @@ int main(int argc, char *argv[]) {
// run the em algorithm
KmerIndex index(opt);
index.load(opt);
auto collection = ProcessReads<KmerIndex, MinCollector<KmerIndex>>(index, opt);
auto firstFile = opt.files[0];
MinCollector<KmerIndex> collection(index, opt);
if (firstFile.size() >= 4 && firstFile.compare(firstFile.size()-4,4,".bam") == 0) {
ProcessBams<KmerIndex, MinCollector<KmerIndex>>(index, opt, collection);
} else {
ProcessReads<KmerIndex, MinCollector<KmerIndex>>(index, opt, collection);
}
// save modified index for future use
index.write((opt.output+"/index.saved"), false);
auto eff_lens = calc_eff_lens(index.trans_lens_, opt.fld);
Expand Down

0 comments on commit 4dc4271

Please sign in to comment.