Skip to content

Commit

Permalink
Added multiple overflow methods
Browse files Browse the repository at this point in the history
  • Loading branch information
Martinsos committed Jul 20, 2014
1 parent 8391f0d commit 1766765
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 36 deletions.
81 changes: 53 additions & 28 deletions src/Swimd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,14 +130,21 @@ void print_mmxxxi(__mxxxi mm) {
printf("%d ", unpacked[i]);
}

/**
* @param stopOnOverflow If true, function will stop when first overflow happens.
* If false, function will not stop but continue with next sequence.
*
*/
template<class SIMD>
static int searchDatabaseSW_(unsigned char query[], int queryLength,
unsigned char** db, int dbLength, int dbSeqLengths[],
int gapOpen, int gapExt, int* scoreMatrix, int alphabetLength,
int scores[], bool calculated[]) {
int scores[], bool calculated[], const int overflowMethod) {

const typename SIMD::type LOWER_BOUND = std::numeric_limits<typename SIMD::type>::min();
const typename SIMD::type UPPER_BOUND = std::numeric_limits<typename SIMD::type>::max();

bool overflowOccured = false; // True if overflow was detected at least once.

// ----------------------- CHECK ARGUMENTS -------------------------- //
// Check if Q, R or scoreMatrix have values too big for used score type
Expand Down Expand Up @@ -273,48 +280,62 @@ static int searchDatabaseSW_(unsigned char query[], int queryLength,
_mmxxx_store_si((__mxxxi*)unpackedMaxH, maxH);

// ------------------------ OVERFLOW DETECTION -------------------------- //
bool overflowDetected = false; // True if overflow was detected for this column.
bool overflowed[SIMD::numSeqs];
if (!SIMD::satArthm) {
// This check is based on following assumptions:
// - overflow wraps
// - Q, R and all scores from scoreMatrix are between LOWER_BOUND/2 and UPPER_BOUND/2 exclusive
typename SIMD::type unpackedOfTest[SIMD::numSeqs];
_mmxxx_store_si((__mxxxi*)unpackedOfTest, ofTest);
for (int i = 0; i < SIMD::numSeqs; i++)
if (currDbSeqsPos[i] != 0 && unpackedOfTest[i] <= LOWER_BOUND/2)
return SWIMD_ERR_OVERFLOW;
for (int i = 0; i < SIMD::numSeqs; i++) {
overflowed[i] = currDbSeqsPos[i] != 0 &&
unpackedOfTest[i] <= LOWER_BOUND / 2;
}
} else {
if (SIMD::negRange) {
// Since I use saturation, I check if minUlH_P was non negative
typename SIMD::type unpackedOfTest[SIMD::numSeqs];
_mmxxx_store_si((__mxxxi*)unpackedOfTest, ofTest);
for (int i = 0; i < SIMD::numSeqs; i++)
if (currDbSeqsPos[i] != 0 && unpackedOfTest[i] >= 0)
return SWIMD_ERR_OVERFLOW;
for (int i = 0; i < SIMD::numSeqs; i++) {
overflowed[i] = currDbSeqsPos[i] != 0 && unpackedOfTest[i] >= 0;
}
} else {
// I check if upper bound is reached
for (int i = 0; i < SIMD::numSeqs; i++)
if (currDbSeqsPos[i] != 0 && unpackedMaxH[i] == UPPER_BOUND) {
return SWIMD_ERR_OVERFLOW;
}
for (int i = 0; i < SIMD::numSeqs; i++) {
overflowed[i] = currDbSeqsPos[i] != 0 &&
unpackedMaxH[i] == UPPER_BOUND;
}
}
}
for (int i = 0; i < SIMD::numSeqs; i++) {
overflowDetected = overflowDetected || overflowed[i];
}
overflowOccured = overflowOccured || overflowDetected;
// In buckets method, we stop calculation when overflow is detected.
if (overflowMethod == SWIMD_OVERFLOW_BUCKETS && overflowDetected) {
return SWIMD_ERR_OVERFLOW;
}

// ---------------------------------------------------------------------- //

// --------------------- CHECK AND HANDLE SEQUENCE END ------------------ //
if (shortestDbSeqLength == columnsSinceLastSeqEnd) { // If at least one sequence ended
if (overflowDetected || shortestDbSeqLength == columnsSinceLastSeqEnd) { // If at least one sequence ended
shortestDbSeqLength = -1;
typename SIMD::type resetMask[SIMD::numSeqs] __attribute__((aligned(16)));

for (int i = 0; i < SIMD::numSeqs; i++) {
if (currDbSeqsPos[i] != 0) { // If not null sequence
currDbSeqsLengths[i] -= columnsSinceLastSeqEnd;
if (currDbSeqsLengths[i] == 0) { // If sequence ended
if (overflowed[i] || currDbSeqsLengths[i] == 0) { // If sequence ended
numEndedDbSeqs++;
// Save best sequence score
scores[currDbSeqsIdxs[i]] = unpackedMaxH[i];
if (SIMD::negRange)
scores[currDbSeqsIdxs[i]] -= LOWER_BOUND;
calculated[currDbSeqsIdxs[i]] = true;
if (!overflowed[i]) {
// Save score and mark as calculated
scores[currDbSeqsIdxs[i]] = unpackedMaxH[i];
if (SIMD::negRange)
scores[currDbSeqsIdxs[i]] -= LOWER_BOUND;
calculated[currDbSeqsIdxs[i]] = true;
}
// Load next sequence
loadNextSequence(nextDbSeqIdx, dbLength, currDbSeqsIdxs[i], currDbSeqsPos[i],
currDbSeqsLengths[i], db, dbSeqLengths, calculated, numEndedDbSeqs);
Expand Down Expand Up @@ -361,6 +382,9 @@ static int searchDatabaseSW_(unsigned char query[], int queryLength,
// ---------------------------------------------------------------------- //
}

if (overflowOccured) {
return SWIMD_ERR_OVERFLOW;
}
return 0;
}

Expand All @@ -384,12 +408,13 @@ static inline bool loadNextSequence(int &nextDbSeqIdx, int dbLength, int &currDb
}
}

extern int searchDatabaseSW(unsigned char query[], int queryLength,
static int searchDatabaseSW(unsigned char query[], int queryLength,
unsigned char** db, int dbLength, int dbSeqLengths[],
int gapOpen, int gapExt, int* scoreMatrix, int alphabetLength,
int scores[]) {
int scores[], const int overflowMethod) {
int resultCode = 0;
const int chunkSize = 1024;
// Do buckets only if using buckets overflow method.
const int chunkSize = overflowMethod == SWIMD_OVERFLOW_BUCKETS ? 1024 : dbLength;
bool* calculated = new bool[chunkSize];
for (int startIdx = 0; startIdx < dbLength; startIdx += chunkSize) {
unsigned char** db_ = db + startIdx;
Expand All @@ -401,17 +426,17 @@ extern int searchDatabaseSW(unsigned char query[], int queryLength,
resultCode = searchDatabaseSW_< SimdSW<char> >(query, queryLength,
db_, dbLength_, dbSeqLengths_,
gapOpen, gapExt, scoreMatrix, alphabetLength, scores_,
calculated);
if (resultCode != 0) {
calculated, overflowMethod);
if (resultCode == SWIMD_ERR_OVERFLOW) {
resultCode = searchDatabaseSW_< SimdSW<short> >(query, queryLength,
db_, dbLength_, dbSeqLengths_,
gapOpen, gapExt, scoreMatrix, alphabetLength, scores_,
calculated);
if (resultCode != 0) {
calculated, overflowMethod);
if (resultCode == SWIMD_ERR_OVERFLOW) {
resultCode = searchDatabaseSW_< SimdSW<int> >(query, queryLength,
db_, dbLength_, dbSeqLengths_,
gapOpen, gapExt, scoreMatrix, alphabetLength, scores_,
calculated);
calculated, overflowMethod);
if (resultCode != 0)
break;
}
Expand Down Expand Up @@ -821,7 +846,7 @@ static int searchDatabase(unsigned char query[], int queryLength,
extern int swimdSearchDatabase(unsigned char query[], int queryLength,
unsigned char** db, int dbLength, int dbSeqLengths[],
int gapOpen, int gapExt, int* scoreMatrix, int alphabetLength,
int scores[], const int mode) {
int scores[], const int mode, const int overflowMethod) {
#if !defined(__SSE4_1__) && !defined(__AVX2__)
return SWIMD_ERR_NO_SIMD_SUPPORT;
#else
Expand All @@ -839,7 +864,7 @@ extern int swimdSearchDatabase(unsigned char query[], int queryLength,
gapOpen, gapExt, scoreMatrix, alphabetLength, scores);
} else if (mode == SWIMD_MODE_SW) {
return searchDatabaseSW(query, queryLength, db, dbLength, dbSeqLengths,
gapOpen, gapExt, scoreMatrix, alphabetLength, scores);
gapOpen, gapExt, scoreMatrix, alphabetLength, scores, overflowMethod);
}
return SWIMD_ERR_INVALID_MODE;
#endif
Expand Down
17 changes: 16 additions & 1 deletion src/Swimd.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ extern "C" {
#define SWIMD_MODE_HW 1
#define SWIMD_MODE_OV 2
#define SWIMD_MODE_SW 3

// Overflow handling
#define SWIMD_OVERFLOW_SIMPLE 0
#define SWIMD_OVERFLOW_BUCKETS 1


/**
* Compares query sequence with each database sequence and returns similarity scores.
Expand Down Expand Up @@ -55,12 +60,22 @@ extern "C" {
* _DBSEQ_
* _QUERY_
* SWIMD_MODE_SW: local alignment (Smith-Waterman)
* @param [in] overflowMethod Method that defines behavior regarding overflows.
* SWIMD_OVERFLOW_SIMPLE: all sequences are first calculated using
* char precision, those that overflowed are then calculated using
* short precision, and those that overflowed again are calculated
* using integer precision.
* SWIMD_OVERFLOW_BUCKETS: database is divided into buckets and each
* bucket is calculated independently. When overflow occurs,
* calculation is resumed with higher precision for all
* following sequences in that bucket.
*
* @return 0 if all okay, error code otherwise.
*/
int swimdSearchDatabase(unsigned char query[], int queryLength,
unsigned char** db, int dbLength, int dbSeqLengths[],
int gapOpen, int gapExt, int* scoreMatrix, int alphabetLength,
int scores[], const int mode);
int scores[], const int mode, const int overflowMethod);

#ifdef __cplusplus
}
Expand Down
2 changes: 1 addition & 1 deletion src/swimd_aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ int main(int argc, char * const argv[]) {
clock_t start = clock();
int resultCode = swimdSearchDatabase(query, queryLength, db, dbLength, dbSeqLengths,
gapOpen, gapExt, scoreMatrix.getMatrix(), alphabetLength,
scores, modeCode);
scores, modeCode, SWIMD_OVERFLOW_BUCKETS);
clock_t finish = clock();
cpuTime += ((double)(finish-start))/CLOCKS_PER_SEC;
// ---------------------------------------------------------------------------- //
Expand Down
13 changes: 7 additions & 6 deletions src/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ int main(int argc, char * const argv[]) {
fillRandomly(query, queryLength, alphabetLength);

// Create random database
int dbLength = 500;
int dbLength = 200;
unsigned char * db[dbLength];
int dbSeqsLengths[dbLength];
for (int i = 0; i < dbLength; i++) {
Expand Down Expand Up @@ -89,7 +89,8 @@ int main(int argc, char * const argv[]) {
return 1;
}
resultCode = swimdSearchDatabase(query, queryLength, db, dbLength, dbSeqsLengths,
gapOpen, gapExt, scoreMatrix, alphabetLength, scores, modeCode);
gapOpen, gapExt, scoreMatrix, alphabetLength, scores,
modeCode, SWIMD_OVERFLOW_SIMPLE);
finish = clock();
double time1 = ((double)(finish-start))/CLOCKS_PER_SEC;
printf("Time: %lf\n", time1);
Expand All @@ -99,8 +100,8 @@ int main(int argc, char * const argv[]) {
exit(0);
}
// Print result
printf("Result: ");
printInts(scores, dbLength);
//printf("Result: ");
//printInts(scores, dbLength);
printf("Maximum: %d\n", maximum(scores, dbLength));


Expand All @@ -120,8 +121,8 @@ int main(int argc, char * const argv[]) {
printf("Time: %lf\n", time2);

// Print result
printf("Result: ");
printInts(scores2, dbLength);
//printf("Result: ");
//printInts(scores2, dbLength);
printf("Maximum: %d\n", maximum(scores2, dbLength));


Expand Down

2 comments on commit 1766765

@tolot27
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commit slows down the computation significantly. Although SWIMD_OVERFLOW_BUCKETS is faster then SWIMD_OVERFLOW_SIMPLE it is still slower than without this commit.

without commit:

Cpu time of searching: 1.721736
Read 208 database sequences in total.

with SWIMD_OVERFLOW_BUCKETS:

Cpu time of searching: 2.062271
Read 208 database sequences in total.

with SWIMD_OVERFLOW_SIMPLE:

Cpu time of searching: 2.393488
Read 208 database sequences in total.

Maybe it is not very representative, but my usual test case with a query sequence of 6845 residues and a database fasta file with 208 sequences and 231193 residues total.

Thats very tiny but the slow down is significant.

I'll report more tests soon after fixing #9 .

@Martinsos
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tolot27 Ah yes, this is true! I noticed the slowdown, however it was important was somebody to have this feature and I did not have time to fix this, so I left it as it is for now. I also thought that maybe my data was not representative enough. I did a quick investigation but could find no obvious reason for this slowdown! I should fix this as soon as possible, that is for sure. I will open an issue with this and put high priority.

Please sign in to comment.