Skip to content

Commit

Permalink
add functions for finding the previous letter while backtracing
Browse files Browse the repository at this point in the history
  • Loading branch information
Sawwave committed Feb 26, 2024
1 parent c138e58 commit ef2104e
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 16 deletions.
35 changes: 35 additions & 0 deletions src/AwFmIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,41 @@ enum AwFmReturnCode awFmGetLocalSequencePositionFromIndexPosition(const struct A
size_t globalPosition, size_t *sequenceNumber, size_t *localSequencePosition);


/*
* Function: awFmNucleotideBacktraceReturnPreviousLetterIndex
* --------------------
* Backtraces the given position in the bwt, and returns the the previous character to that position.
* The given BWT position will then be updated to refer to the position of the returned previous character.
*
* Inputs:
* index: Pointer to the valid AwFmIndex struct.
* bwtPosition: A valid position in the fm index
*
* Returns:
* Letter index of the character found at the given position in the bwt. In other words,
* the letter previous in the sequence to the given position.
*/
uint8_t awFmNucleotideBacktraceReturnPreviousLetterIndex(const struct AwFmIndex *_RESTRICT_ const index,
uint64_t *bwtPosition);


/*
* Function: awFmAminoBacktraceReturnPreviousLetterIndex
* --------------------
* Backtraces the given position in the bwt, and returns the the previous character to that position.
* The given BWT position will then be updated to refer to the position of the returned previous character.
*
* Inputs:
* index: Pointer to the valid AwFmIndex struct.
* bwtPosition: A valid position in the fm index
*
* Returns:
* Letter index of the character found at the given position in the bwt. In other words,
* the letter previous in the sequence to the given position.
*/
uint8_t awFmAminoBacktraceReturnPreviousLetterIndex( const struct AwFmIndex *_RESTRICT_ const index,
uint64_t *bwtPosition);

/*
* Function: awFmGetHeaderStringFromSequenceNumber
* --------------------
Expand Down
76 changes: 60 additions & 16 deletions src/AwFmSearch.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,11 +296,11 @@ bool awFmSingleKmerExists(


inline size_t awFmNucleotideBacktraceBwtPosition(
const struct AwFmIndex *_RESTRICT_ const index, const uint64_t bwtPosition) {
const uint64_t *prefixSums = index->prefixSums;
const uint64_t blockIndex = awFmGetBlockIndexFromGlobalPosition(bwtPosition);
const struct AwFmIndex *_RESTRICT_ const index, const uint64_t bwtPosition) {

const uint64_t *prefixSums = index->prefixSums;
const uint64_t blockIndex = awFmGetBlockIndexFromGlobalPosition(bwtPosition);
const uint8_t localQueryPosition = awFmGetBlockQueryPositionFromGlobalPosition(bwtPosition);

const struct AwFmNucleotideBlock *_RESTRICT_ const blockPtr = &index->bwtBlockList.asNucleotide[blockIndex];
const uint8_t letterIndex = awFmGetNucleotideLetterAtBwtPosition(blockPtr, localQueryPosition);

Expand All @@ -309,40 +309,84 @@ inline size_t awFmNucleotideBacktraceBwtPosition(
return 0;
}

const uint64_t baseOccurrence = blockPtr->baseOccurrences[letterIndex];
const uint64_t baseOccurrence = blockPtr->baseOccurrences[letterIndex];
const AwFmSimdVec256 occurrenceVector = awFmMakeNucleotideOccurrenceVector(blockPtr, letterIndex);


uint16_t vectorPopcount = AwFmMaskedVectorPopcount(occurrenceVector, localQueryPosition);
uint16_t vectorPopcount = AwFmMaskedVectorPopcount(occurrenceVector, localQueryPosition);
uint64_t backtraceBwtPosition = prefixSums[letterIndex] + baseOccurrence + vectorPopcount - 1;

return backtraceBwtPosition;
}


inline size_t awFmAminoBacktraceBwtPosition(const struct AwFmIndex *_RESTRICT_ const index, const uint64_t bwtPosition) {
const uint64_t *prefixSums = index->prefixSums;
const uint64_t blockIndex = awFmGetBlockIndexFromGlobalPosition(bwtPosition);
const uint8_t localQueryPosition = awFmGetBlockQueryPositionFromGlobalPosition(bwtPosition);
inline size_t awFmAminoBacktraceBwtPosition(
const struct AwFmIndex *_RESTRICT_ const index, const uint64_t bwtPosition) {

const uint64_t *prefixSums = index->prefixSums;
const uint64_t blockIndex = awFmGetBlockIndexFromGlobalPosition(bwtPosition);
const uint8_t localQueryPosition = awFmGetBlockQueryPositionFromGlobalPosition(bwtPosition);
const struct AwFmAminoBlock *_RESTRICT_ const blockPtr = &index->bwtBlockList.asAmino[blockIndex];
uint8_t letterIndex = awFmGetAminoLetterAtBwtPosition(blockPtr, localQueryPosition);
uint8_t letterIndex = awFmGetAminoLetterAtBwtPosition(blockPtr, localQueryPosition);

// if we encountered the sentinel, we know the position and can stop backtracing
if(__builtin_expect(letterIndex == 21, 0)) {
return 0;
}

AwFmSimdVec256 occurrenceVector = awFmMakeAminoAcidOccurrenceVector(blockPtr, letterIndex);
const uint64_t baseOccurrence = blockPtr->baseOccurrences[letterIndex];

const uint16_t vectorPopcount = AwFmMaskedVectorPopcount(occurrenceVector, localQueryPosition);
const uint64_t baseOccurrence = blockPtr->baseOccurrences[letterIndex];
const uint16_t vectorPopcount = AwFmMaskedVectorPopcount(occurrenceVector, localQueryPosition);
const uint64_t backtraceBwtPosition = prefixSums[letterIndex] + baseOccurrence + vectorPopcount - 1;

return backtraceBwtPosition;
}


inline uint8_t awFmNucleotideBacktraceReturnPreviousLetterIndex(
const struct AwFmIndex *_RESTRICT_ const index, uint64_t *bwtPosition) {

const uint64_t *prefixSums = index->prefixSums;
const uint64_t blockIndex = awFmGetBlockIndexFromGlobalPosition(*bwtPosition);
const uint8_t localQueryPosition = awFmGetBlockQueryPositionFromGlobalPosition(*bwtPosition);
const struct AwFmNucleotideBlock *_RESTRICT_ const blockPtr = &index->bwtBlockList.asNucleotide[blockIndex];
const uint8_t letterIndex = awFmGetNucleotideLetterAtBwtPosition(blockPtr, localQueryPosition);

// if we encountered the sentinel, we know the position and can stop backtracing
if(__builtin_expect(letterIndex == 5, 0)) {
return 0;
}

const uint64_t baseOccurrence = blockPtr->baseOccurrences[letterIndex];
const AwFmSimdVec256 occurrenceVector = awFmMakeNucleotideOccurrenceVector(blockPtr, letterIndex);
uint16_t vectorPopcount = AwFmMaskedVectorPopcount(occurrenceVector, localQueryPosition);
*bwtPosition = prefixSums[letterIndex] + baseOccurrence + vectorPopcount - 1;

return letterIndex;
}


inline uint8_t awFmAminoBacktraceReturnPreviousLetterIndex(
const struct AwFmIndex *_RESTRICT_ const index, uint64_t *bwtPosition) {

const uint64_t *prefixSums = index->prefixSums;
const uint64_t blockIndex = awFmGetBlockIndexFromGlobalPosition(*bwtPosition);
const uint8_t localQueryPosition = awFmGetBlockQueryPositionFromGlobalPosition(*bwtPosition);
const struct AwFmAminoBlock *_RESTRICT_ const blockPtr = &index->bwtBlockList.asAmino[blockIndex];
uint8_t letterIndex = awFmGetAminoLetterAtBwtPosition(blockPtr, localQueryPosition);

// if we encountered the sentinel, we know the position and can stop backtracing
if(__builtin_expect(letterIndex == 21, 0)) {
return 0;
}

AwFmSimdVec256 occurrenceVector = awFmMakeAminoAcidOccurrenceVector(blockPtr, letterIndex);
const uint64_t baseOccurrence = blockPtr->baseOccurrences[letterIndex];
const uint16_t vectorPopcount = AwFmMaskedVectorPopcount(occurrenceVector, localQueryPosition);
*bwtPosition = prefixSums[letterIndex] + baseOccurrence + vectorPopcount - 1;

return letterIndex;
}


inline void awFmNucleotideNonSeededSearch(const struct AwFmIndex *_RESTRICT_ const index, const char *_RESTRICT_ const kmer,
const uint8_t kmerLength, struct AwFmSearchRange *range) {

Expand Down

0 comments on commit ef2104e

Please sign in to comment.