Skip to content

Commit

Permalink
Allow references with up to 2**31 contigs
Browse files Browse the repository at this point in the history
This changes the RefRandstrobe struct in the following way.

Previously, bits were allocated in the following way:
- 40 bits base hash
- 24 bits auxiliary hash
- 32 bits position
- 23 bits reference/contig index
- 8 bits strobe2 offset
- 1 bit first_strobe_is_main flag

We take 8 bits away from the auxiliary hash and use those bits for the reference
index. New layout:

- 40 bits base hash
- 16 bits auxiliary hash
- 8 bits strobe2 offset
- 32 bits position
- 31 bits reference/contig index
- 1 bit first_strobe_is_main flag

Closes #421
  • Loading branch information
marcelm committed Oct 22, 2024
1 parent 5e39341 commit dc955af
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 20 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
This feature is currently *experimental* and only enabled when using the
`--mcs` command-line option.
Contributed by Ivan Tolstoganov (@Itolstoganov).
* #421: Allow references with up to 2^31 contigs (instead of 2^23
previously) by changing the way randstrobes are stored in the index.

## v0.14.0 (2024-10-03)

Expand All @@ -29,6 +31,8 @@
introduction of multi-context seeds.
Pre-generated index files (`.sti` files) therefore need to be re-generated.
(Strobealign will complain if you try to use an outdated index file.)
This reduces the maximum number of contigs allowed in a reference from 2^24
to 2^23.

## v0.13.0 (2024-03-04)

Expand Down
2 changes: 1 addition & 1 deletion src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include <sstream>

static Logger& logger = Logger::get();
static const uint32_t STI_FILE_FORMAT_VERSION = 3;
static const uint32_t STI_FILE_FORMAT_VERSION = 4;


namespace {
Expand Down
2 changes: 1 addition & 1 deletion src/randstrobes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ static inline randstrobe_hash_t randstrobe_hash(syncmer_hash_t hash1, syncmer_ha
if (hash1 > hash2) {
std::swap(hash1, hash2);
}
return ((hash1 >> aux_len) << aux_len) ^ (hash2 >> (64 - aux_len));
return (((hash1 >> aux_len) << aux_len) ^ (hash2 >> (64 - aux_len))) & RANDSTROBE_HASH_MASK;
}

std::ostream& operator<<(std::ostream& os, const Syncmer& syncmer) {
Expand Down
34 changes: 16 additions & 18 deletions src/randstrobes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,54 +17,52 @@
using syncmer_hash_t = uint64_t;
using randstrobe_hash_t = uint64_t;

// Selects the upper 56 bits of a 64-bit word. RefRandstrobe keeps the
// randstrobe hash in those bits and the strobe2 offset in the low 8 bits.
static constexpr uint64_t RANDSTROBE_HASH_MASK = 0xFFFFFFFFFFFFFF00;

// A randstrobe of the reference, packed into one 64-bit and two 32-bit words:
//
//   m_hash_and_offset         bits 63..8: hash (masked with RANDSTROBE_HASH_MASK)
//                             bits  7..0: strobe2 offset
//   position                  32 bits
//   m_ref_index_and_is_first  bits 31..1: reference (contig) index
//                             bit      0: first_strobe_is_main flag
struct RefRandstrobe {
private:
    randstrobe_hash_t m_hash_and_offset;
public:
    uint32_t position;
private:
    // packed representation of ref_index and first_strobe_is_main
    uint32_t m_ref_index_and_is_first;
public:

    // Leaves all members uninitialized.
    RefRandstrobe() { }

    // Packs the given values.
    //
    // The low 8 bits of `hash` are discarded to make room for `offset`.
    // `ref_index` must not exceed max_number_of_references; its top bit would
    // otherwise be shifted out silently.
    RefRandstrobe(randstrobe_hash_t hash, uint32_t position, uint32_t ref_index, uint8_t offset, bool first_strobe_is_main)
        : m_hash_and_offset((hash & RANDSTROBE_HASH_MASK) | offset)
        , position(position)
        , m_ref_index_and_is_first((ref_index << 1) | static_cast<uint32_t>(first_strobe_is_main)) { }

    bool operator< (const RefRandstrobe& other) const {
        // Compare all fields (hash, offset, position, reference index, flag)
        // to ensure that the order of the RefRandstrobes in the index is
        // reproducible no matter which sorting function is used. This
        // branchless comparison is faster than the equivalent one using
        // std::tie.
        //
        // Both keys must be built from their own object's fields; taking the
        // low word of rhs from *this would make entries that differ only in
        // reference index compare as equivalent.
        __uint128_t lhs = (static_cast<__uint128_t>(m_hash_and_offset) << 64)
            | ((static_cast<uint64_t>(position) << 32) | m_ref_index_and_is_first);
        __uint128_t rhs = (static_cast<__uint128_t>(other.m_hash_and_offset) << 64)
            | ((static_cast<uint64_t>(other.position) << 32) | other.m_ref_index_and_is_first);
        return lhs < rhs;
    }

    // Returns the flag stored in the lowest bit of the packed word.
    bool first_strobe_is_main() const {
        return m_ref_index_and_is_first & 1;
    }

    // Returns the 31-bit reference index stored above the flag bit.
    unsigned reference_index() const {
        return m_ref_index_and_is_first >> 1;
    }

    // Returns the 8-bit strobe2 offset stored in the low byte of the hash word.
    unsigned strobe2_offset() const {
        return m_hash_and_offset & 0xff;
    }

    // Returns the hash with its low 8 bits cleared.
    randstrobe_hash_t hash() const {
        return m_hash_and_offset & RANDSTROBE_HASH_MASK;
    }

public:
    // Largest value that fits into the 31 bits reserved for the reference index.
    static constexpr uint32_t max_number_of_references = (1u << 31) - 1;
};

struct QueryRandstrobe {
Expand Down

0 comments on commit dc955af

Please sign in to comment.