
Commit 40f5550

1. precision/recall improvements 2. removed extra syncthreads 3. other small changes
1 parent 9413bf8 commit 40f5550

File tree

2 files changed (+34, -62 lines)


cudamapper/src/chainer_utils.cu

Lines changed: 1 addition & 15 deletions
@@ -153,9 +153,6 @@ __global__ void calculate_tiles_per_read(const std::int32_t* lengths,
     }
 }

-// TODO VI: threads may be overwriting each other.
-// because the number of queries is greater than number of tiles, each thread here has to write the query starts
-// for multiple tiles
 __global__ void calculate_tile_starts(const std::int32_t* query_starts,
                                       const std::int32_t* tiles_per_query,
                                       std::int32_t* tile_starts,
@@ -164,27 +161,16 @@ __global__ void calculate_tile_starts(const std::int32_t* query_starts,
                                       const std::int32_t* tiles_per_query_up_to_point)
 {
     int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int32_t stride = blockDim.x * gridDim.x;
     if (d_thread_id < num_queries)
     {
         // for each tile, we look up the query it corresponds to and offset it by the which tile in the query
         // we're at multiplied by the total size of the tile
-        // TODO VI: this memory access pattern seems a little off? Thread i and thread i+1 would overwrite eachother, no?
         for (int i = 0; i < tiles_per_query[d_thread_id]; i++)
         {
             //finds the offset in the ragged array and offsets to find start of "next" sub array
             tile_starts[tiles_per_query_up_to_point[d_thread_id] + i] = query_starts[d_thread_id] + (i * tile_size);
         }
     }
-    //int counter = 0;
-    //for (int i =0; i < num_queries; i++)
-    //{
-    //    for (int j = 0; j < tiles_per_query[i]; j++)
-    //    {
-    //        tile_starts[counter] = query_starts[i] + (j * tile_size);
-    //        counter++;
-    //    }
-    //}
 }

 void encode_anchor_query_locations(const Anchor* anchors,
@@ -454,4 +440,4 @@ void encode_overlap_query_target_pairs(Overlap* overlaps,
 } // namespace chainerutils
 } // namespace cudamapper
 } // namespace genomeworks
-} // namespace claraparabricks
+} // namespace claraparabricks
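
For context on the tile_starts layout that calculate_tile_starts writes, the same indexing can be mirrored on the host to sanity-check the kernel output. The sketch below is not part of the commit; it assumes tiles_per_query_up_to_point is the exclusive prefix sum of tiles_per_query (as the kernel's ragged-array comment implies), and the helper name reference_tile_starts is hypothetical.

// Hypothetical host-side reference for the layout written by calculate_tile_starts (not from the commit).
// Tile i of query q starts at query_starts[q] + i * tile_size and is stored at flat index
// tiles_per_query_up_to_point[q] + i of the ragged tile_starts array.
#include <cstdint>
#include <numeric>
#include <vector>

std::vector<std::int32_t> reference_tile_starts(const std::vector<std::int32_t>& query_starts,
                                                const std::vector<std::int32_t>& tiles_per_query,
                                                const std::int32_t tile_size)
{
    // exclusive prefix sum: the assumed meaning of tiles_per_query_up_to_point
    std::vector<std::int32_t> tiles_up_to_point(tiles_per_query.size(), 0);
    std::exclusive_scan(tiles_per_query.begin(), tiles_per_query.end(), tiles_up_to_point.begin(), 0);

    const std::int32_t total_tiles = std::accumulate(tiles_per_query.begin(), tiles_per_query.end(), 0);
    std::vector<std::int32_t> tile_starts(total_tiles);
    for (std::size_t q = 0; q < query_starts.size(); ++q)
    {
        for (std::int32_t i = 0; i < tiles_per_query[q]; ++i)
        {
            // same expression as the kernel, with one thread per query replaced by a serial loop
            tile_starts[tiles_up_to_point[q] + i] = query_starts[q] + i * tile_size;
        }
    }
    return tile_starts;
}

Comparing this against a device-to-host copy of tile_starts is one way to confirm that dropping the unused stride variable and the overwrite TODOs above does not change the result.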

cudamapper/src/overlapper_minimap.cu

Lines changed: 33 additions & 47 deletions
@@ -314,7 +314,6 @@ __device__ __forceinline__ int32_t fast_approx_log2(const int32_t val)
     return 8;
 }

-// TODO VI: This may need to be fixed at some point. Likely the last line
 __device__ __forceinline__ int32_t log_linear_anchor_weight(const Anchor& a,
                                                             const Anchor& b,
                                                             const int32_t word_size,
@@ -327,7 +326,7 @@ __device__ __forceinline__ int32_t log_linear_anchor_weight(const Anchor& a,
         return NEGATIVE_INT32_INFINITY;

     int32_t b_query_pos = b.query_position_in_read_ + word_size;
-    int32_t b_target_pos = b.target_position_in_read_;
+    int32_t b_target_pos = static_cast<int32_t>(b.target_position_in_read_);
     if (b_target_pos < a.target_position_in_read_)
     {
         b_target_pos -= word_size;
@@ -337,7 +336,8 @@ __device__ __forceinline__ int32_t log_linear_anchor_weight(const Anchor& a,
         b_target_pos += word_size;
     }

-    int32_t x_dist = abs(int(b_target_pos) - int(a.target_position_in_read_));
+    //int32_t x_dist = abs(b_target_pos - int(a.target_position_in_read_)); // we might have some type issues here?
+    int32_t x_dist = abs(b_target_pos - int(a.target_position_in_read_)); // we might have some type issues here?

     if (x_dist > max_dist || x_dist == 0)
         return NEGATIVE_INT32_INFINITY;
@@ -389,7 +389,8 @@ __device__ __forceinline__ int32_t log_linear_anchor_weight(const Anchor& a,
 /// \param max_bandwidth
 /// \return __global__

-// TODO VI: each block operates on a single tile
+// Each block operates on a single tile
+// TODO VI: Do we need to pad the anchors array (with null tiles) so it's tile aligned? We may be missing out on the anchors in the last tile
 __global__ void chain_anchors_in_block(const Anchor* anchors,
                                        double* scores,
                                        int32_t* predecessors,
@@ -407,41 +408,41 @@ __global__ void chain_anchors_in_block(const Anchor* anchors,
     int32_t thread_id_in_block = threadIdx.x; // Equivalent to "j." Represents the end of a sliding window.

     // figure out which tile I am currently working on
-    int32_t batch_block_id = batch_id * BLOCK_COUNT + block_id; // TODO VI: not sure if this is correct...? I think we want this instead of tile size
+    int32_t batch_block_id = batch_id * BLOCK_COUNT + block_id;
     if (batch_block_id < num_query_tiles)
     {

         // All threads in the block will get the same tile_start number
         int32_t tile_start = tile_starts[batch_block_id];

         // write is the leftmost index? global_read_index is offset by my thread_id
-        // revist this part
+        // initialize as the 0th anchor in the tile and the "me" anchor
         int32_t global_write_index = tile_start;
-        int32_t global_read_index = tile_start + thread_id_in_block; // this is the right-most index of sliding window
+        int32_t global_read_index = tile_start + thread_id_in_block;
         // printf("%d %d %d %d\n", block_id, global_write_index, global_read_index, tile_start);
         if (global_write_index < num_anchors)
         {
             __shared__ Anchor block_anchor_cache[PREDECESSOR_SEARCH_ITERATIONS];
-            // I DON'T KNOW WHAT THIS IS FOR
             __shared__ bool block_max_select_mask[PREDECESSOR_SEARCH_ITERATIONS];
             __shared__ int32_t block_score_cache[PREDECESSOR_SEARCH_ITERATIONS];
             __shared__ int32_t block_predecessor_cache[PREDECESSOR_SEARCH_ITERATIONS];

             // Initialize the local caches
-            block_anchor_cache[thread_id_in_block] = anchors[global_read_index];
-            // I _believe_ some or most of these will be 0
-            // not sure why we downcast to integer here
-            block_score_cache[thread_id_in_block] = static_cast<int32_t>(scores[global_read_index]);
-            // I _believe some or most of these will be -1 at first
+            // load the current anchor I'm on and its associated data
+            block_anchor_cache[thread_id_in_block] = anchors[global_read_index];
+            block_score_cache[thread_id_in_block] = static_cast<int32_t>(scores[global_read_index]);
             block_predecessor_cache[thread_id_in_block] = predecessors[global_read_index];
-            // Still not sure what this is for
-            block_max_select_mask[thread_id_in_block] = false;
+            block_max_select_mask[thread_id_in_block] = false;

             // iterate through the tile
-            for (int32_t i = PREDECESSOR_SEARCH_ITERATIONS, counter = 0; counter < batch_size; ++counter, ++i)
+            // VI: We do iterate i throughout
+            // Do we go over to the next tile here...? Is that ok?
+            // Answer to the above is it should be fine (maybe)
+            for (int32_t i = PREDECESSOR_SEARCH_ITERATIONS, counter = 0; counter < batch_size; counter++, i++)
             {
                 __syncthreads();
-                Anchor possible_successor_anchor = block_anchor_cache[i % PREDECESSOR_SEARCH_ITERATIONS];
+                // on the first iteration, every thread looks at the 0th anchor
+                const Anchor possible_successor_anchor = block_anchor_cache[i % PREDECESSOR_SEARCH_ITERATIONS];
                 int32_t current_score = block_score_cache[i % PREDECESSOR_SEARCH_ITERATIONS];
                 int32_t current_pred = block_predecessor_cache[i % PREDECESSOR_SEARCH_ITERATIONS];
                 bool current_mask = block_max_select_mask[i % PREDECESSOR_SEARCH_ITERATIONS];
@@ -453,45 +454,31 @@ __global__ void chain_anchors_in_block(const Anchor* anchors,
                 }
                 __syncthreads();

-                if (thread_id_in_block == i % PREDECESSOR_SEARCH_ITERATIONS && global_read_index + i < num_anchors)
+                // I think this is the thread at the LHS of the window
+                if ((thread_id_in_block == (i % PREDECESSOR_SEARCH_ITERATIONS)) && (global_read_index + i < num_anchors))
                 {
-                    // Implies that the thread is at the right_side (head, front) of a window
+                    // Implies that the thread is at the left_side (head, front of FIFO queue) of a window
                     // Read in the anchor, score, and predecessor of the next anchor in memory.
-                    block_anchor_cache[thread_id_in_block] = anchors[global_read_index + i];
+                    // I think we load if we're at the LHS so we want to get the (threadIdx + 64)th anchor
+                    block_anchor_cache[thread_id_in_block] = anchors[global_write_index + i];
                     block_score_cache[thread_id_in_block] = 0;
                     block_predecessor_cache[thread_id_in_block] = -1;
                     block_max_select_mask[thread_id_in_block] = false;
-                    //block_score_cache[thread_id_in_block] = scores[global_read_index + i];
-                    //block_predecessor_cache[thread_id_in_block] = predecessors[global_read_index + i];
-                    //block_max_select_mask[thread_id_in_block] = anchor_select_mask[global_read_index + i];
                 }

                 __syncthreads();
-                // Calculate score
+                // Calculate score, order matters here
                 int32_t marginal_score = log_linear_anchor_weight(possible_successor_anchor, block_anchor_cache[thread_id_in_block], word_size, max_distance, max_bandwidth);
-                // printf("%d %d | %d %d : %d %d : %d | %d %d %d %d | %d %d %d %d\n",
-                //        thread_id_in_block, (i % PREDECESSOR_SEARCH_ITERATIONS), counter,
-                //        i,
-                //        current_score,
-                //        marginal_score,
-                //        block_score_cache[thread_id_in_block],
-                //        block_anchor_cache[thread_id_in_block].query_read_id_,
-                //        block_anchor_cache[thread_id_in_block].query_position_in_read_,
-                //        block_anchor_cache[thread_id_in_block].target_read_id_,
-                //        block_anchor_cache[thread_id_in_block].target_position_in_read_,
-                //        possible_successor_anchor.query_read_id_,
-                //        possible_successor_anchor.query_position_in_read_,
-                //        possible_successor_anchor.target_read_id_,
-                //        possible_successor_anchor.target_position_in_read_);
+
                 __syncthreads();

-                // if
-                if (current_score + marginal_score >= block_score_cache[thread_id_in_block] && (global_read_index + i) < num_anchors)
+                if ((current_score + marginal_score >= block_score_cache[thread_id_in_block]) && (global_read_index + i < num_anchors))
                 {
                     //current_score = current_score + marginal_score;
                     //current_pred = tile_start + counter - 1;
                     block_score_cache[thread_id_in_block] = current_score + marginal_score;
-                    block_predecessor_cache[thread_id_in_block] = tile_start + counter;
+                    // TODO VI: I'm not entirely sure about this part
+                    block_predecessor_cache[thread_id_in_block] = tile_start + counter; // this expands to tile_starts[batch_block_id] + counter, where counter is 0 -> 1024
                     if (current_score + marginal_score > word_size)
                     {
                         block_max_select_mask[thread_id_in_block] = true;
@@ -502,20 +489,22 @@ __global__ void chain_anchors_in_block(const Anchor* anchors,
                 }
                 __syncthreads();

+                // I'm the leftmost thread in the sliding window, so I write my result out from cache to the global array
                 if (thread_id_in_block == counter % PREDECESSOR_SEARCH_ITERATIONS && (global_write_index + counter) < num_anchors)
                 {
                     // Position thread_id_in_block is at the left-side (tail) of the window.
                     // It has therefore completed n = PREDECESSOR_SEARCH_ITERATIONS iterations.
                     // It's final score is known.
                     // Write its score and predecessor to the global_write_index + counter.
-                    // VI: Why do we offset by counter here? Is it the same as threadIdx.x % 64?
                     scores[global_write_index + counter] = static_cast<double>(current_score);
+                    //predecessors[global_write_index + counter] = block_predecessor_cache[thread_id_in_block];
                     predecessors[global_write_index + counter] = current_pred;
+                    //anchor_select_mask[global_write_index + counter] = block_max_select_mask[thread_id_in_block];
                     anchor_select_mask[global_write_index + counter] = current_mask;
                 }
-                // I don't think we need to syncthreads here...? There shouldn't be anyone writing to the same place
-                __syncthreads();
+
             }
+            __syncthreads();
         }
     }
 }
@@ -756,11 +745,8 @@ void OverlapperMinimap::get_overlaps(std::vector<Overlap>& fused_overlaps,

     int32_t num_batches = (n_query_tiles / BLOCK_COUNT) + 1;

-    fprintf(stderr, "THE NUMBER OF BATCHES IS: %d\n", num_batches);
     for (std::size_t batch = 0; batch < static_cast<size_t>(num_batches); ++batch)
     {
-
-        //std::cerr << "Running batch " << batch << ". Num anchors: " << n_anchors << std::endl;
         // Launch one 1792 blocks (from paper), with 64 threads
         // each batch is of size BLOCK_COUNT
         chain_anchors_in_block<<<BLOCK_COUNT, PREDECESSOR_SEARCH_ITERATIONS, 0, _cuda_stream>>>(d_anchors.data(),
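
For readers unfamiliar with the kernel, a serial restatement of the per-tile recurrence that chain_anchors_in_block evaluates with its 64-thread sliding window may help: each anchor's score is the best score of any of the previous PREDECESSOR_SEARCH_ITERATIONS anchors in the same tile plus the pairwise weight, with the winning predecessor recorded. The sketch below is a simplified, hypothetical reference and not the commit's code; chain_tile_serial and kWindow are made-up names, anchor_weight stands in for log_linear_anchor_weight, and details such as the select-mask threshold and tile padding are omitted.

// Hypothetical serial reference for the per-tile chaining recurrence (not from the commit):
// scores[j] = max over i in [j - kWindow, j) of scores[i] + anchor_weight(i, j),
// mirroring the bounded lookback that the shared-memory caches implement in parallel.
#include <cstdint>
#include <functional>
#include <vector>

constexpr std::int32_t kWindow = 64; // assumed value of PREDECESSOR_SEARCH_ITERATIONS

void chain_tile_serial(const std::int32_t num_anchors_in_tile,
                       const std::function<std::int32_t(std::int32_t, std::int32_t)>& anchor_weight,
                       std::vector<std::int32_t>& scores,
                       std::vector<std::int32_t>& predecessors)
{
    scores.assign(num_anchors_in_tile, 0);
    predecessors.assign(num_anchors_in_tile, -1);
    for (std::int32_t j = 0; j < num_anchors_in_tile; ++j)
    {
        const std::int32_t window_start = (j > kWindow) ? (j - kWindow) : 0;
        for (std::int32_t i = window_start; i < j; ++i)
        {
            const std::int32_t candidate = scores[i] + anchor_weight(i, j);
            if (candidate >= scores[j])
            {
                scores[j]       = candidate;
                predecessors[j] = i; // best in-window predecessor, analogous to block_predecessor_cache
            }
        }
    }
}

The batched launch above then covers BLOCK_COUNT tiles per kernel call (one tile per block, PREDECESSOR_SEARCH_ITERATIONS threads per block), which is why num_batches is (n_query_tiles / BLOCK_COUNT) + 1 and why the kernel guards on batch_block_id < num_query_tiles for the final, partially filled batch.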
