@@ -314,7 +314,6 @@ __device__ __forceinline__ int32_t fast_approx_log2(const int32_t val)
314314 return 8;
315315}
316316
317- // TODO VI: This may need to be fixed at some point. Likely the last line
318317__device__ __forceinline__ int32_t log_linear_anchor_weight (const Anchor& a,
319318 const Anchor& b,
320319 const int32_t word_size,
@@ -327,7 +326,7 @@ __device__ __forceinline__ int32_t log_linear_anchor_weight(const Anchor& a,
327326 return NEGATIVE_INT32_INFINITY;
328327
329328 int32_t b_query_pos = b.query_position_in_read_ + word_size;
330- int32_t b_target_pos = b.target_position_in_read_;
329+ int32_t b_target_pos = static_cast<int32_t>(b.target_position_in_read_);
331330 if (b_target_pos < a.target_position_in_read_)
332331 {
333332 b_target_pos -= word_size;
@@ -337,7 +336,8 @@ __device__ __forceinline__ int32_t log_linear_anchor_weight(const Anchor& a,
337336 b_target_pos += word_size;
338337 }
339338
340- int32_t x_dist = abs(int(b_target_pos) - int(a.target_position_in_read_));
339+ int32_t x_dist = abs(b_target_pos - int(a.target_position_in_read_)); // b_target_pos is already int32_t after the cast above; only a.target_position_in_read_ still needs the signed cast
341341
342342 if (x_dist > max_dist || x_dist == 0 )
343343 return NEGATIVE_INT32_INFINITY;
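For context on the hunks above: log_linear_anchor_weight appears to follow a minimap2-style chaining weight, in which extending a chain from anchor a to anchor b earns at most the seed length (word_size) and pays a gap penalty that is linear in the drift off the diagonal plus a log2 term (which is where fast_approx_log2 comes in). The full scoring expression sits outside the visible hunks, so the host-side sketch below is an assumption-labelled reconstruction: anchor_weight_sketch, approx_log2, NEG_INF, and the penalty coefficients are illustrative names and values rather than this file's code; only the x_dist / max_dist cutoff behavior mirrors what the diff shows.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

constexpr int32_t NEG_INF = INT32_MIN / 2; // stand-in for NEGATIVE_INT32_INFINITY

// Stand-in for fast_approx_log2: floor(log2(v)) for v > 0.
inline int32_t approx_log2(int32_t v)
{
    int32_t r = 0;
    while (v >>= 1)
        ++r;
    return r;
}

// a precedes b on the query; word_size is the seed length k.
inline int32_t anchor_weight_sketch(int32_t a_query, int32_t a_target,
                                    int32_t b_query, int32_t b_target,
                                    int32_t word_size, int32_t max_dist, int32_t max_bandwidth)
{
    const int32_t x_dist = std::abs(b_target - a_target);   // gap on the target
    const int32_t y_dist = (b_query + word_size) - a_query; // gap on the query, as in the hunk above
    if (x_dist == 0 || x_dist > max_dist || y_dist <= 0)
        return NEG_INF;
    const int32_t gap = std::abs(x_dist - y_dist);           // drift off the diagonal
    if (gap > max_bandwidth)
        return NEG_INF;
    const int32_t match = std::min(std::min(x_dist, y_dist), word_size);
    // Log-linear gap penalty: a linear term plus a log2 term (coefficients are guesses).
    const int32_t penalty = (gap == 0) ? 0 : (gap >> 2) + (approx_log2(gap) >> 1);
    return match - penalty;
}

int main()
{
    // Two anchors 100 bases apart on both sequences with k = 15: a small positive weight.
    std::printf("weight = %d\n", static_cast<int>(anchor_weight_sketch(0, 0, 100, 100, 15, 5000, 500)));
    return 0;
}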
@@ -389,7 +389,8 @@ __device__ __forceinline__ int32_t log_linear_anchor_weight(const Anchor& a,
389389/// \param max_bandwidth
390390/// \return __global__
391391
392- // TODO VI: each block operates on a single tile
392+ // Each block operates on a single tile
393+ // TODO VI: Do we need to pad the anchors array (with null tiles) so it's tile-aligned? Otherwise we may be missing the anchors in the last tile.
393394__global__ void chain_anchors_in_block (const Anchor* anchors,
394395 double * scores,
395396 int32_t * predecessors,
@@ -407,41 +408,41 @@ __global__ void chain_anchors_in_block(const Anchor* anchors,
407408 int32_t thread_id_in_block = threadIdx.x; // Equivalent to "j." Represents the end of a sliding window.
408409
409410 // figure out which tile I am currently working on
410- int32_t batch_block_id = batch_id * BLOCK_COUNT + block_id; // TODO VI: not sure if this is correct...? I think we want this instead of tile size
411+ int32_t batch_block_id = batch_id * BLOCK_COUNT + block_id;
411412 if (batch_block_id < num_query_tiles)
412413 {
413414
414415 // All threads in the block will get the same tile_start number
415416 int32_t tile_start = tile_starts[batch_block_id];
416417
417418 // global_write_index is the leftmost index of the tile; global_read_index is offset by my thread_id
418- // revist this part
419+ // initialize global_write_index to the 0th anchor in the tile and global_read_index to the "me" anchor
419420 int32_t global_write_index = tile_start;
420- int32_t global_read_index = tile_start + thread_id_in_block; // this is the right-most index of sliding window
421+ int32_t global_read_index = tile_start + thread_id_in_block;
421422 // printf("%d %d %d %d\n", block_id, global_write_index, global_read_index, tile_start);
422423 if (global_write_index < num_anchors)
423424 {
424425 __shared__ Anchor block_anchor_cache[PREDECESSOR_SEARCH_ITERATIONS];
425- // I DON'T KNOW WHAT THIS IS FOR
426426 __shared__ bool block_max_select_mask[PREDECESSOR_SEARCH_ITERATIONS];
427427 __shared__ int32_t block_score_cache[PREDECESSOR_SEARCH_ITERATIONS];
428428 __shared__ int32_t block_predecessor_cache[PREDECESSOR_SEARCH_ITERATIONS];
429429
430430 // Initialize the local caches
431- block_anchor_cache[thread_id_in_block] = anchors[global_read_index];
432- // I _believe_ some or most of these will be 0
433- // not sure why we downcast to integer here
434- block_score_cache[thread_id_in_block] = static_cast <int32_t >(scores[global_read_index]);
435- // I _believe some or most of these will be -1 at first
431+ // load the anchor I'm currently on and its associated data
432+ block_anchor_cache[thread_id_in_block] = anchors[global_read_index];
433+ block_score_cache[thread_id_in_block] = static_cast<int32_t>(scores[global_read_index]);
436434 block_predecessor_cache[thread_id_in_block] = predecessors[global_read_index];
437- // Still not sure what this is for
438- block_max_select_mask[thread_id_in_block] = false ;
435+ block_max_select_mask[thread_id_in_block] = false ;
439436
440437 // iterate through the tile
441- for (int32_t i = PREDECESSOR_SEARCH_ITERATIONS, counter = 0 ; counter < batch_size; ++counter, ++i)
438+ // VI: i keeps incrementing throughout the whole tile here.
439+ // Do we run over into the next tile...? Is that ok?
440+ // Answer to the above: it should be fine (maybe).
441+ for (int32_t i = PREDECESSOR_SEARCH_ITERATIONS, counter = 0; counter < batch_size; counter++, i++)
442442 {
443443 __syncthreads ();
444- Anchor possible_successor_anchor = block_anchor_cache[i % PREDECESSOR_SEARCH_ITERATIONS];
444+ // on the first iteration, every thread looks at the 0th anchor
445+ const Anchor possible_successor_anchor = block_anchor_cache[i % PREDECESSOR_SEARCH_ITERATIONS];
445446 int32_t current_score = block_score_cache[i % PREDECESSOR_SEARCH_ITERATIONS];
446447 int32_t current_pred = block_predecessor_cache[i % PREDECESSOR_SEARCH_ITERATIONS];
447448 bool current_mask = block_max_select_mask[i % PREDECESSOR_SEARCH_ITERATIONS];
@@ -453,45 +454,31 @@ __global__ void chain_anchors_in_block(const Anchor* anchors,
453454 }
454455 __syncthreads ();
455456
456- if (thread_id_in_block == i % PREDECESSOR_SEARCH_ITERATIONS && global_read_index + i < num_anchors)
457+ // I think this is the thread at the LHS of the window
458+ if ((thread_id_in_block == (i % PREDECESSOR_SEARCH_ITERATIONS)) && (global_read_index + i < num_anchors))
457459 {
458- // Implies that the thread is at the right_side (head, front) of a window
460+ // Implies that the thread is at the left side (head, front of the FIFO queue) of the window
459461 // Read in the anchor, score, and predecessor of the next anchor in memory.
460- block_anchor_cache[thread_id_in_block] = anchors[global_read_index + i];
462+ // I think we load here because we're at the LHS, so we want to fetch the (threadIdx + 64)th anchor
463+ block_anchor_cache[thread_id_in_block] = anchors[global_write_index + i];
461464 block_score_cache[thread_id_in_block] = 0;
462465 block_predecessor_cache[thread_id_in_block] = -1;
463466 block_max_select_mask[thread_id_in_block] = false;
464- // block_score_cache[thread_id_in_block] = scores[global_read_index + i];
465- // block_predecessor_cache[thread_id_in_block] = predecessors[global_read_index + i];
466- // block_max_select_mask[thread_id_in_block] = anchor_select_mask[global_read_index + i];
467467 }
468468
469469 __syncthreads ();
470- // Calculate score
470+ // Calculate the score. The order of the two anchors matters here (the weight is not symmetric).
471471 int32_t marginal_score = log_linear_anchor_weight(possible_successor_anchor, block_anchor_cache[thread_id_in_block], word_size, max_distance, max_bandwidth);
472- // printf("%d %d | %d %d : %d %d : %d | %d %d %d %d | %d %d %d %d\n",
473- // thread_id_in_block, (i % PREDECESSOR_SEARCH_ITERATIONS), counter,
474- // i,
475- // current_score,
476- // marginal_score,
477- // block_score_cache[thread_id_in_block],
478- // block_anchor_cache[thread_id_in_block].query_read_id_,
479- // block_anchor_cache[thread_id_in_block].query_position_in_read_,
480- // block_anchor_cache[thread_id_in_block].target_read_id_,
481- // block_anchor_cache[thread_id_in_block].target_position_in_read_,
482- // possible_successor_anchor.query_read_id_,
483- // possible_successor_anchor.query_position_in_read_,
484- // possible_successor_anchor.target_read_id_,
485- // possible_successor_anchor.target_position_in_read_);
472+
486473 __syncthreads ();
487474
488- // if
489- if (current_score + marginal_score >= block_score_cache[thread_id_in_block] && (global_read_index + i) < num_anchors)
475+ if ((current_score + marginal_score >= block_score_cache[thread_id_in_block]) && (global_read_index + i < num_anchors))
490476 {
491477 // current_score = current_score + marginal_score;
492478 // current_pred = tile_start + counter - 1;
493479 block_score_cache[thread_id_in_block] = current_score + marginal_score;
494- block_predecessor_cache[thread_id_in_block] = tile_start + counter;
480+ // TODO VI: I'm not entirely sure about this part
481+ block_predecessor_cache[thread_id_in_block] = tile_start + counter; // this expands to tile_starts[batch_block_id] + counter, where counter is 0 -> 1024
495482 if (current_score + marginal_score > word_size)
496483 {
497484 block_max_select_mask[thread_id_in_block] = true ;
@@ -502,20 +489,22 @@ __global__ void chain_anchors_in_block(const Anchor* anchors,
502489 }
503490 __syncthreads ();
504491
492+ // I'm the leftmost thread in the sliding window, so I write my result out from the cache to the global arrays
505493 if (thread_id_in_block == counter % PREDECESSOR_SEARCH_ITERATIONS && (global_write_index + counter) < num_anchors)
506494 {
507495 // Position thread_id_in_block is at the left-side (tail) of the window.
508496 // It has therefore completed n = PREDECESSOR_SEARCH_ITERATIONS iterations.
509497 // Its final score is known.
510498 // Write its score and predecessor to the global_write_index + counter.
511- // VI: Why do we offset by counter here? Is it the same as threadIdx.x % 64?
511499 scores[global_write_index + counter] = static_cast<double>(current_score);
500+ // predecessors[global_write_index + counter] = block_predecessor_cache[thread_id_in_block];
513501 predecessors[global_write_index + counter] = current_pred;
502+ // anchor_select_mask[global_write_index + counter] = block_max_select_mask[thread_id_in_block];
514503 anchor_select_mask[global_write_index + counter] = current_mask;
515504 }
516- // I don't think we need to syncthreads here...? There shouldn't be anyone writing to the same place
517- __syncthreads ();
505+
518506 }
507+ __syncthreads ();
519508 }
520509 }
521510}
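To make the window bookkeeping in chain_anchors_in_block easier to follow, here is a small host-side model of the schedule. It is not the kernel: WINDOW stands in for PREDECESSOR_SEARCH_ITERATIONS (64 in the kernel), tile_size for batch_size, and the values are deliberately tiny. It only prints which shared-memory slot is broadcast as the possible successor, which thread refills its slot from global memory, and which thread retires its final score on each iteration, i.e. the i % PREDECESSOR_SEARCH_ITERATIONS and counter % PREDECESSOR_SEARCH_ITERATIONS conditions in the kernel above.

#include <cstdint>
#include <cstdio>

int main()
{
    const int32_t WINDOW    = 4; // stands in for PREDECESSOR_SEARCH_ITERATIONS (64 in the kernel)
    const int32_t tile_size = 8; // stands in for batch_size

    for (int32_t i = WINDOW, counter = 0; counter < tile_size; ++counter, ++i)
    {
        const int32_t successor_slot = i % WINDOW;       // slot every thread reads as possible_successor_anchor
        const int32_t refill_thread  = i % WINDOW;       // thread that reloads anchors[tile_start + i] into its slot
        const int32_t retire_thread  = counter % WINDOW; // thread whose score is final and written out this step
        std::printf("step %2d: successor slot %d, thread %d refills, thread %d retires\n",
                    static_cast<int>(counter), static_cast<int>(successor_slot),
                    static_cast<int>(refill_thread), static_cast<int>(retire_thread));
    }
    return 0;
}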
@@ -756,11 +745,8 @@ void OverlapperMinimap::get_overlaps(std::vector<Overlap>& fused_overlaps,
756745
758747 int32_t num_batches = (n_query_tiles / BLOCK_COUNT) + 1;
758747
759- fprintf (stderr, " THE NUMBER OF BATCHES IS: %d\n " , num_batches);
760748 for (std::size_t batch = 0; batch < static_cast<size_t>(num_batches); ++batch)
761749 {
762-
763- // std::cerr << "Running batch " << batch << ". Num anchors: " << n_anchors << std::endl;
764750 // Launch 1792 blocks (from the paper), with 64 threads each
765751 // each batch is of size BLOCK_COUNT
766752 chain_anchors_in_block<<<BLOCK_COUNT, PREDECESSOR_SEARCH_ITERATIONS, 0, _cuda_stream>>>(d_anchors.data(),
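A note on the launch loop in the final hunk: each batch launches BLOCK_COUNT blocks of PREDECESSOR_SEARCH_ITERATIONS threads, the kernel maps blockIdx.x to a tile via batch_block_id = batch_id * BLOCK_COUNT + block_id, and the guard batch_block_id < num_query_tiles makes surplus blocks exit immediately. The host-side sketch below uses illustrative constants only (n_query_tiles is made up and BLOCK_COUNT is taken from the "1792 blocks" comment); it walks that mapping to show that the + 1 in num_batches covers the remainder tiles, at the cost of one fully guarded extra batch when n_query_tiles is an exact multiple of BLOCK_COUNT.

#include <cstdint>
#include <cstdio>

int main()
{
    const int32_t BLOCK_COUNT   = 1792; // blocks per batch, per the launch comment above
    const int32_t n_query_tiles = 4000; // illustrative; not a real value from the library

    const int32_t num_batches = (n_query_tiles / BLOCK_COUNT) + 1;
    int32_t tiles_covered      = 0;
    for (int32_t batch = 0; batch < num_batches; ++batch)
    {
        for (int32_t block_id = 0; block_id < BLOCK_COUNT; ++block_id)
        {
            const int32_t batch_block_id = batch * BLOCK_COUNT + block_id;
            if (batch_block_id < n_query_tiles) // same guard as in chain_anchors_in_block
                ++tiles_covered;
        }
    }
    std::printf("batches: %d, tiles covered: %d of %d\n",
                static_cast<int>(num_batches), static_cast<int>(tiles_covered),
                static_cast<int>(n_query_tiles));
    return 0;
}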