Skip to content

Commit 9413bf8

Browse files
authored
Merge pull request #5 from edawson/chainer-utils-backtrace
[cudamapper] Remove OverlapperMinimap test file, refactor to use backtrace
2 parents c2dc3e1 + 1dffb46 commit 9413bf8

File tree

4 files changed

+113
-33
lines changed

4 files changed

+113
-33
lines changed

cudamapper/src/chainer_utils.cu

Lines changed: 90 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,81 @@ __device__ bool operator==(const QueryReadID& a, const QueryReadID& b)
5050
return a.query_read_id_ == b.query_read_id_;
5151
}
5252

53+
// Builds an Overlap spanning a chain of anchors, given the chain's first and
// last anchor and the number of anchors it contains.
// Query coordinates are reported in increasing order; the strand is inferred
// from whether the target position decreases from `start` to `end`.
__device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors)
{
    // A single chain must never span different query/target read pairs.
    assert(start.query_read_id_ == end.query_read_id_ && start.target_read_id_ == end.target_read_id_);

    Overlap overlap;
    overlap.num_residues_   = num_anchors;
    overlap.query_read_id_  = start.query_read_id_;
    overlap.target_read_id_ = start.target_read_id_;

    // Normalize the query interval so start <= end regardless of anchor order.
    overlap.query_start_position_in_read_ = min(start.query_position_in_read_, end.query_position_in_read_);
    overlap.query_end_position_in_read_   = max(start.query_position_in_read_, end.query_position_in_read_);

    // Reverse strand: target positions run opposite to query positions,
    // so the chain's end anchor carries the smaller target coordinate.
    const bool is_negative_strand = end.target_position_in_read_ < start.target_position_in_read_;

    overlap.relative_strand                = is_negative_strand ? RelativeStrand::Reverse : RelativeStrand::Forward;
    overlap.target_start_position_in_read_ = is_negative_strand ? end.target_position_in_read_ : start.target_position_in_read_;
    overlap.target_end_position_in_read_   = is_negative_strand ? start.target_position_in_read_ : end.target_position_in_read_;

    return overlap;
}
79+
80+
// Walks predecessor chains to produce one Overlap per chain-terminal anchor.
//
// Launch layout: 1D grid, one thread per anchor (threads beyond n_anchors
// exit immediately). For each anchor whose chain score is at least
// `min_score`, the owning thread backtraces through `predecessors` (terminated
// by -1) to find the chain's first anchor, counts the anchors in the chain,
// and writes the resulting overlap to overlaps[anchor_index]. Every interior
// (non-terminal) anchor of a chain has its max_select_mask entry cleared so
// only chain terminals survive downstream selection; anchors scoring below
// min_score are likewise masked out.
//
// NOTE(review): multiple threads may write max_select_mask[pred] for a shared
// predecessor concurrently; all such writes store `false`, so the race appears
// benign — confirm this is the intended semantics.
__global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
                                              Overlap* overlaps,
                                              double* scores,
                                              bool* max_select_mask,
                                              int32_t* predecessors,
                                              const int32_t n_anchors,
                                              const int32_t min_score)
{
    // Use int32_t (not std::size_t) so the bounds check and the index
    // arithmetic below stay in one signed type, matching n_anchors and
    // the int32_t predecessor indices (avoids signed/unsigned comparison
    // and a narrowing conversion).
    const int32_t d_tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (d_tid < n_anchors)
    {
        const int32_t global_overlap_index = d_tid;
        if (scores[d_tid] >= min_score)
        {
            int32_t index                = global_overlap_index;
            int32_t first_index          = index;
            int32_t num_anchors_in_chain = 0;
            const Anchor final_anchor    = anchors[global_overlap_index];

            // Follow the predecessor chain back to its head (-1 sentinel).
            while (index != -1)
            {
                first_index        = index;
                const int32_t pred = predecessors[index];
                if (pred != -1)
                {
                    // Interior anchor: it cannot be a chain terminal.
                    max_select_mask[pred] = false;
                }
                ++num_anchors_in_chain;
                // Reuse the value already loaded from global memory instead
                // of re-reading predecessors[index].
                index = pred;
            }

            const Anchor first_anchor      = anchors[first_index];
            overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain);
        }
        else
        {
            // Chain score too low: exclude this anchor from selection.
            max_select_mask[global_overlap_index] = false;
        }
    }
}
127+
53128
__global__ void convert_offsets_to_ends(std::int32_t* starts, std::int32_t* lengths, std::int32_t* ends, std::int32_t n_starts)
54129
{
55130
std::int32_t d_tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -89,7 +164,7 @@ __global__ void calculate_tile_starts(const std::int32_t* query_starts,
89164
const std::int32_t* tiles_per_query_up_to_point)
90165
{
91166
int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
92-
int32_t stride = blockDim.x * gridDim.x;
167+
int32_t stride = blockDim.x * gridDim.x;
93168
if (d_thread_id < num_queries)
94169
{
95170
// for each tile, we look up the query it corresponds to and offset it by the which tile in the query
@@ -167,7 +242,7 @@ void encode_anchor_query_locations(const Anchor* anchors,
167242
cub::DeviceScan::ExclusiveSum(d_temp_storage,
168243
temp_storage_bytes,
169244
query_lengths.data(), // this is the vector of encoded lengths
170-
query_starts.data(), // at this point, this vector is empty
245+
query_starts.data(), // at this point, this vector is empty
171246
n_queries,
172247
_cuda_stream);
173248

@@ -178,15 +253,15 @@ void encode_anchor_query_locations(const Anchor* anchors,
178253
cub::DeviceScan::ExclusiveSum(d_temp_storage,
179254
temp_storage_bytes,
180255
query_lengths.data(), // this is the vector of encoded lengths
181-
query_starts.data(),
256+
query_starts.data(),
182257
n_queries,
183258
_cuda_stream);
184259

185260
// paper uses the ends and finds the beginnings with x - w + 1, are we converting to that here?
186261
// TODO VI: I'm not entirely sure what this is for? I think we want to change the read query
187262
// (defined by [query_start, query_start + query_length] to [query_end - query_length + 1, query_end])
188263
// The above () is NOT true
189-
convert_offsets_to_ends<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(), // this gives how many starts at each index
264+
convert_offsets_to_ends<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(), // this gives how many starts at each index
190265
query_lengths.data(), // this is the vector of encoded lengths
191266
query_ends.data(),
192267
n_queries);
@@ -215,21 +290,21 @@ void encode_anchor_query_locations(const Anchor* anchors,
215290
d_temp_storage = nullptr;
216291
temp_storage_bytes = 0;
217292
cub::DeviceScan::ExclusiveSum(d_temp_storage,
218-
temp_storage_bytes,
219-
tiles_per_query.data(), // this is the vector of encoded lengths
220-
d_tiles_per_query_up_to_point.data(),
221-
n_queries,
222-
_cuda_stream);
293+
temp_storage_bytes,
294+
tiles_per_query.data(), // this is the vector of encoded lengths
295+
d_tiles_per_query_up_to_point.data(),
296+
n_queries,
297+
_cuda_stream);
223298

224299
d_temp_buf.clear_and_resize(temp_storage_bytes);
225300
d_temp_storage = d_temp_buf.data();
226-
301+
227302
cub::DeviceScan::ExclusiveSum(d_temp_storage,
228-
temp_storage_bytes,
229-
tiles_per_query.data(), // this is the vector of encoded lengths
230-
d_tiles_per_query_up_to_point.data(),
231-
n_queries,
232-
_cuda_stream);
303+
temp_storage_bytes,
304+
tiles_per_query.data(), // this is the vector of encoded lengths
305+
d_tiles_per_query_up_to_point.data(),
306+
n_queries,
307+
_cuda_stream);
233308

234309
calculate_tile_starts<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(), tiles_per_query.data(), tile_starts.data(), tile_size, n_queries, d_tiles_per_query_up_to_point.data());
235310
}
@@ -276,7 +351,6 @@ void encode_anchor_query_target_pairs(const Anchor* anchors,
276351
d_num_query_target_pairs.data(),
277352
n_anchors);
278353

279-
280354
n_query_target_pairs = cudautils::get_value_from_device(d_num_query_target_pairs.data(), _cuda_stream);
281355

282356
d_temp_storage = nullptr;

cudamapper/src/chainer_utils.cuh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,14 @@ struct TileResults
126126
__device__ bool
127127
operator==(const QueryTargetPair& a, const QueryTargetPair& b);
128128

129+
__global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
130+
Overlap* overlaps,
131+
double* scores,
132+
bool* max_select_mask,
133+
int32_t* predecessors,
134+
const int32_t n_anchors,
135+
const int32_t min_score);
136+
129137
__global__ void convert_offsets_to_ends(std::int32_t* starts, std::int32_t* lengths, std::int32_t* ends, std::int32_t n_starts);
130138

131139
__global__ void calculate_tile_starts(const std::int32_t* query_starts,

cudamapper/src/overlapper_minimap.cu

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -161,8 +161,8 @@ __global__ void mask_overlaps(Overlap* overlaps, std::size_t n_overlaps, bool* s
161161
const bool mask_self_self = false;
162162
auto query_bases_per_residue = static_cast<double>(overlap_query_length) / static_cast<double>(overlaps[d_tid].num_residues_);
163163
auto target_bases_per_residue = static_cast<double>(overlap_target_length) / static_cast<double>(overlaps[d_tid].num_residues_);
164-
select_mask[d_tid] = select_mask[d_tid] && (overlap_query_length >= min_overlap_length) && (overlap_target_length >= min_overlap_length);
165-
select_mask[d_tid] = select_mask[d_tid] && (overlaps[d_tid].num_residues_ >= min_residues);
164+
select_mask[d_tid] = select_mask[d_tid] && (overlap_query_length >= min_overlap_length) && (overlap_target_length >= min_overlap_length);
165+
select_mask[d_tid] = select_mask[d_tid] && (overlaps[d_tid].num_residues_ >= min_residues);
166166
//mask[d_tid] &= !mask_self_self;
167167
select_mask[d_tid] = select_mask[d_tid] && (query_bases_per_residue < max_bases_per_residue) && (target_bases_per_residue < max_bases_per_residue);
168168
// Look at the overlaps and all the overlaps adjacent to me, up to some maximum. Between neighbor i and myself, if
@@ -314,7 +314,6 @@ __device__ __forceinline__ int32_t fast_approx_log2(const int32_t val)
314314
return 8;
315315
}
316316

317-
318317
// TODO VI: This may need to be fixed at some point. Likely the last line
319318
__device__ __forceinline__ int32_t log_linear_anchor_weight(const Anchor& a,
320319
const Anchor& b,
@@ -398,7 +397,7 @@ __global__ void chain_anchors_in_block(const Anchor* anchors,
398397
const int32_t* tile_starts,
399398
const int32_t num_anchors,
400399
const int32_t num_query_tiles,
401-
const int32_t batch_id, // which batch number we are on
400+
const int32_t batch_id, // which batch number we are on
402401
const int32_t batch_size, // fixed to TILE_SIZE...?
403402
const int32_t word_size,
404403
const int32_t max_distance,
@@ -429,14 +428,14 @@ __global__ void chain_anchors_in_block(const Anchor* anchors,
429428
__shared__ int32_t block_predecessor_cache[PREDECESSOR_SEARCH_ITERATIONS];
430429

431430
// Initialize the local caches
432-
block_anchor_cache[thread_id_in_block] = anchors[global_read_index];
431+
block_anchor_cache[thread_id_in_block] = anchors[global_read_index];
433432
// I _believe_ some or most of these will be 0
434433
// not sure why we downcast to integer here
435-
block_score_cache[thread_id_in_block] = static_cast<int32_t>(scores[global_read_index]);
434+
block_score_cache[thread_id_in_block] = static_cast<int32_t>(scores[global_read_index]);
436435
// I _believe some or most of these will be -1 at first
437436
block_predecessor_cache[thread_id_in_block] = predecessors[global_read_index];
438437
// Still not sure what this is for
439-
block_max_select_mask[thread_id_in_block] = false;
438+
block_max_select_mask[thread_id_in_block] = false;
440439

441440
// iterate through the tile
442441
for (int32_t i = PREDECESSOR_SEARCH_ITERATIONS, counter = 0; counter < batch_size; ++counter, ++i)
@@ -486,7 +485,7 @@ __global__ void chain_anchors_in_block(const Anchor* anchors,
486485
// possible_successor_anchor.target_position_in_read_);
487486
__syncthreads();
488487

489-
// if
488+
// if
490489
if (current_score + marginal_score >= block_score_cache[thread_id_in_block] && (global_read_index + i) < num_anchors)
491490
{
492491
//current_score = current_score + marginal_score;
@@ -739,7 +738,7 @@ void OverlapperMinimap::get_overlaps(std::vector<Overlap>& fused_overlaps,
739738
// generates the scheduler blocks
740739
chainerutils::encode_anchor_query_locations(d_anchors.data(),
741740
n_anchors,
742-
TILE_SIZE, // This is 1024
741+
TILE_SIZE, // This is 1024
743742
query_id_starts,
744743
query_id_lengths,
745744
query_id_ends,
@@ -799,13 +798,13 @@ void OverlapperMinimap::get_overlaps(std::vector<Overlap>& fused_overlaps,
799798
#endif
800799

801800
// the deschedule block. Get outputs from here
802-
produce_anchor_chains<<<(n_anchors / block_size) + 1, block_size, 0, _cuda_stream>>>(d_anchors.data(),
803-
d_overlaps_source.data(),
804-
d_anchor_scores.data(),
805-
d_overlaps_select_mask.data(),
806-
d_anchor_predecessors.data(),
807-
n_anchors,
808-
20);
801+
chainerutils::backtrace_anchors_to_overlaps<<<BLOCK_COUNT, block_size, 0, _cuda_stream>>>(d_anchors.data(),
802+
d_overlaps_source.data(),
803+
d_anchor_scores.data(),
804+
d_overlaps_select_mask.data(),
805+
d_anchor_predecessors.data(),
806+
n_anchors,
807+
40);
809808

810809
// TODO VI: I think we can get better device occupancy here with some kernel refactoring
811810
mask_overlaps<<<(n_anchors / block_size) + 1, block_size, 0, _cuda_stream>>>(d_overlaps_source.data(),

cudamapper/tests/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ set(SOURCES
2828
Test_CudamapperMinimizer.cpp
2929
Test_CudamapperOverlapper.cpp
3030
Test_CudamapperOverlapperTriggered.cu
31-
Test_CudamapperOverlapperMinimap.cu
3231
Test_CudamapperUtilsKmerFunctions.cpp
3332
)
3433

0 commit comments

Comments
 (0)