NVIDIA-Genomics-Research · edawson · Oct 19, 2020 · Oct 19, 2020 · Oct 19, 2020 · Oct 20, 2020
diff --git a/cudaaligner/samples/sample_cudaaligner.cpp b/cudaaligner/samples/sample_cudaaligner.cpp
@@ -21,12 +21,12 @@
 #include <claraparabricks/genomeworks/utils/genomeutils.hpp>
 
 #include <cuda_runtime_api.h>
+#include <getopt.h>
 #include <vector>
 #include <string>
 #include <stdexcept>
 #include <iostream>
 #include <random>
-#include <getopt.h>
 
 using namespace claraparabricks::genomeworks;
 using namespace claraparabricks::genomeworks::genomeutils;

diff --git a/cudamapper/CMakeLists.txt b/cudamapper/CMakeLists.txt
@@ -46,6 +46,7 @@ endif()
 
 cuda_add_library(${MODULE_NAME}
         src/application_parameters.cpp
+        src/chainer_utils.cu
         src/cudamapper.cpp
         src/index_batcher.cu
         src/index_descriptor.cpp

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
@@ -0,0 +1,210 @@
+/*
+* Copyright 2020 NVIDIA CORPORATION.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "chainer_utils.cuh"
+
+#include <thrust/execution_policy.h>
+#include <thrust/transform_scan.h>
+#include <thrust/reduce.h>
+#include <claraparabricks/genomeworks/utils/cudautils.hpp>
+
+namespace claraparabricks
+{
+
+namespace genomeworks
+{
+
+namespace cudamapper
+{
+namespace chainerutils
+{
+
+__device__ __forceinline__ Anchor empty_anchor()
+{
+    Anchor a;
+    a.query_read_id_           = UINT32_MAX;
+    a.query_position_in_read_  = UINT32_MAX;
+    a.target_read_id_          = UINT32_MAX;
+    a.target_position_in_read_ = UINT32_MAX;
+    return a;
+}
+struct ConvertOverlapToNumResidues : public thrust::unary_function<Overlap, int32_t>
+{
+    __host__ __device__ int32_t operator()(const Overlap& o) const
+    {
+        return o.num_residues_;
+    }
+};
+
+__host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors)
+{
+    Overlap overlap;
+    overlap.num_residues_ = num_anchors;
+
+    overlap.query_read_id_  = start.query_read_id_;
+    overlap.target_read_id_ = start.target_read_id_;
+    assert(start.query_read_id_ == end.query_read_id_ && start.target_read_id_ == end.target_read_id_);
+
+    overlap.query_start_position_in_read_ = min(start.query_position_in_read_, end.query_position_in_read_);
+    overlap.query_end_position_in_read_   = max(start.query_position_in_read_, end.query_position_in_read_);
+    const bool is_negative_strand         = end.target_position_in_read_ < start.target_position_in_read_;
+    if (is_negative_strand)
+    {
+        overlap.relative_strand                = RelativeStrand::Reverse;
+        overlap.target_start_position_in_read_ = end.target_position_in_read_;
+        overlap.target_end_position_in_read_   = start.target_position_in_read_;
+    }
+    else
+    {
+        overlap.relative_strand                = RelativeStrand::Forward;
+        overlap.target_start_position_in_read_ = start.target_position_in_read_;
+        overlap.target_end_position_in_read_   = end.target_position_in_read_;
+    }
+    return overlap;
+}
+
+__global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
+                                              Overlap* const overlaps,
+                                              int32_t* const overlap_terminal_anchors,
+                                              const float* const scores,
+                                              bool* const max_select_mask,
+                                              const int32_t* const predecessors,
+                                              const int64_t n_anchors,
+                                              const float min_score)
+{
+    const int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride    = blockDim.x * gridDim.x;
+
+    for (int i = thread_id; i < n_anchors; i += stride)
+    {
+        if (scores[i] >= min_score)
+        {
+            int32_t index                = i;
+            int32_t first_index          = index;
+            int32_t num_anchors_in_chain = 0;
+            Anchor final_anchor          = anchors[i];
+
+            while (index != -1)
+            {
+                first_index  = index;
+                int32_t pred = predecessors[index];
+                if (pred != -1)
+                {
+                    max_select_mask[pred] = false;
+                }
+                num_anchors_in_chain++;
+                index = predecessors[index];
+            }
+            Anchor first_anchor         = anchors[first_index];
+            overlap_terminal_anchors[i] = i;
+            overlaps[i]                 = create_overlap(first_anchor, final_anchor, num_anchors_in_chain);
+        }
+        else
+        {
+            max_select_mask[i]          = false;
+            overlap_terminal_anchors[i] = -1;
+            overlaps[i]                 = create_overlap(empty_anchor(), empty_anchor(), 1);
+        }
+    }
+}
+
+void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
+                            device_buffer<int32_t>& unrolled_anchor_chains,
+                            device_buffer<int32_t>& anchor_chain_starts,
+                            int64_t& num_total_anchors,
+                            DefaultDeviceAllocator allocator,
+                            cudaStream_t cuda_stream)
+{
+    auto thrust_exec_policy = thrust::cuda::par(allocator).on(cuda_stream);
+    thrust::plus<int32_t> sum_op;
+    num_total_anchors = thrust::transform_reduce(thrust_exec_policy,
+                                                 overlaps.begin(),
+                                                 overlaps.end(),
+                                                 ConvertOverlapToNumResidues(),
+                                                 0,
+                                                 sum_op);
+
+    unrolled_anchor_chains.clear_and_resize(num_total_anchors);
+    anchor_chain_starts.clear_and_resize(overlaps.size());
+
+    thrust::transform_exclusive_scan(thrust_exec_policy,
+                                     overlaps.begin(),
+                                     overlaps.end(),
+                                     anchor_chain_starts.data(),
+                                     ConvertOverlapToNumResidues(),
+                                     0,
+                                     sum_op);
+}
+
+__global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps,
+                                                   const Anchor* const anchors,
+                                                   const bool* const select_mask,
+                                                   const int32_t* const predecessors,
+                                                   const int32_t* const chain_terminators,
+                                                   int32_t* const anchor_chains,
+                                                   int32_t* const anchor_chain_starts,
+                                                   const int32_t num_overlaps,
+                                                   const bool check_mask)
+{
+    const int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride    = blockDim.x * gridDim.x;
+
+    // Processes one overlap per iteration,
+    // "i" corresponds to an overlap
+    for (int i = thread_id; i < num_overlaps; i += stride)
+    {
+
+        if (!check_mask || (check_mask & select_mask[i]))
+        {
+            int32_t anchor_chain_index = 0;
+            // As chaining proceeds backwards (i.e., it's a backtrace),
+            // we need to fill the new anchor chain array in in reverse order.
+            int32_t index = chain_terminators[i];
+            while (index != -1)
+            {
+                anchor_chains[anchor_chain_starts[i] + (overlaps[i].num_residues_ - anchor_chain_index - 1)] = index;
+                index                                                                                        = predecessors[index];
+                ++anchor_chain_index;
+            }
+        }
+    }
+}
+
+__global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
+                                             const Anchor* const anchors,
+                                             const int32_t* const chain_starts,
+                                             const int32_t* const chain_lengths,
+                                             int32_t* const anchor_chains,
+                                             int32_t* const anchor_chain_starts,
+                                             const uint32_t num_overlaps)
+{
+    const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride    = blockDim.x * gridDim.x;
+    for (uint32_t i = thread_id; i < num_overlaps; i += stride)
+    {
+        int32_t chain_start  = chain_starts[i];
+        int32_t chain_length = chain_lengths[i];
+        for (int32_t index = chain_start; index < chain_start + chain_length; ++index)
+        {
+            anchor_chains[index] = index;
+        }
+    }
+}
+
+} // namespace chainerutils
+} // namespace cudamapper
+} // namespace genomeworks
+} // namespace claraparabricks
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
@@ -0,0 +1,129 @@
+/*
+* Copyright 2020 NVIDIA CORPORATION.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma once
+
+#include <claraparabricks/genomeworks/cudamapper/types.hpp>
+#include <claraparabricks/genomeworks/utils/device_buffer.hpp>
+
+namespace claraparabricks
+{
+
+namespace genomeworks
+{
+
+namespace cudamapper
+{
+namespace chainerutils
+{
+
+/// \brief Create an overlap from the first and last anchor in the chain and
+/// the total number of anchors in the chain.
+///
+/// \param start The first anchor in the chain.
+/// \param end The last anchor in the chain.
+/// \param num_anchors The total number of anchors in the chain.
+__host__ __device__ Overlap create_overlap(const Anchor& start,
+                                           const Anchor& end,
+                                           const int32_t num_anchors);
+
+/// \brief Given an array of anchors and an array denoting the immediate
+/// predecessor of each anchor, transform chains of anchors into overlaps.
+///
+/// \param anchors An array of anchors.
+/// \param overlaps An array of overlaps to be filled.
+/// \param scores An array of scores. Only chains with a score greater than min_score will be backtraced.
+/// \param max_select_mask A boolean mask, used to mask out any chains which are completely contained within larger chains during the backtrace.
+/// \param predecessors An array of indices into the anchors array marking the predecessor of each anchor within a chain.
+/// \param n_anchors The number of anchors.
+/// \param min_score The minimum score of a chain for performing backtracing.
+/// Launch configuration: any number of blocks/threads, no dynamic shared memory, cuda_stream must be the same in which anchors/overlaps/scores/mask/predecessors were allocated.
+/// example: backtrace_anchors_to_overlaps<<<2048, 32, 0, cuda_stream>>>(...)
+__global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
+                                              Overlap* const overlaps,
+                                              int32_t* const overlap_terminal_anchors,
+                                              const float* const scores,
+                                              bool* const max_select_mask,
+                                              const int32_t* const predecessors,
+                                              const int64_t n_anchors,
+                                              const float min_score);
+
+/// \brief Allocate a 1-dimensional array representing an unrolled ragged array
+/// of anchors within each overlap. The final array holds the indices within the anchors array used in chaining
+/// of the anchors in the chain.
+///
+/// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field
+/// \param unrolled_anchor_chains An array of int32_t. Will be resized on return.
+/// \param anchor_chain_starts An array holding the index of the first anchor in an overlap within the anchors array used during chaining.
+/// \param num_overlaps The number of overlaps in the overlaps array.
+/// \param num_total_anchors The total number of anchors across all overlaps.
+/// \param allocator The DefaultDeviceAllocator for this overlapper.
+/// \param cuda_stream The cudastream to allocate memory within.
+void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
+                            device_buffer<int32_t>& unrolled_anchor_chains,
+                            device_buffer<int32_t>& anchor_chain_starts,
+                            int64_t& num_total_anchors,
+                            DefaultDeviceAllocator allocator,
+                            cudaStream_t cuda_stream = 0);
+
+/// \brief Given an array of overlaps, fill a 1D unrolled ragged array
+/// containing the anchors used to generate each overlap. Anchors must have
+/// been chained with a chaining function that fills the predecessors array
+/// with the immediate predecessor of each anchor.
+///
+/// \param overlaps An array of overlaps.
+/// \param anchors The array of anchors used to generate overlaps.
+/// \param select_mask An array of bools, used to mask overlaps from output.
+/// \param predecessors The predecessors array from anchor chaining.
+/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold the indices of anchors within each chain.
+/// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array.
+/// \param num_overlaps The number of overlaps in the overlaps array
+/// \param check_mask A boolean. If true, only overlaps where select_mask is true will have their anchor chains calculated.
+/// Launch configuration: any number of blocks/threads, no dynamic shared memory, cuda_stream must be the same in which anchors/overlaps/scores/mask/predecessors/chains/starts were allocated.
+/// example: backtrace_anchors_to_overlaps<<<2048, 32, 0, cuda_stream>>>(...)
+__global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps,
+                                                   const Anchor* const anchors,
+                                                   const bool* const select_mask,
+                                                   const int32_t* const predecessors,
+                                                   const int32_t* const chain_terminators,
+                                                   int32_t* const anchor_chains,
+                                                   int32_t* const anchor_chain_starts,
+                                                   const int32_t num_overlaps,
+                                                   const bool check_mask);
+
+/// \brief Fill a 1D unrolled ragged array with the anchors used to produce each overlap.
+///
+/// \param overlaps An array of overlaps.
+/// \param anchors The array of anchors used to generate overlaps.
+/// \param chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array.
+/// \param chain_lengths An array which holds the length of each run of anchors, corresponding to the chain_starts array.
+/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold the indices of anchors within each chain.
+/// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array.
+/// \param num_overlaps The number of overlaps in the overlaps array.
+/// Launch configuration: any number of blocks/threads, no dynamic shared memory, cuda_stream must be the same in which anchors/overlaps/scores/mask/predecessors/chains/starts were allocated.
+/// example: backtrace_anchors_to_overlaps<<<2048, 32, 0, cuda_stream>>>(...)
+__global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
+                                             const Anchor* const anchors,
+                                             const int32_t* const chain_starts,
+                                             const int32_t* const chain_lengths,
+                                             int32_t* const anchor_chains,
+                                             int32_t* const anchor_chain_starts,
+                                             const uint32_t num_overlaps);
+
+} // namespace chainerutils
+} // namespace cudamapper
+} // namespace genomeworks
+} // namespace claraparabricks
diff --git a/cudamapper/src/overlapper_triggered.cu b/cudamapper/src/overlapper_triggered.cu
@@ -24,11 +24,11 @@
 
 #include <claraparabricks/genomeworks/utils/cudautils.hpp>
 
-#ifndef NDEBUG // only needed to check if input is sorted in assert
 #include <algorithm>
+
+#ifndef NDEBUG
 #include <thrust/host_vector.h>
 #endif
-
 namespace claraparabricks
 {
 

diff --git a/cudamapper/tests/CMakeLists.txt b/cudamapper/tests/CMakeLists.txt
@@ -29,6 +29,7 @@ set(SOURCES
     Test_CudamapperOverlapper.cpp
     Test_CudamapperOverlapperTriggered.cu
     Test_CudamapperUtilsKmerFunctions.cpp
+    Test_CudamapperChainerUtils.cu
    )
 
 get_property(cudamapper_data_include_dir GLOBAL PROPERTY cudamapper_data_include_dir)