Merge pull request #4306 from haritha-j/knnFix
[gpu] Replace volatile shared memory with shfl_sync in KNNSearch
SergioRAgostinho committed Aug 11, 2020
2 parents 0de7b33 + 66fa90b commit c0c0cb2
gpu/octree/src/cuda/knn_search.cu (79 additions, 131 deletions)
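
The heart of the change is swapping one warp-broadcast idiom for another. Below is a minimal sketch of the two idioms (illustrative only, not code from the commit; the helper names are made up), assuming a full 32-lane warp:

// Before: stage the value through volatile shared memory. This needs a
// per-warp buffer and leans on implicit warp-synchronous execution, which
// independent thread scheduling (Volta and later) no longer guarantees
// without explicit synchronization.
__device__ int broadcast_via_shared(volatile int* warp_buffer, int value,
                                    unsigned src_lane, unsigned lane_id)
{
  if (lane_id == src_lane)
    *warp_buffer = value;  // the source lane publishes its value
  return *warp_buffer;     // every lane reads it back
}

// After: exchange the value register-to-register. __shfl_sync returns
// value as held by src_lane to every lane named in the mask, with the
// synchronization made explicit by the mask; no shared memory is involved.
__device__ int broadcast_via_shfl(int value, unsigned src_lane)
{
  return __shfl_sync(0xFFFFFFFF, value, src_lane);
}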
@@ -43,6 +43,7 @@
 #include "utils/boxutils.hpp"
 #include "utils/bitonic_sort.hpp"
 #include "octree_iterator.hpp"
+#include <tuple>
 
 namespace pcl { namespace device { namespace knn_search
 {
@@ -94,7 +95,7 @@ namespace pcl { namespace device { namespace knn_search
 
   OctreeIterator iterator;
 
-  __device__ __forceinline__ Warp_knnSearch(const Batch& batch_arg, int query_index_arg)
+  __device__ __forceinline__ Warp_knnSearch(const Batch& batch_arg, const int query_index_arg)
     : batch(batch_arg), query_index(query_index_arg), min_distance(std::numeric_limits<float>::max()), min_idx(0), iterator(batch.octree) { }
 
   __device__ __forceinline__ void launch(bool active)
@@ -130,8 +131,8 @@
 
   __device__ __forceinline__ int examineNode(OctreeIterator& iterator)
   {
-    int node_idx = *iterator;
-    int code = batch.octree.codes[node_idx];
+    const int node_idx = *iterator;
+    const int code = batch.octree.codes[node_idx];
 
     float3 node_minp = batch.octree.minp;
     float3 node_maxp = batch.octree.maxp;
@@ -145,9 +146,9 @@
     }
 
     //need to go to next level
-    int node = batch.octree.nodes[node_idx];
-    int children_mask = node & 0xFF;
-    bool isLeaf = children_mask == 0;
+    const int node = batch.octree.nodes[node_idx];
+    const int children_mask = node & 0xFF;
+    const bool isLeaf = children_mask == 0;
 
     if (isLeaf)
     {
@@ -156,160 +157,107 @@
     }
 
     //goto next level
-    int first = node >> 8;
-    int len = __popc(children_mask);
+    const int first = node >> 8;
+    const int len = __popc(children_mask);
     iterator.gotoNextLevel(first, len);
     return -1;
   };
 
-  __device__ __forceinline__ void processLeaf(int node_idx)
+  __device__ __forceinline__ void processLeaf(const int node_idx)
   {
     int mask = __ballot_sync(0xFFFFFFFF, node_idx != -1);
 
-    unsigned int laneId = Warp::laneId();
-    unsigned int warpId = Warp::id();
+    const unsigned int laneId = Warp::laneId();
 
-    __shared__ volatile int per_warp_buffer[KernelPolicy::WARPS_COUNT];
-
     while(mask)
     {
-      int active_lane = __ffs(mask) - 1; //[0..31]
+      const int active_lane = __ffs(mask) - 1; //[0..31]
       mask &= ~(1 << active_lane);
 
-      volatile int* warp_buffer = &per_warp_buffer[warpId];
-
-      //broadcast beg
+      //broadcast beg and end
+      int fbeg, fend;
       if (active_lane == laneId)
-        *warp_buffer = batch.octree.begs[node_idx];
-      int beg = *warp_buffer;
-
-      //broadcast end
-      if (active_lane == laneId)
-        *warp_buffer = batch.octree.ends[node_idx];
-      int end = *warp_buffer;
-
-      float3 active_query;
-      volatile float* warp_buffer_float = (float*)&per_warp_buffer[warpId];
+      {
+        fbeg = batch.octree.begs[node_idx];
+        fend = batch.octree.ends[node_idx];
+      }
+      const int beg = __shfl_sync(0xFFFFFFFF, fbeg, active_lane);
+      const int end = __shfl_sync(0xFFFFFFFF, fend, active_lane);
 
       //broadcast warp_query
-      if (active_lane == laneId)
-        *warp_buffer_float = query.x;
-      active_query.x = *warp_buffer_float;
-
-      if (active_lane == laneId)
-        *warp_buffer_float = query.y;
-      active_query.y = *warp_buffer_float;
-
-      if (active_lane == laneId)
-        *warp_buffer_float = query.z;
-      active_query.z = *warp_buffer_float;
-
-      //broadcast query_index
-      if (active_lane == laneId)
-        *warp_buffer = query_index;
-      float active_query_index = *warp_buffer;
-
-      float dist;
-      int offset = NearestWarpKernel<KernelPolicy::CTA_SIZE>(batch.points + beg, batch.points_step, end - beg, active_query, dist);
+      const float3 active_query = make_float3(
+        __shfl_sync(0xFFFFFFFF, query.x, active_lane),
+        __shfl_sync(0xFFFFFFFF, query.y, active_lane),
+        __shfl_sync(0xFFFFFFFF, query.z, active_lane)
+      );
+
+      const auto nearestPoint = NearestWarpKernel<KernelPolicy::CTA_SIZE>(beg, batch.points_step, end - beg, active_query);
 
       if (active_lane == laneId)
-        if (min_distance > dist)
+        if (min_distance > nearestPoint.second)
         {
-          min_distance = dist;
-          min_idx = beg + offset;
+          min_distance = nearestPoint.second;
+          min_idx = beg + nearestPoint.first;
         }
     }
   }
 
-  template<int CTA_SIZE>
-  __device__ __forceinline__ int NearestWarpKernel(const float* points, int points_step, int length, const float3& active_query, float& dist)
-  {
-    __shared__ volatile float dist2[CTA_SIZE];
-    __shared__ volatile int index[CTA_SIZE];
-
-    int tid = threadIdx.x;
-    dist2[tid] = std::numeric_limits<float>::max();
-
-    //serial step
-    for (int idx = Warp::laneId(); idx < length; idx += Warp::STRIDE)
-    {
-      float dx = points[idx                  ] - active_query.x;
-      float dy = points[idx + points_step    ] - active_query.y;
-      float dz = points[idx + points_step * 2] - active_query.z;
-
-      float d2 = dx * dx + dy * dy + dz * dz;
-
-      if (dist2[tid] > d2)
-      {
-        dist2[tid] = d2;
-        index[tid] = idx;
-      }
-    }
-    //parallel step
-    unsigned int lane = Warp::laneId();
-
-    float mind2 = dist2[tid];
-
-    if (lane < 16)
-    {
-      float next = dist2[tid + 16];
-      if (mind2 > next)
-      {
-        dist2[tid] = mind2 = next;
-        index[tid] = index[tid + 16];
-      }
-    }
-
-    if (lane < 8)
-    {
-      float next = dist2[tid + 8];
-      if (mind2 > next)
-      {
-        dist2[tid] = mind2 = next;
-        index[tid] = index[tid + 8];
-      }
-    }
-
-    if (lane < 4)
-    {
-      float next = dist2[tid + 4];
-      if (mind2 > next)
-      {
-        dist2[tid] = mind2 = next;
-        index[tid] = index[tid + 4];
-      }
-    }
-
-    if (lane < 2)
-    {
-      float next = dist2[tid + 2];
-      if (mind2 > next)
-      {
-        dist2[tid] = mind2 = next;
-        index[tid] = index[tid + 2];
-      }
-    }
-
-    if (lane < 1)
-    {
-      float next = dist2[tid + 1];
-      if (mind2 > next)
-      {
-        dist2[tid] = mind2 = next;
-        index[tid] = index[tid + 1];
-      }
-    }
-
-    dist = sqrt(dist2[tid - lane]);
-    return index[tid - lane];
-  }
+  template <int CTA_SIZE>
+  __device__ std::pair<int, float>
+  NearestWarpKernel(const int beg,
+                    const int field_step,
+                    const int length,
+                    const float3& active_query)
+  {
+    int index = 0;
+    float dist2 = std::numeric_limits<float>::max();
+
+    // serial step
+    for (int idx = Warp::laneId(); idx < length; idx += Warp::STRIDE) {
+      const float dx = batch.points[beg + idx] - active_query.x;
+      const float dy = batch.points[beg + idx + field_step] - active_query.y;
+      const float dz = batch.points[beg + idx + field_step * 2] - active_query.z;
+
+      const float d2 = dx * dx + dy * dy + dz * dz;
+
+      if (dist2 > d2) {
+        dist2 = d2;
+        index = idx;
+      }
+    }
+
+    // find minimum distance among warp threads
+    constexpr unsigned FULL_MASK = 0xFFFFFFFF;
+    static_assert(KernelPolicy::WARP_SIZE <= 8 * sizeof(unsigned int), "WARP_SIZE exceeds size of bit_offset.");
+
+    for (unsigned int bit_offset = KernelPolicy::WARP_SIZE / 2; bit_offset > 0;
+         bit_offset /= 2) {
+      const float next = __shfl_down_sync(FULL_MASK, dist2, bit_offset);
+      const int next_index = __shfl_down_sync(FULL_MASK, index, bit_offset);
+
+      if (dist2 > next) {
+        dist2 = next;
+        index = next_index;
+      }
+    }
+
+    // retrieve index and distance
+    index = __shfl_sync(FULL_MASK, index, 0);
+    const float dist = sqrt(__shfl_sync(FULL_MASK, dist2, 0));
+
+    return std::make_pair(index, dist);
+  }
 };
 
 __global__ void KernelKNN(const Batch batch)
 {
-  int query_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const int query_index = blockIdx.x * blockDim.x + threadIdx.x;
 
-  bool active = query_index < batch.queries_num;
+  const bool active = query_index < batch.queries_num;
 
   if (__all_sync(0xFFFFFFFF, active == false))
     return;
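
A self-contained sketch of the warp-cooperative argmin that the new NearestWarpKernel performs (illustrative only, not code from the commit; the kernel name, the SoA point layout with stride n, and the one-warp launch are assumptions):

#include <cfloat>

// Finds the index of the point nearest to q among n points stored as
// x[0..n) y[0..n) z[0..n), mirroring the field_step indexing above.
// Launch with a single 32-thread warp, e.g. nearest_point_warp<<<1, 32>>>.
__global__ void nearest_point_warp(const float* pts, int n, float3 q, int* out_idx)
{
  const unsigned lane = threadIdx.x & 31;

  int best_i = 0;
  float best_d2 = FLT_MAX;

  // serial step: each lane scans a strided slice of the points
  for (int i = lane; i < n; i += 32) {
    const float dx = pts[i] - q.x;
    const float dy = pts[i + n] - q.y;
    const float dz = pts[i + 2 * n] - q.z;
    const float d2 = dx * dx + dy * dy + dz * dz;
    if (best_d2 > d2) { best_d2 = d2; best_i = i; }
  }

  // parallel step: __shfl_down_sync halves the span each iteration,
  // carrying the candidate index alongside its squared distance, so
  // lane 0 ends up holding the warp-wide minimum
  for (unsigned off = 16; off > 0; off /= 2) {
    const float d = __shfl_down_sync(0xFFFFFFFF, best_d2, off);
    const int   i = __shfl_down_sync(0xFFFFFFFF, best_i, off);
    if (best_d2 > d) { best_d2 = d; best_i = i; }
  }

  if (lane == 0)
    *out_idx = best_i;
}

Compared with the deleted version, nothing here touches shared memory: each lane's running best lives in registers and the reduction exchanges registers directly, which is what lets the commit drop the two CTA_SIZE-sized volatile arrays per block.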
Expand Down
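
One idiom from KernelKNN worth spelling out: the early return is taken per warp, not per thread. A minimal sketch under assumed names, not code from the commit:

// __all_sync returns non-zero only when the predicate holds on every
// lane in the mask, so a warp retires only if all 32 of its query
// indices are out of range. Warps with at least one valid query keep
// every lane resident, which the later full-mask __shfl_sync and
// __ballot_sync calls rely on.
__global__ void warp_early_exit(const float* in, float* out, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  const bool active = i < n;

  if (__all_sync(0xFFFFFFFF, active == false))
    return;  // the whole warp is past the end of the input

  if (active)  // per-lane guard for the actual memory access
    out[i] = in[i];
}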
