diff --git a/cmake/pcl_find_cuda.cmake b/cmake/pcl_find_cuda.cmake index f481e06e56a..837147135ad 100644 --- a/cmake/pcl_find_cuda.cmake +++ b/cmake/pcl_find_cuda.cmake @@ -6,7 +6,7 @@ if(MSVC) endif() set(CUDA_FIND_QUIETLY TRUE) -find_package(CUDA) +find_package(CUDA 7.0) if(CUDA_FOUND) message(STATUS "Found CUDA Toolkit v${CUDA_VERSION_STRING}") @@ -50,16 +50,8 @@ if(CUDA_FOUND) set(__cuda_arch_bin "3.0 3.5 5.0 5.2 5.3 6.0 6.1 7.0") elseif(NOT ${CUDA_VERSION_STRING} VERSION_LESS "8.0") set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5 5.0 5.2 5.3 6.0 6.1") - elseif(NOT ${CUDA_VERSION_STRING} VERSION_LESS "6.5") - set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5 5.0 5.2") - elseif(NOT ${CUDA_VERSION_STRING} VERSION_LESS "6.0") - set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5 5.0") - elseif(NOT ${CUDA_VERSION_STRING} VERSION_LESS "5.0") - set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5") - elseif(${CUDA_VERSION_STRING} VERSION_GREATER "4.1") - set(__cuda_arch_bin "2.0 2.1(2.0) 3.0") else() - set(__cuda_arch_bin "2.0 2.1(2.0)") + set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5 5.0 5.2") endif() set(CUDA_ARCH_BIN ${__cuda_arch_bin} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") diff --git a/cuda/apps/src/kinect_segmentation_cuda.cpp b/cuda/apps/src/kinect_segmentation_cuda.cpp index a698dd469de..7ecc17cfd10 100644 --- a/cuda/apps/src/kinect_segmentation_cuda.cpp +++ b/cuda/apps/src/kinect_segmentation_cuda.cpp @@ -225,7 +225,7 @@ class Segmentation normals = computeFastPointNormals (data); else normals = computePointNormals (data->points.begin (), data->points.end (), focallength, data, radius_cm / 100.0f, nr_neighbors); - cudaThreadSynchronize (); + cudaDeviceSynchronize (); } // retrieve normals as an image.. diff --git a/cuda/common/include/pcl/cuda/cutil.h b/cuda/common/include/pcl/cuda/cutil.h index 1f69ebc72c8..eee7ab5e1a7 100644 --- a/cuda/common/include/pcl/cuda/cutil.h +++ b/cuda/common/include/pcl/cuda/cutil.h @@ -735,18 +735,6 @@ extern "C" { //////////////////////////////////////////////////////////////////////////// //! Macros -#if CUDART_VERSION >= 4000 -#define CUT_DEVICE_SYNCHRONIZE( ) cudaDeviceSynchronize(); -#else -#define CUT_DEVICE_SYNCHRONIZE( ) cudaThreadSynchronize(); -#endif - -#if CUDART_VERSION >= 4000 -#define CUT_DEVICE_RESET( ) cudaDeviceReset(); -#else -#define CUT_DEVICE_RESET( ) cudaThreadExit(); -#endif - // This is for the CUTIL bank checker #ifdef _DEBUG #if __DEVICE_EMULATION__ @@ -792,7 +780,7 @@ extern "C" { # define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call); \ # define CUDA_SAFE_THREAD_SYNC( ) { \ - cudaError err = CUT_DEVICE_SYNCHRONIZE(); \ + cudaError err = cudaDeviceSynchronize(); \ if ( cudaSuccess != err) { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ __FILE__, __LINE__, cudaGetErrorString( err) ); \ @@ -822,7 +810,7 @@ extern "C" { errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ exit(EXIT_FAILURE); \ } \ - err = CUT_DEVICE_SYNCHRONIZE(); \ + err = cudaDeviceSynchronize(); \ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ @@ -891,7 +879,7 @@ extern "C" { errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ exit(EXIT_FAILURE); \ } \ - err = CUT_DEVICE_SYNCHRONIZE(); \ + err = cudaDeviceSynchronize(); \ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ @@ -906,7 +894,7 @@ extern "C" { errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ exit(EXIT_FAILURE); \ } \ - err = CUT_DEVICE_SYNCHRONIZE(); \ + err = cudaDeviceSynchronize(); \ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ diff --git a/cuda/common/include/pcl/cuda/cutil_inline_runtime.h b/cuda/common/include/pcl/cuda/cutil_inline_runtime.h index fae1f817771..c177d2f5aa9 100644 --- a/cuda/common/include/pcl/cuda/cutil_inline_runtime.h +++ b/cuda/common/include/pcl/cuda/cutil_inline_runtime.h @@ -42,20 +42,12 @@ inline cudaError cutilDeviceSynchronize() { -#if CUDART_VERSION >= 4000 return cudaDeviceSynchronize(); -#else - return cudaThreadSynchronize(); -#endif } inline cudaError cutilDeviceReset() { -#if CUDART_VERSION >= 4000 return cudaDeviceReset(); -#else - return cudaThreadExit(); -#endif } inline void __cutilCondition(int val, char *file, int line) diff --git a/cuda/common/include/pcl/cuda/time_gpu.h b/cuda/common/include/pcl/cuda/time_gpu.h index 73deef9a1b3..4457029c699 100644 --- a/cuda/common/include/pcl/cuda/time_gpu.h +++ b/cuda/common/include/pcl/cuda/time_gpu.h @@ -83,7 +83,7 @@ namespace pcl { CUT_CHECK_ERROR ("dude"); // Measure time needed to copy data - cutilSafeCall (cudaThreadSynchronize ()); + cutilSafeCall (cudaDeviceSynchronize ()); cutilSafeCall (cudaEventRecord (end_, 0)); cutilSafeCall (cudaEventSynchronize (end_)); cutilSafeCall (cudaEventElapsedTime (&elapsed_time_, start_, end_)); diff --git a/gpu/containers/src/initialization.cpp b/gpu/containers/src/initialization.cpp index 653a83265f1..11f8f2cd781 100644 --- a/gpu/containers/src/initialization.cpp +++ b/gpu/containers/src/initialization.cpp @@ -169,7 +169,6 @@ void pcl::gpu::printCudaDeviceInfo(int device) printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, sm_cores, sm_cores * prop.multiProcessorCount); printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); -#if (CUDART_VERSION >= 4000) // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output int memoryClock, memBusWidth, L2CacheSize; getCudaAttribute( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); @@ -187,7 +186,6 @@ void pcl::gpu::printCudaDeviceInfo(int device) printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); -#endif printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); @@ -198,11 +196,7 @@ void pcl::gpu::printCudaDeviceInfo(int device) printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); -#if CUDART_VERSION >= 4000 printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); -#else - printf(" Concurrent copy and execution: %s\n", prop.deviceOverlap ? "Yes" : "No"); -#endif printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); @@ -211,10 +205,8 @@ void pcl::gpu::printCudaDeviceInfo(int device) printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No"); printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); -#if CUDART_VERSION >= 4000 printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); -#endif printf(" Compute Mode:\n"); printf(" %s \n", computeMode[prop.computeMode]); } diff --git a/gpu/people/CMakeLists.txt b/gpu/people/CMakeLists.txt index 174ef99a82c..520e65fd30e 100644 --- a/gpu/people/CMakeLists.txt +++ b/gpu/people/CMakeLists.txt @@ -18,23 +18,19 @@ REMOVE_VTK_DEFINITIONS() #find NPP unset(CUDA_npp_LIBRARY CACHE) -if(${CUDA_VERSION} VERSION_LESS "5.5") - find_cuda_helper_libs(npp) +find_cuda_helper_libs(nppc) +find_cuda_helper_libs(npps) +if(${CUDA_VERSION} VERSION_GREATER_EQUAL "9.0") + find_cuda_helper_libs(nppim) + find_cuda_helper_libs(nppidei) else() - find_cuda_helper_libs(nppc) - find_cuda_helper_libs(npps) - if(${CUDA_VERSION} VERSION_GREATER_EQUAL "9.0") - find_cuda_helper_libs(nppim) - find_cuda_helper_libs(nppidei) - else() - find_cuda_helper_libs(nppi) - endif() - - if(${CUDA_VERSION} VERSION_GREATER_EQUAL "9.0") - set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppim_LIBRARY} ${CUDA_nppidei_LIBRARY} ${CUDA_npps_LIBRARY} CACHE STRING "npp library") - else() - set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY} CACHE STRING "npp library") - endif() + find_cuda_helper_libs(nppi) +endif() + +if(${CUDA_VERSION} VERSION_GREATER_EQUAL "9.0") + set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppim_LIBRARY} ${CUDA_nppidei_LIBRARY} ${CUDA_npps_LIBRARY} CACHE STRING "npp library") +else() + set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY} CACHE STRING "npp library") endif() #Label_skeleton diff --git a/gpu/people/src/cuda/multi_tree.cu b/gpu/people/src/cuda/multi_tree.cu index e24c53b5180..8c26d3eadaa 100644 --- a/gpu/people/src/cuda/multi_tree.cu +++ b/gpu/people/src/cuda/multi_tree.cu @@ -164,7 +164,7 @@ namespace pcl KernelCUDA_runTree<<< grid, block >>>( focal, treeHeight, numNodes, nodes, leaves, labels); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } void CUDA_runMultiTreePass ( int FGThresh, @@ -198,7 +198,7 @@ namespace pcl } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } /////////////////////////////////////////////////////////////////////////////////////// @@ -323,7 +323,7 @@ namespace pcl KernelCUDA_MultiTreeMerge<<< grid, block >>>( numTrees, labels ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } /** \brief This will merge the votes from the different trees into one final vote, including probabilistic's */ @@ -348,7 +348,7 @@ namespace pcl KernelCUDA_MultiTreeCreateProb<<< grid, block >>>( numTrees, probabilities); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } } } diff --git a/gpu/people/src/cuda/prob.cu b/gpu/people/src/cuda/prob.cu index 797fcaeec76..8d90c77b5ce 100644 --- a/gpu/people/src/cuda/prob.cu +++ b/gpu/people/src/cuda/prob.cu @@ -263,7 +263,7 @@ namespace pcl KernelCUDA_SelectLabel<<< grid, block >>>( labels, probabilities ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } /** \brief This will combine two probabilities according their weight **/ @@ -282,7 +282,7 @@ namespace pcl KernelCUDA_CombineProb<<< grid, block >>>( probIn1, weight1, probIn2, weight2, probOut ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } /** \brief This will combine two probabilities according their weight **/ @@ -299,7 +299,7 @@ namespace pcl KernelCUDA_WeightedSumProb<<< grid, block >>>( probIn, weight, probOut ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } /** \brief This will blur the input labelprobability with the given kernel **/ @@ -336,12 +336,12 @@ namespace pcl KernelCUDA_GaussianBlurVer<<< grid, block >>>( probIn, kernel, kernel.size(), probTemp ); //KernelCUDA_GaussianBlurVer<<< grid, block >>>( probIn, kernel, kernel.size(), probOut ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); // CUDA kernel call Horizontal KernelCUDA_GaussianBlurHor<<< grid, block >>>( probTemp, kernel, kernel.size(), probOut ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); return 1; } } diff --git a/gpu/people/src/cuda/utils.cu b/gpu/people/src/cuda/utils.cu index 33e3fc5674e..e692d7713df 100644 --- a/gpu/people/src/cuda/utils.cu +++ b/gpu/people/src/cuda/utils.cu @@ -54,7 +54,7 @@ void pcl::device::colorLMap(const Labels& labels, const DeviceArray& map colorKernel<<< grid, block >>>( labels, rgba ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } void pcl::device::mixedColorMap(const Labels& labels, const DeviceArray& map, const Image& rgba, Image& output) @@ -170,7 +170,7 @@ void pcl::device::prepareForeGroundDepth(const Depth& depth1, Mask& inverse_mask fgDepthKernel<<< grid, block >>>( depth1, inverse_mask, depth2 ); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaThreadSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); }