diff --git a/cmake/pcl_find_cuda.cmake b/cmake/pcl_find_cuda.cmake
index f481e06e56a..837147135ad 100644
--- a/cmake/pcl_find_cuda.cmake
+++ b/cmake/pcl_find_cuda.cmake
@@ -6,7 +6,7 @@ if(MSVC)
 endif()
 
 set(CUDA_FIND_QUIETLY TRUE)
-find_package(CUDA)
+find_package(CUDA 7.0)
 
 if(CUDA_FOUND)
   message(STATUS "Found CUDA Toolkit v${CUDA_VERSION_STRING}")
@@ -50,16 +50,8 @@ if(CUDA_FOUND)
     set(__cuda_arch_bin "3.0 3.5 5.0 5.2 5.3 6.0 6.1 7.0")
   elseif(NOT ${CUDA_VERSION_STRING} VERSION_LESS "8.0")
     set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5 5.0 5.2 5.3 6.0 6.1")
-  elseif(NOT ${CUDA_VERSION_STRING} VERSION_LESS "6.5")
-    set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5 5.0 5.2")
-  elseif(NOT ${CUDA_VERSION_STRING} VERSION_LESS "6.0")
-    set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5 5.0")
-  elseif(NOT ${CUDA_VERSION_STRING} VERSION_LESS "5.0")
-    set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5")
-  elseif(${CUDA_VERSION_STRING} VERSION_GREATER "4.1")
-    set(__cuda_arch_bin "2.0 2.1(2.0) 3.0")
   else()
-    set(__cuda_arch_bin "2.0 2.1(2.0)")
+    set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5 5.0 5.2")
   endif()
 
   set(CUDA_ARCH_BIN ${__cuda_arch_bin} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
diff --git a/cuda/apps/src/kinect_segmentation_cuda.cpp b/cuda/apps/src/kinect_segmentation_cuda.cpp
index a698dd469de..7ecc17cfd10 100644
--- a/cuda/apps/src/kinect_segmentation_cuda.cpp
+++ b/cuda/apps/src/kinect_segmentation_cuda.cpp
@@ -225,7 +225,7 @@ class Segmentation
           normals = computeFastPointNormals<Storage> (data);
         else
           normals = computePointNormals<Storage> (data->points.begin (), data->points.end (), focallength, data, radius_cm / 100.0f, nr_neighbors);
-        cudaThreadSynchronize ();
+        cudaDeviceSynchronize ();
       }
 
       // retrieve normals as an image..
diff --git a/cuda/common/include/pcl/cuda/cutil.h b/cuda/common/include/pcl/cuda/cutil.h
index 1f69ebc72c8..eee7ab5e1a7 100644
--- a/cuda/common/include/pcl/cuda/cutil.h
+++ b/cuda/common/include/pcl/cuda/cutil.h
@@ -735,18 +735,6 @@ extern "C" {
     ////////////////////////////////////////////////////////////////////////////
     //! Macros
 
-#if CUDART_VERSION >= 4000
-#define CUT_DEVICE_SYNCHRONIZE( )   cudaDeviceSynchronize();
-#else
-#define CUT_DEVICE_SYNCHRONIZE( )   cudaThreadSynchronize();
-#endif
-
-#if CUDART_VERSION >= 4000
-#define CUT_DEVICE_RESET( )   cudaDeviceReset();
-#else
-#define CUT_DEVICE_RESET( )   cudaThreadExit();
-#endif
-
 // This is for the CUTIL bank checker
 #ifdef _DEBUG
     #if __DEVICE_EMULATION__
@@ -792,7 +780,7 @@ extern "C" {
 #  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);                                            \
 
 #  define CUDA_SAFE_THREAD_SYNC( ) {                                         \
-    cudaError err = CUT_DEVICE_SYNCHRONIZE();                                 \
+    cudaError err = cudaDeviceSynchronize();                                 \
     if ( cudaSuccess != err) {                                               \
         fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
                 __FILE__, __LINE__, cudaGetErrorString( err) );              \
@@ -822,7 +810,7 @@ extern "C" {
                 errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
         exit(EXIT_FAILURE);                                                  \
     }                                                                        \
-    err = CUT_DEVICE_SYNCHRONIZE();                                           \
+    err = cudaDeviceSynchronize();                                           \
     if( cudaSuccess != err) {                                                \
         fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                 errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
@@ -891,7 +879,7 @@ extern "C" {
                 errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
         exit(EXIT_FAILURE);                                                  \
     }                                                                        \
-    err = CUT_DEVICE_SYNCHRONIZE();                                           \
+    err = cudaDeviceSynchronize();                                           \
     if( cudaSuccess != err) {                                                \
         fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                 errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
@@ -906,7 +894,7 @@ extern "C" {
                 errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
         exit(EXIT_FAILURE);                                                  \
     }                                                                        \
-    err = CUT_DEVICE_SYNCHRONIZE();                                           \
+    err = cudaDeviceSynchronize();                                           \
     if( cudaSuccess != err) {                                                \
         fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
                 errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
diff --git a/cuda/common/include/pcl/cuda/cutil_inline_runtime.h b/cuda/common/include/pcl/cuda/cutil_inline_runtime.h
index fae1f817771..c177d2f5aa9 100644
--- a/cuda/common/include/pcl/cuda/cutil_inline_runtime.h
+++ b/cuda/common/include/pcl/cuda/cutil_inline_runtime.h
@@ -42,20 +42,12 @@
 
 inline cudaError cutilDeviceSynchronize()
 {
-#if CUDART_VERSION >= 4000
 	return cudaDeviceSynchronize();
-#else
-	return cudaThreadSynchronize();
-#endif
 }
 
 inline cudaError cutilDeviceReset()
 {
-#if CUDART_VERSION >= 4000
 	return cudaDeviceReset();
-#else
-	return cudaThreadExit();
-#endif
 }
 
 inline void __cutilCondition(int val, char *file, int line) 
diff --git a/cuda/common/include/pcl/cuda/time_gpu.h b/cuda/common/include/pcl/cuda/time_gpu.h
index 73deef9a1b3..4457029c699 100644
--- a/cuda/common/include/pcl/cuda/time_gpu.h
+++ b/cuda/common/include/pcl/cuda/time_gpu.h
@@ -83,7 +83,7 @@ namespace pcl
         {
           CUT_CHECK_ERROR ("dude");
           // Measure time needed to copy data
-          cutilSafeCall (cudaThreadSynchronize ());
+          cutilSafeCall (cudaDeviceSynchronize ());
           cutilSafeCall (cudaEventRecord (end_, 0));
           cutilSafeCall (cudaEventSynchronize (end_));
           cutilSafeCall (cudaEventElapsedTime (&elapsed_time_, start_, end_));
diff --git a/gpu/containers/src/initialization.cpp b/gpu/containers/src/initialization.cpp
index 653a83265f1..11f8f2cd781 100644
--- a/gpu/containers/src/initialization.cpp
+++ b/gpu/containers/src/initialization.cpp
@@ -169,7 +169,6 @@ void pcl::gpu::printCudaDeviceInfo(int device)
         printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, sm_cores, sm_cores * prop.multiProcessorCount);
         printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);
 
-#if (CUDART_VERSION >= 4000)
         // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
         int memoryClock, memBusWidth, L2CacheSize;
         getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );        
@@ -187,7 +186,6 @@ void pcl::gpu::printCudaDeviceInfo(int device)
         printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
             prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
             prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
-#endif
         printf("  Total amount of constant memory:               %u bytes\n", (int)prop.totalConstMem);
         printf("  Total amount of shared memory per block:       %u bytes\n", (int)prop.sharedMemPerBlock);
         printf("  Total number of registers available per block: %d\n", prop.regsPerBlock);
@@ -198,11 +196,7 @@ void pcl::gpu::printCudaDeviceInfo(int device)
         printf("  Maximum memory pitch:                          %u bytes\n", (int)prop.memPitch);
         printf("  Texture alignment:                             %u bytes\n", (int)prop.textureAlignment);
 
-#if CUDART_VERSION >= 4000
         printf("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
-#else
-        printf("  Concurrent copy and execution:                 %s\n", prop.deviceOverlap ? "Yes" : "No");
-#endif
         printf("  Run time limit on kernels:                     %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
         printf("  Integrated GPU sharing Host Memory:            %s\n", prop.integrated ? "Yes" : "No");
         printf("  Support host page-locked memory mapping:       %s\n", prop.canMapHostMemory ? "Yes" : "No");
@@ -211,10 +205,8 @@ void pcl::gpu::printCudaDeviceInfo(int device)
         printf("  Alignment requirement for Surfaces:            %s\n", prop.surfaceAlignment ? "Yes" : "No");
         printf("  Device has ECC support enabled:                %s\n", prop.ECCEnabled ? "Yes" : "No");
         printf("  Device is using TCC driver mode:               %s\n", prop.tccDriver ? "Yes" : "No");
-#if CUDART_VERSION >= 4000
         printf("  Device supports Unified Addressing (UVA):      %s\n", prop.unifiedAddressing ? "Yes" : "No");
         printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", prop.pciBusID, prop.pciDeviceID );
-#endif
         printf("  Compute Mode:\n");
         printf("      %s \n", computeMode[prop.computeMode]);
     }
diff --git a/gpu/people/CMakeLists.txt b/gpu/people/CMakeLists.txt
index 174ef99a82c..520e65fd30e 100644
--- a/gpu/people/CMakeLists.txt
+++ b/gpu/people/CMakeLists.txt
@@ -18,23 +18,19 @@ REMOVE_VTK_DEFINITIONS()
 
 #find NPP
 unset(CUDA_npp_LIBRARY CACHE)
-if(${CUDA_VERSION} VERSION_LESS "5.5")
-  find_cuda_helper_libs(npp)
+find_cuda_helper_libs(nppc)
+find_cuda_helper_libs(npps)
+if(${CUDA_VERSION} VERSION_GREATER_EQUAL "9.0")
+  find_cuda_helper_libs(nppim)
+  find_cuda_helper_libs(nppidei)
 else()
-  find_cuda_helper_libs(nppc)
-  find_cuda_helper_libs(npps)
-  if(${CUDA_VERSION} VERSION_GREATER_EQUAL "9.0")
-    find_cuda_helper_libs(nppim)
-    find_cuda_helper_libs(nppidei)
-  else()
-    find_cuda_helper_libs(nppi)
-  endif()
-
-  if(${CUDA_VERSION} VERSION_GREATER_EQUAL "9.0")
-    set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppim_LIBRARY} ${CUDA_nppidei_LIBRARY} ${CUDA_npps_LIBRARY} CACHE STRING "npp library")
-  else()
-    set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY} CACHE STRING "npp library")
-  endif()
+  find_cuda_helper_libs(nppi)
+endif()
+
+if(${CUDA_VERSION} VERSION_GREATER_EQUAL "9.0")
+  set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppim_LIBRARY} ${CUDA_nppidei_LIBRARY} ${CUDA_npps_LIBRARY} CACHE STRING "npp library")
+else()
+  set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY} CACHE STRING "npp library")
 endif()
 
 #Label_skeleton
diff --git a/gpu/people/src/cuda/multi_tree.cu b/gpu/people/src/cuda/multi_tree.cu
index e24c53b5180..8c26d3eadaa 100644
--- a/gpu/people/src/cuda/multi_tree.cu
+++ b/gpu/people/src/cuda/multi_tree.cu
@@ -164,7 +164,7 @@ namespace pcl
 
       KernelCUDA_runTree<<< grid, block >>>( focal, treeHeight, numNodes, nodes, leaves, labels);
       cudaSafeCall( cudaGetLastError() );
-      cudaSafeCall( cudaThreadSynchronize() );      
+      cudaSafeCall( cudaDeviceSynchronize() );      
     }
 
     void CUDA_runMultiTreePass ( int   FGThresh,
@@ -198,7 +198,7 @@ namespace pcl
       }
 
       cudaSafeCall( cudaGetLastError() );
-      cudaSafeCall( cudaThreadSynchronize() );      
+      cudaSafeCall( cudaDeviceSynchronize() );      
     }
 
     ///////////////////////////////////////////////////////////////////////////////////////
@@ -323,7 +323,7 @@ namespace pcl
       KernelCUDA_MultiTreeMerge<<< grid, block >>>( numTrees, labels );
 
       cudaSafeCall( cudaGetLastError() );
-      cudaSafeCall( cudaThreadSynchronize() );            
+      cudaSafeCall( cudaDeviceSynchronize() );            
     }
 
     /** \brief This will merge the votes from the different trees into one final vote, including probabilistic's */
@@ -348,7 +348,7 @@ namespace pcl
       KernelCUDA_MultiTreeCreateProb<<< grid, block >>>( numTrees, probabilities);
 
       cudaSafeCall( cudaGetLastError() );
-      cudaSafeCall( cudaThreadSynchronize() );            
+      cudaSafeCall( cudaDeviceSynchronize() );            
     }
   }
 }
diff --git a/gpu/people/src/cuda/prob.cu b/gpu/people/src/cuda/prob.cu
index 797fcaeec76..8d90c77b5ce 100644
--- a/gpu/people/src/cuda/prob.cu
+++ b/gpu/people/src/cuda/prob.cu
@@ -263,7 +263,7 @@ namespace pcl
       KernelCUDA_SelectLabel<<< grid, block >>>( labels, probabilities );
 
       cudaSafeCall( cudaGetLastError() );
-      cudaSafeCall( cudaThreadSynchronize() );
+      cudaSafeCall( cudaDeviceSynchronize() );
     }
 
     /** \brief This will combine two probabilities according their weight **/
@@ -282,7 +282,7 @@ namespace pcl
       KernelCUDA_CombineProb<<< grid, block >>>( probIn1, weight1, probIn2, weight2, probOut );
 
       cudaSafeCall( cudaGetLastError() );
-      cudaSafeCall( cudaThreadSynchronize() );
+      cudaSafeCall( cudaDeviceSynchronize() );
     }
 
     /** \brief This will combine two probabilities according their weight **/
@@ -299,7 +299,7 @@ namespace pcl
       KernelCUDA_WeightedSumProb<<< grid, block >>>( probIn, weight, probOut );
 
       cudaSafeCall( cudaGetLastError() );
-      cudaSafeCall( cudaThreadSynchronize() );
+      cudaSafeCall( cudaDeviceSynchronize() );
     }
 
     /** \brief This will blur the input labelprobability with the given kernel **/
@@ -336,12 +336,12 @@ namespace pcl
       KernelCUDA_GaussianBlurVer<<< grid, block >>>( probIn, kernel, kernel.size(), probTemp );
       //KernelCUDA_GaussianBlurVer<<< grid, block >>>( probIn, kernel, kernel.size(), probOut );
       cudaSafeCall( cudaGetLastError() );
-      cudaSafeCall( cudaThreadSynchronize() );
+      cudaSafeCall( cudaDeviceSynchronize() );
 
       // CUDA kernel call Horizontal
       KernelCUDA_GaussianBlurHor<<< grid, block >>>( probTemp, kernel, kernel.size(), probOut );
       cudaSafeCall( cudaGetLastError() );
-      cudaSafeCall( cudaThreadSynchronize() );
+      cudaSafeCall( cudaDeviceSynchronize() );
       return 1;
     }
   }
diff --git a/gpu/people/src/cuda/utils.cu b/gpu/people/src/cuda/utils.cu
index 33e3fc5674e..e692d7713df 100644
--- a/gpu/people/src/cuda/utils.cu
+++ b/gpu/people/src/cuda/utils.cu
@@ -54,7 +54,7 @@ void pcl::device::colorLMap(const Labels& labels, const DeviceArray<uchar4>& map
   colorKernel<<< grid, block >>>( labels, rgba );
 
   cudaSafeCall( cudaGetLastError() );
-  cudaSafeCall( cudaThreadSynchronize() );  
+  cudaSafeCall( cudaDeviceSynchronize() );  
 }
 
 void pcl::device::mixedColorMap(const Labels& labels, const DeviceArray<uchar4>& map, const Image& rgba, Image& output)
@@ -170,7 +170,7 @@ void pcl::device::prepareForeGroundDepth(const Depth& depth1, Mask& inverse_mask
   fgDepthKernel<<< grid, block >>>( depth1, inverse_mask, depth2 );
 
   cudaSafeCall( cudaGetLastError() );
-  cudaSafeCall( cudaThreadSynchronize() );
+  cudaSafeCall( cudaDeviceSynchronize() );
 }