Skip to content

Commit

Permalink
[Update] Information about CUDA debugging and profiling was added.
Browse files Browse the repository at this point in the history
  • Loading branch information
sangwook236 committed Feb 4, 2025
1 parent ec6e342 commit 0ffbe94
Show file tree
Hide file tree
Showing 3 changed files with 161 additions and 110 deletions.
26 changes: 23 additions & 3 deletions sw_dev/cpp/ext/doc/high_performance_computing/cuda_usage_guide.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,16 @@

- Documentation.
https://docs.nvidia.com/cuda/

https://docs.nvidia.com/cuda/cuda-c-programming-guide/
https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/

https://docs.nvidia.com/cuda/hopper-compatibility-guide/
https://docs.nvidia.com/cuda/ada-compatibility-guide/
https://docs.nvidia.com/cuda/blackwell-compatibility-guide/
https://docs.nvidia.com/cuda/hopper-tuning-guide/
https://docs.nvidia.com/cuda/ada-tuning-guide/
https://docs.nvidia.com/cuda/blackwell-tuning-guide/

https://docs.nvidia.com/cuda/cuda-runtime-api/
https://docs.nvidia.com/cuda/cuda-driver-api/
Expand All @@ -21,9 +30,7 @@
https://nvidia.github.io/cccl/thrust/

https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/
https://docs.nvidia.com/nsight-compute/
https://docs.nvidia.com/nsight-visual-studio-edition/
https://docs.nvidia.com/cuda/profiler-users-guide/
https://docs.nvidia.com/cuda/cuda-binary-utilities/

- Tutorial.
http://www.vizworld.com/2009/06/isc-2009-cuda-tutorial-from-nvidia/
Expand Down Expand Up @@ -145,6 +152,19 @@
[-] Usage (TensorRT, TRTorch, TF-TRT).
Refer to ./tensorrt_usage_guide.txt

[-] Debugging.
https://docs.nvidia.com/cuda/cuda-gdb/
https://docs.nvidia.com/compute-sanitizer/
https://docs.nvidia.com/nsight-visual-studio-edition/

[-] Profiling.
https://docs.nvidia.com/cuda/profiler-users-guide/

- Nsight.
https://docs.nvidia.com/nsight-systems/
https://docs.nvidia.com/nsight-systems/UserGuide/
https://docs.nvidia.com/nsight-compute/

[-] Installation.
- Log.
/var/log/cuda-installer.log
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
//#include <cutil.h> // CUDA utility tools


#if defined(__CUDACC__) // Defined only in .cu files
Expand Down Expand Up @@ -478,109 +477,14 @@ void basic_operation()
{
cudaError_t cudaStatus = cudaSuccess;

// Error handling
{
std::cout << "Error: " << cudaErrorNoDevice << std::endl;
std::cout << "Error name: " << cudaGetErrorName(cudaErrorNoDevice) << std::endl;
std::cout << "Error string: " << cudaGetErrorString(cudaErrorNoDevice) << std::endl;

const auto lastErr = cudaGetLastError();
//const auto lastErr = cudaPeekAtLastError();
std::cout << "Last error: " << lastErr << std::endl;
std::cout << "Last error name: " << cudaGetErrorName(lastErr) << std::endl;
std::cout << "Last error string: " << cudaGetErrorString(lastErr) << std::endl;
}

//-----
// Device
{
int device_count = -1;
cudaStatus = cudaGetDeviceCount(&device_count);
if (cudaSuccess != cudaStatus)
{
std::cerr << "cudaGetDeviceCount() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
return;
}
std::cout << "#devices = " << device_count << std::endl;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaSuccess != cudaStatus)
{
std::cerr << "cudaSetDevice() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
return;
}

int device = -1;
cudaStatus = cudaGetDevice(&device);
if (cudaSuccess != cudaStatus)
{
std::cerr << "cudaGetDevice() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
return;
}
std::cout << "Device ID = " << device << std::endl;

cudaDeviceProp prop;
cudaStatus = cudaGetDeviceProperties(&prop, device);
{
std::cerr << "cudaGetDeviceProperties() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
return;
}
std::cout << "Device properties:" << std::endl;
std::cout << "\tName: " << prop.name << std::endl;
std::cout << "\tTotal global memory = " << prop.totalGlobalMem << std::endl;
std::cout << "\tShared memory per block = " << prop.sharedMemPerBlock << std::endl;
std::cout << "\t#registers(32bits) per block = " << prop.regsPerBlock << std::endl;
std::cout << "\tWarp size = " << prop.warpSize << std::endl;
std::cout << "\tmax threads per block = " << prop.maxThreadsPerBlock << std::endl;
std::cout << "\tmax threads dimension = " << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2] << std::endl;
std::cout << "\tmax gride size = " << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", " << prop.maxGridSize[2] << std::endl;
std::cout << "\tMajor version = " << prop.major << std::endl;
std::cout << "\tMinor version = " << prop.minor << std::endl;
std::cout << "\tClock rate = " << prop.clockRate << std::endl;
std::cout << "\t#SMs = " << prop.multiProcessorCount << std::endl; // #SMs
std::cout << "\tCan map host memory = " << prop.canMapHostMemory << std::endl;
std::cout << "\tCompute mode = " << prop.computeMode << std::endl;
std::cout << "\tConcurrent kernels = " << prop.concurrentKernels << std::endl;
}

//-----
// Timer
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaSuccess != cudaStatus)
{
#if 0
unsigned int timer;
cutCreateTimer(&timer);
cutStartTimer(timer)

//kernel<<65535, 512>>(...);
//cudaDeviceSynchronize();

cutStopTimer(timer)
const double elapsed_time = cutGetTimerValue(timer);
std::cout << "Elapsed time = " << elapsed_time << " msec." << std::endl;
#else
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);

//kernel<<65535, 512>>(...);
//cudaDeviceSynchronize();

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsed_time;
cudaEventElapsedTime(&elapsed_time, start, stop);
//cudaEventElapsedTime_v2(&elapsed_time, start, stop);
std::cout << "Elapsed time = " << elapsed_time << " msec." << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);
#endif
std::cerr << "cudaSetDevice() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
return;
}

//-----
local::access_device_variables();

//local::simple_example_1();
Expand Down
139 changes: 133 additions & 6 deletions sw_dev/cpp/ext/test/high_performance_computing/cuda/cuda_main.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
//#include <cutil.h> // CUDA utility tools


#if defined(__CUDACC__) // Defined only in .cu files
Expand All @@ -12,6 +13,136 @@
namespace {
namespace local {

// Demonstrate basic CUDA runtime functionality:
//	- Runtime/driver version query.
//	- Error-code inspection (cudaGetErrorName/cudaGetErrorString, cudaGetLastError).
//	- Event-based GPU timing (cudaEvent_t).
//	- Device enumeration and property query.
// Prints results to stdout; reports any CUDA API failure to stderr and returns early.
// NOTE(review): the timing and device sections require a CUDA-capable device — without
// one the CUDA calls fail and are reported, but the function still returns normally.
void basic_functionality()
{
	// Version management
	{
		int runtimeVersion = 0;
		cudaRuntimeGetVersion(&runtimeVersion);
		int driverVersion = 0;
		cudaDriverGetVersion(&driverVersion);
		std::cout << "CUDA runtime version = " << runtimeVersion << ", CUDA driver version = " << driverVersion << std::endl;
	}

	// Error handling
	{
		std::cout << "Error handling:" << std::endl;

		// Inspect a known error code: numeric value, symbolic name, and human-readable string.
		std::cout << "\tError: " << cudaErrorNoDevice << std::endl;
		std::cout << "\tError name: " << cudaGetErrorName(cudaErrorNoDevice) << std::endl;
		std::cout << "\tError string: " << cudaGetErrorString(cudaErrorNoDevice) << std::endl;

		// cudaGetLastError() returns and *clears* the sticky error state;
		// cudaPeekAtLastError() returns it without clearing.
		const auto lastErr = cudaGetLastError();
		//const auto lastErr = cudaPeekAtLastError();
		std::cout << "\tLast error: " << lastErr << std::endl;
		std::cout << "\tLast error name: " << cudaGetErrorName(lastErr) << std::endl;
		std::cout << "\tLast error string: " << cudaGetErrorString(lastErr) << std::endl;
	}

	// Event management
	{
		std::cout << "Event management:" << std::endl;

		// Timer
#if 0
		// Legacy CUTIL timer API (cutil.h was removed from modern CUDA toolkits).
		unsigned int timer;
		cutCreateTimer(&timer);
		cutStartTimer(timer)

		//kernel<<65535, 512>>(...);
		//cudaDeviceSynchronize();

		cutStopTimer(timer)
		const double elapsed_time = cutGetTimerValue(timer);
		std::cout << "\tElapsed time = " << elapsed_time << " msec." << std::endl;
#else
		cudaEvent_t start, stop;
		cudaEventCreate(&start);
		cudaEventCreate(&stop);

		cudaEventRecord(start, 0);

		//kernel<<65535, 512>>(...);
		//cudaDeviceSynchronize();

		cudaEventRecord(stop, 0);
		// Block the host until the stop event has completed so the elapsed time is valid.
		cudaEventSynchronize(stop);
		// FIX: initialize and check the return status — on failure (e.g. no CUDA device,
		// events never recorded) the old code printed an uninitialized float.
		float elapsed_time = 0.0f;
		const auto timerStatus = cudaEventElapsedTime(&elapsed_time, start, stop);
		//const auto timerStatus = cudaEventElapsedTime_v2(&elapsed_time, start, stop);
		if (cudaSuccess != timerStatus)
			std::cerr << "cudaEventElapsedTime() failed: " << cudaGetErrorString(timerStatus) << std::endl;
		else
			std::cout << "\tElapsed time = " << elapsed_time << " msec." << std::endl;

		cudaEventDestroy(start);
		cudaEventDestroy(stop);
#endif
	}

	// Device management
	{
		int device_count = -1;
		auto cudaStatus = cudaGetDeviceCount(&device_count);
		if (cudaSuccess != cudaStatus)
		{
			std::cerr << "cudaGetDeviceCount() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
			return;
		}
		std::cout << "#devices = " << device_count << std::endl;

#if 0
		// Choose which GPU to run on, change this on a multi-GPU system.
		cudaStatus = cudaSetDevice(0);
		if (cudaSuccess != cudaStatus)
		{
			std::cerr << "cudaSetDevice() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
			return;
		}
#endif

		// The device on which the active host thread executes the device code
		int device = -1;
		cudaStatus = cudaGetDevice(&device);
		if (cudaSuccess != cudaStatus)
		{
			std::cerr << "cudaGetDevice() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
			return;
		}
		std::cout << "Current device = " << device << std::endl;

		cudaDeviceProp prop;
		cudaStatus = cudaGetDeviceProperties(&prop, device);
		if (cudaSuccess != cudaStatus)
		{
			std::cerr << "cudaGetDeviceProperties() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
			return;
		}
		std::cout << "Device properties:" << std::endl;
		std::cout << "\tName: " << prop.name << std::endl;
		std::cout << "\tTotal global memory = " << prop.totalGlobalMem << std::endl;
		std::cout << "\tShared memory per block = " << prop.sharedMemPerBlock << std::endl;
		std::cout << "\t#registers(32bits) per block = " << prop.regsPerBlock << std::endl;
		std::cout << "\tWarp size = " << prop.warpSize << std::endl;
		std::cout << "\tMax threads per block = " << prop.maxThreadsPerBlock << std::endl;
		std::cout << "\tMax threads dimension = " << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2] << std::endl;
		// FIX: corrected typo "gride" -> "grid" in the output label.
		std::cout << "\tMax grid size = " << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", " << prop.maxGridSize[2] << std::endl;
		std::cout << "\tMajor version = " << prop.major << std::endl;
		std::cout << "\tMinor version = " << prop.minor << std::endl;
		std::cout << "\tClock rate = " << prop.clockRate << std::endl;
		std::cout << "\t#SMs = " << prop.multiProcessorCount << std::endl;  // #SMs
		std::cout << "\tCan map host memory? = " << prop.canMapHostMemory << std::endl;
		std::cout << "\tCompute mode = " << prop.computeMode << std::endl;
		std::cout << "\tConcurrent kernels = " << prop.concurrentKernels << std::endl;

#if 0
		// cudaDeviceReset() must be called before exiting in order for profiling and tracing tools such as Nsight and Visual Profiler to show complete traces.
		cudaStatus = cudaDeviceReset();
		if (cudaSuccess != cudaStatus)
		{
			std::cerr << "cudaDeviceReset() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
			return;
		}
#endif
	}
}

} // namespace local
} // unnamed namespace

Expand All @@ -24,11 +155,7 @@ void texture_test();

int cuda_main(int argc, char *argv[])
{
int runtimeVersion = 0;
cudaRuntimeGetVersion(&runtimeVersion);
int driverVersion = 0;
cudaDriverGetVersion(&driverVersion);
std::cout << "CUDA runtime version = " << runtimeVersion << ", CUDA driver version = " << driverVersion << std::endl;
local::basic_functionality();

//-----
my_cuda::basic_operation();
Expand Down

0 comments on commit 0ffbe94

Please sign in to comment.