Skip to content

Commit

Permalink
[Update] Information about CUDA debugging and profiling was added.
Browse files Browse the repository at this point in the history
  • Loading branch information
sangwook236 committed Feb 4, 2025
1 parent ec6e342 commit 0ffbe94
Show file tree
Hide file tree
Showing 3 changed files with 161 additions and 110 deletions.
26 changes: 23 additions & 3 deletions sw_dev/cpp/ext/doc/high_performance_computing/cuda_usage_guide.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,16 @@

- Documentation.
https://docs.nvidia.com/cuda/

https://docs.nvidia.com/cuda/cuda-c-programming-guide/
https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/

https://docs.nvidia.com/cuda/hopper-compatibility-guide/
https://docs.nvidia.com/cuda/ada-compatibility-guide/
https://docs.nvidia.com/cuda/blackwell-compatibility-guide/
https://docs.nvidia.com/cuda/hopper-tuning-guide/
https://docs.nvidia.com/cuda/ada-tuning-guide/
https://docs.nvidia.com/cuda/blackwell-tuning-guide/

https://docs.nvidia.com/cuda/cuda-runtime-api/
https://docs.nvidia.com/cuda/cuda-driver-api/
Expand All @@ -21,9 +30,7 @@
https://nvidia.github.io/cccl/thrust/

https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/
https://docs.nvidia.com/nsight-compute/
https://docs.nvidia.com/nsight-visual-studio-edition/
https://docs.nvidia.com/cuda/profiler-users-guide/
https://docs.nvidia.com/cuda/cuda-binary-utilities/

- Tutorial.
http://www.vizworld.com/2009/06/isc-2009-cuda-tutorial-from-nvidia/
Expand Down Expand Up @@ -145,6 +152,19 @@
[-] Usage (TensorRT, TRTorch, TF-TRT).
Refer to ./tensorrt_usage_guide.txt

[-] Debugging.
https://docs.nvidia.com/cuda/cuda-gdb/
https://docs.nvidia.com/compute-sanitizer/
https://docs.nvidia.com/nsight-visual-studio-edition/

[-] Profiling.
https://docs.nvidia.com/cuda/profiler-users-guide/

- Nsight.
https://docs.nvidia.com/nsight-systems/
https://docs.nvidia.com/nsight-systems/UserGuide/
https://docs.nvidia.com/nsight-compute/

[-] Installation.
- Log.
/var/log/cuda-installer.log
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
//#include <cutil.h> // CUDA utility tools


#if defined(__CUDACC__) // Defined only in .cu files
Expand Down Expand Up @@ -478,109 +477,14 @@ void basic_operation()
{
cudaError_t cudaStatus = cudaSuccess;

// Error handling
{
std::cout << "Error: " << cudaErrorNoDevice << std::endl;
std::cout << "Error name: " << cudaGetErrorName(cudaErrorNoDevice) << std::endl;
std::cout << "Error string: " << cudaGetErrorString(cudaErrorNoDevice) << std::endl;

const auto lastErr = cudaGetLastError();
//const auto lastErr = cudaPeekAtLastError();
std::cout << "Last error: " << lastErr << std::endl;
std::cout << "Last error name: " << cudaGetErrorName(lastErr) << std::endl;
std::cout << "Last error string: " << cudaGetErrorString(lastErr) << std::endl;
}

//-----
// Device
{
int device_count = -1;
cudaStatus = cudaGetDeviceCount(&device_count);
if (cudaSuccess != cudaStatus)
{
std::cerr << "cudaGetDeviceCount() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
return;
}
std::cout << "#devices = " << device_count << std::endl;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaSuccess != cudaStatus)
{
std::cerr << "cudaSetDevice() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
return;
}

int device = -1;
cudaStatus = cudaGetDevice(&device);
if (cudaSuccess != cudaStatus)
{
std::cerr << "cudaGetDevice() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
return;
}
std::cout << "Device ID = " << device << std::endl;

cudaDeviceProp prop;
cudaStatus = cudaGetDeviceProperties(&prop, device);
{
std::cerr << "cudaGetDeviceProperties() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
return;
}
std::cout << "Device properties:" << std::endl;
std::cout << "\tName: " << prop.name << std::endl;
std::cout << "\tTotal global memory = " << prop.totalGlobalMem << std::endl;
std::cout << "\tShared memory per block = " << prop.sharedMemPerBlock << std::endl;
std::cout << "\t#registers(32bits) per block = " << prop.regsPerBlock << std::endl;
std::cout << "\tWarp size = " << prop.warpSize << std::endl;
std::cout << "\tmax threads per block = " << prop.maxThreadsPerBlock << std::endl;
std::cout << "\tmax threads dimension = " << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2] << std::endl;
std::cout << "\tmax gride size = " << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", " << prop.maxGridSize[2] << std::endl;
std::cout << "\tMajor version = " << prop.major << std::endl;
std::cout << "\tMinor version = " << prop.minor << std::endl;
std::cout << "\tClock rate = " << prop.clockRate << std::endl;
std::cout << "\t#SMs = " << prop.multiProcessorCount << std::endl; // #SMs
std::cout << "\tCan map host memory = " << prop.canMapHostMemory << std::endl;
std::cout << "\tCompute mode = " << prop.computeMode << std::endl;
std::cout << "\tConcurrent kernels = " << prop.concurrentKernels << std::endl;
}

//-----
// Timer
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaSuccess != cudaStatus)
{
#if 0
unsigned int timer;
cutCreateTimer(&timer);
cutStartTimer(timer)

//kernel<<65535, 512>>(...);
//cudaDeviceSynchronize();

cutStopTimer(timer)
const double elapsed_time = cutGetTimerValue(timer);
std::cout << "Elapsed time = " << elapsed_time << " msec." << std::endl;
#else
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);

//kernel<<65535, 512>>(...);
//cudaDeviceSynchronize();

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsed_time;
cudaEventElapsedTime(&elapsed_time, start, stop);
//cudaEventElapsedTime_v2(&elapsed_time, start, stop);
std::cout << "Elapsed time = " << elapsed_time << " msec." << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);
#endif
std::cerr << "cudaSetDevice() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
return;
}

//-----
local::access_device_variables();

//local::simple_example_1();
Expand Down
139 changes: 133 additions & 6 deletions sw_dev/cpp/ext/test/high_performance_computing/cuda/cuda_main.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
//#include <cutil.h> // CUDA utility tools


#if defined(__CUDACC__) // Defined only in .cu files
Expand All @@ -12,6 +13,136 @@
namespace {
namespace local {

// Demonstrate basic CUDA runtime functionality:
//	- Runtime/driver version query.
//	- Error-code inspection (cudaGetErrorName/cudaGetErrorString, cudaGetLastError).
//	- Event-based GPU timing (cudaEvent_t).
//	- Device enumeration and property query.
// Prints results to stdout; reports any CUDA API failure to stderr and returns early.
// NOTE(review): the timing and device sections require a CUDA-capable device — without
// one the CUDA calls fail and are reported, but the function still returns normally.
void basic_functionality()
{
	// Version management
	{
		int runtimeVersion = 0;
		cudaRuntimeGetVersion(&runtimeVersion);
		int driverVersion = 0;
		cudaDriverGetVersion(&driverVersion);
		std::cout << "CUDA runtime version = " << runtimeVersion << ", CUDA driver version = " << driverVersion << std::endl;
	}

	// Error handling
	{
		std::cout << "Error handling:" << std::endl;

		// Inspect a known error code: numeric value, symbolic name, and human-readable string.
		std::cout << "\tError: " << cudaErrorNoDevice << std::endl;
		std::cout << "\tError name: " << cudaGetErrorName(cudaErrorNoDevice) << std::endl;
		std::cout << "\tError string: " << cudaGetErrorString(cudaErrorNoDevice) << std::endl;

		// cudaGetLastError() returns and *clears* the sticky error state;
		// cudaPeekAtLastError() returns it without clearing.
		const auto lastErr = cudaGetLastError();
		//const auto lastErr = cudaPeekAtLastError();
		std::cout << "\tLast error: " << lastErr << std::endl;
		std::cout << "\tLast error name: " << cudaGetErrorName(lastErr) << std::endl;
		std::cout << "\tLast error string: " << cudaGetErrorString(lastErr) << std::endl;
	}

	// Event management
	{
		std::cout << "Event management:" << std::endl;

		// Timer
#if 0
		// Legacy CUTIL timer API (cutil.h was removed from modern CUDA toolkits).
		unsigned int timer;
		cutCreateTimer(&timer);
		cutStartTimer(timer)

		//kernel<<65535, 512>>(...);
		//cudaDeviceSynchronize();

		cutStopTimer(timer)
		const double elapsed_time = cutGetTimerValue(timer);
		std::cout << "\tElapsed time = " << elapsed_time << " msec." << std::endl;
#else
		cudaEvent_t start, stop;
		cudaEventCreate(&start);
		cudaEventCreate(&stop);

		cudaEventRecord(start, 0);

		//kernel<<65535, 512>>(...);
		//cudaDeviceSynchronize();

		cudaEventRecord(stop, 0);
		// Block the host until the stop event has completed so the elapsed time is valid.
		cudaEventSynchronize(stop);
		// FIX: initialize and check the return status — on failure (e.g. no CUDA device,
		// events never recorded) the old code printed an uninitialized float.
		float elapsed_time = 0.0f;
		const auto timerStatus = cudaEventElapsedTime(&elapsed_time, start, stop);
		//const auto timerStatus = cudaEventElapsedTime_v2(&elapsed_time, start, stop);
		if (cudaSuccess != timerStatus)
			std::cerr << "cudaEventElapsedTime() failed: " << cudaGetErrorString(timerStatus) << std::endl;
		else
			std::cout << "\tElapsed time = " << elapsed_time << " msec." << std::endl;

		cudaEventDestroy(start);
		cudaEventDestroy(stop);
#endif
	}

	// Device management
	{
		int device_count = -1;
		auto cudaStatus = cudaGetDeviceCount(&device_count);
		if (cudaSuccess != cudaStatus)
		{
			std::cerr << "cudaGetDeviceCount() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
			return;
		}
		std::cout << "#devices = " << device_count << std::endl;

#if 0
		// Choose which GPU to run on, change this on a multi-GPU system.
		cudaStatus = cudaSetDevice(0);
		if (cudaSuccess != cudaStatus)
		{
			std::cerr << "cudaSetDevice() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
			return;
		}
#endif

		// The device on which the active host thread executes the device code
		int device = -1;
		cudaStatus = cudaGetDevice(&device);
		if (cudaSuccess != cudaStatus)
		{
			std::cerr << "cudaGetDevice() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
			return;
		}
		std::cout << "Current device = " << device << std::endl;

		cudaDeviceProp prop;
		cudaStatus = cudaGetDeviceProperties(&prop, device);
		if (cudaSuccess != cudaStatus)
		{
			std::cerr << "cudaGetDeviceProperties() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
			return;
		}
		std::cout << "Device properties:" << std::endl;
		std::cout << "\tName: " << prop.name << std::endl;
		std::cout << "\tTotal global memory = " << prop.totalGlobalMem << std::endl;
		std::cout << "\tShared memory per block = " << prop.sharedMemPerBlock << std::endl;
		std::cout << "\t#registers(32bits) per block = " << prop.regsPerBlock << std::endl;
		std::cout << "\tWarp size = " << prop.warpSize << std::endl;
		std::cout << "\tMax threads per block = " << prop.maxThreadsPerBlock << std::endl;
		std::cout << "\tMax threads dimension = " << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2] << std::endl;
		// FIX: corrected typo "gride" -> "grid" in the output label.
		std::cout << "\tMax grid size = " << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", " << prop.maxGridSize[2] << std::endl;
		std::cout << "\tMajor version = " << prop.major << std::endl;
		std::cout << "\tMinor version = " << prop.minor << std::endl;
		std::cout << "\tClock rate = " << prop.clockRate << std::endl;
		std::cout << "\t#SMs = " << prop.multiProcessorCount << std::endl;  // #SMs
		std::cout << "\tCan map host memory? = " << prop.canMapHostMemory << std::endl;
		std::cout << "\tCompute mode = " << prop.computeMode << std::endl;
		std::cout << "\tConcurrent kernels = " << prop.concurrentKernels << std::endl;

#if 0
		// cudaDeviceReset() must be called before exiting in order for profiling and tracing tools such as Nsight and Visual Profiler to show complete traces.
		cudaStatus = cudaDeviceReset();
		if (cudaSuccess != cudaStatus)
		{
			std::cerr << "cudaDeviceReset() failed: " << cudaGetErrorString(cudaStatus) << std::endl;
			return;
		}
#endif
	}
}

} // namespace local
} // unnamed namespace

Expand All @@ -24,11 +155,7 @@ void texture_test();

int cuda_main(int argc, char *argv[])
{
int runtimeVersion = 0;
cudaRuntimeGetVersion(&runtimeVersion);
int driverVersion = 0;
cudaDriverGetVersion(&driverVersion);
std::cout << "CUDA runtime version = " << runtimeVersion << ", CUDA driver version = " << driverVersion << std::endl;
local::basic_functionality();

//-----
my_cuda::basic_operation();
Expand Down

0 comments on commit 0ffbe94

Please sign in to comment.