Skip to content

Commit 8037901

Browse files
authored
Add code of occupancy computing on DCU and avoid threadID bug for DCU profiler (#44520)
1 parent fcfaa10 commit 8037901

File tree

10 files changed

+108
-3
lines changed

10 files changed

+108
-3
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,7 @@ endif()
435435
if(WITH_ROCM)
436436
include(hip)
437437
include(miopen) # set miopen libraries, must before configure
438+
include(cupti)
438439
endif()
439440

440441
if(WITH_XPU_KP)

cmake/configure.cmake

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,13 @@ elseif(WITH_ROCM)
178178
add_definitions(-DEIGEN_USE_GPU)
179179
add_definitions(-DEIGEN_USE_HIP)
180180

181+
if(CUPTI_FOUND)
182+
include_directories(${CUPTI_INCLUDE_DIR})
183+
add_definitions(-DPADDLE_WITH_CUPTI)
184+
else()
185+
message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
186+
endif()
187+
181188
if(NOT MIOPEN_FOUND)
182189
message(FATAL_ERROR "Paddle needs MIOpen to compile")
183190
endif()

cmake/cupti.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
if(NOT WITH_GPU)
1+
if(NOT WITH_GPU AND NOT WITH_ROCM)
22
return()
33
endif()
44

paddle/fluid/platform/dynload/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ if(NOT APPLE)
3535
if(WITH_RCCL)
3636
list(APPEND HIP_SRCS rccl.cc)
3737
endif()
38+
if(CUPTI_FOUND)
39+
list(APPEND HIP_SRCS cupti.cc)
40+
endif()
3841
endif()
3942
endif()
4043

paddle/fluid/platform/profiler/chrometracing_logger.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,11 @@ void ChromeTracingLogger::HandleTypeKernel(
401401
float warps_per_sm = 0.0;
402402
float occupancy = 0.0;
403403
#if defined(PADDLE_WITH_CUPTI)
404+
#ifdef PADDLE_WITH_HIP
405+
constexpr int threads_per_warp = 64;
406+
#else
404407
constexpr int threads_per_warp = 32;
408+
#endif
405409
const gpuDeviceProp& device_property =
406410
GetDeviceProperties(device_node.DeviceId());
407411
blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y *
@@ -411,6 +415,15 @@ void ChromeTracingLogger::HandleTypeKernel(
411415
blocks_per_sm *
412416
(kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) /
413417
threads_per_warp;
418+
#ifdef PADDLE_WITH_HIP
419+
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
420+
kernel_info.dynamic_shared_memory,
421+
kernel_info.block_x,
422+
kernel_info.block_y,
423+
kernel_info.block_z,
424+
kernel_info.kernelFunc,
425+
kernel_info.launchType);
426+
#else
414427
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
415428
kernel_info.registers_per_thread,
416429
kernel_info.static_shared_memory,
@@ -419,6 +432,8 @@ void ChromeTracingLogger::HandleTypeKernel(
419432
kernel_info.block_y,
420433
kernel_info.block_z,
421434
blocks_per_sm);
435+
#endif // PADDLE_WITH_HIP
436+
422437
#endif
423438
float dur = nsToMsFloat(device_node.Duration());
424439
std::string dur_display;

paddle/fluid/platform/profiler/cupti_data_process.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel,
5252
event.kernel_info.queued = kernel->queued;
5353
event.kernel_info.submitted = kernel->submitted;
5454
event.kernel_info.completed = kernel->completed;
55+
#ifdef PADDLE_WITH_HIP
56+
event.kernel_info.kernelFunc = kernel->kernelFunc;
57+
event.kernel_info.launchType = kernel->launchType;
58+
#endif
5559
collector->AddDeviceEvent(std::move(event));
5660
}
5761

@@ -279,7 +283,11 @@ void AddApiRecord(const CUpti_ActivityAPI* api,
279283
} else {
280284
tid = iter->second;
281285
}
286+
#ifdef PADDLE_WITH_HIP
287+
event.thread_id = api->threadId;
288+
#else
282289
event.thread_id = tid;
290+
#endif
283291
event.correlation_id = api->correlationId;
284292
event.callback_id = api->cbid;
285293
collector->AddRuntimeEvent(std::move(event));

paddle/fluid/platform/profiler/trace_event.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ struct KernelEventInfo {
105105
uint64_t submitted;
106106
// The completed timestamp for the kernel execution, in ns.
107107
uint64_t completed;
108+
#ifdef PADDLE_WITH_HIP
109+
void* kernelFunc;
110+
uint8_t launchType;
111+
#endif
108112
};
109113

110114
static constexpr size_t kMemKindMaxLen = 50;

paddle/fluid/platform/profiler/utils.cc

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,58 @@ std::string json_vector<std::string>(
4343
}
4444

4545
#ifdef PADDLE_WITH_CUPTI
46+
47+
#ifdef PADDLE_WITH_HIP
48+
49+
#include "hip/hip_runtime.h"
50+
float CalculateEstOccupancy(uint32_t DeviceId,
51+
int32_t DynamicSharedMemory,
52+
int32_t BlockX,
53+
int32_t BlockY,
54+
int32_t BlockZ,
55+
void* kernelFunc,
56+
uint8_t launchType) {
57+
float occupancy = 0.0;
58+
std::vector<int> device_ids = GetSelectedDevices();
59+
if (DeviceId < device_ids.size()) {
60+
const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId);
61+
int blockSize = BlockX * BlockY * BlockZ;
62+
int numBlock = 0;
63+
hipError_t status;
64+
if (launchType == 0) {
65+
status = hipOccupancyMaxActiveBlocksPerMultiprocessor(
66+
&numBlock, kernelFunc, blockSize, DynamicSharedMemory);
67+
if (status == hipSuccess) {
68+
occupancy = static_cast<double>(numBlock) * blockSize /
69+
device_property.maxThreadsPerMultiProcessor;
70+
} else {
71+
LOG(WARNING) << "Failed to calculate estimated occupancy, status = "
72+
<< status << std::endl;
73+
}
74+
} else if (launchType == 100) {
75+
status = hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(
76+
&numBlock,
77+
reinterpret_cast<hipFunction_t>(kernelFunc),
78+
blockSize,
79+
DynamicSharedMemory);
80+
if (status == hipSuccess) {
81+
occupancy = static_cast<double>(numBlock) * blockSize /
82+
device_property.maxThreadsPerMultiProcessor;
83+
} else {
84+
LOG(WARNING) << "Failed to calculate estimated occupancy, status = "
85+
<< status << std::endl;
86+
}
87+
} else {
88+
LOG(WARNING) << "Failed to calculate estimated occupancy, can not "
89+
"recognize launchType : "
90+
<< launchType << std::endl;
91+
}
92+
}
93+
return occupancy;
94+
}
95+
96+
#else
97+
4698
float CalculateEstOccupancy(uint32_t DeviceId,
4799
uint16_t RegistersPerThread,
48100
int32_t StaticSharedMemory,
@@ -88,7 +140,9 @@ float CalculateEstOccupancy(uint32_t DeviceId,
88140
}
89141
return occupancy;
90142
}
91-
#endif
143+
#endif // PADDLE_WITH_HIP
144+
145+
#endif // PADDLE_WITH_CUPTI
92146

93147
const char* StringTracerMemEventType(TracerMemEventType type) {
94148
static const char* categary_name_[] = {

paddle/fluid/platform/profiler/utils.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,15 @@ static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
125125
}
126126

127127
#ifdef PADDLE_WITH_CUPTI
128+
#ifdef PADDLE_WITH_HIP
129+
float CalculateEstOccupancy(uint32_t DeviceId,
130+
int32_t DynamicSharedMemory,
131+
int32_t BlockX,
132+
int32_t BlockY,
133+
int32_t BlockZ,
134+
void* kernelFunc,
135+
uint8_t launchType);
136+
#else
128137
float CalculateEstOccupancy(uint32_t deviceId,
129138
uint16_t registersPerThread,
130139
int32_t staticSharedMemory,
@@ -133,7 +142,8 @@ float CalculateEstOccupancy(uint32_t deviceId,
133142
int32_t blockY,
134143
int32_t blockZ,
135144
float blocksPerSm);
136-
#endif
145+
#endif // PADDLE_WITH_HIP
146+
#endif // PADDLE_WITH_CUPTI
137147

138148
} // namespace platform
139149
} // namespace paddle

paddle/phi/backends/dynload/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ if(NOT APPLE)
3535
if(WITH_RCCL)
3636
list(APPEND HIP_SRCS rccl.cc)
3737
endif()
38+
if(CUPTI_FOUND)
39+
list(APPEND HIP_SRCS cupti.cc)
40+
endif()
3841
endif()
3942
endif()
4043

0 commit comments

Comments
 (0)