Skip to content

Commit 64720d8

Browse files
authored
[SYCL][CUDA][PI] Fix infinite loop when parallel_for range exceeds INT_MAX (#5095)
1 parent 6699a5d commit 64720d8

File tree

1 file changed

+8
-10
lines changed

1 file changed

+8
-10
lines changed

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) {
245245
// Determine local work sizes that result in uniform work groups.
246246
// The default threadsPerBlock only requires handling the first work_dim
247247
// dimension.
248-
void guessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size,
248+
void guessLocalWorkSize(size_t *threadsPerBlock, const size_t *global_work_size,
249249
const size_t maxThreadsPerBlock[3], pi_kernel kernel,
250250
pi_uint32 local_size) {
251251
assert(threadsPerBlock != nullptr);
@@ -259,10 +259,9 @@ void guessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size,
259259

260260
(void)minGrid; // Not used, avoid warnings
261261

262-
threadsPerBlock[0] =
263-
std::min(static_cast<int>(maxThreadsPerBlock[0]),
264-
std::min(static_cast<int>(global_work_size[0]),
265-
static_cast<int>(recommendedBlockSize)));
262+
threadsPerBlock[0] = std::min(
263+
maxThreadsPerBlock[0],
264+
std::min(global_work_size[0], static_cast<size_t>(recommendedBlockSize)));
266265

267266
// Find a local work group size that is a divisor of the global
268267
// work group size to produce uniform work groups.
@@ -2613,7 +2612,7 @@ pi_result cuda_piEnqueueKernelLaunch(
26132612

26142613
// Set the number of threads per block to the number of threads per warp
26152614
// by default unless the user has provided a better number
2616-
int threadsPerBlock[3] = {32, 1, 1};
2615+
size_t threadsPerBlock[3] = {32u, 1u, 1u};
26172616
size_t maxWorkGroupSize = 0u;
26182617
size_t maxThreadsPerBlock[3] = {};
26192618
bool providedLocalWorkGroupSize = (local_work_size != nullptr);
@@ -2644,7 +2643,7 @@ pi_result cuda_piEnqueueKernelLaunch(
26442643
return PI_INVALID_WORK_GROUP_SIZE;
26452644
if (0u != (global_work_size[dim] % local_work_size[dim]))
26462645
return PI_INVALID_WORK_GROUP_SIZE;
2647-
threadsPerBlock[dim] = static_cast<int>(local_work_size[dim]);
2646+
threadsPerBlock[dim] = local_work_size[dim];
26482647
return PI_SUCCESS;
26492648
};
26502649

@@ -2664,12 +2663,11 @@ pi_result cuda_piEnqueueKernelLaunch(
26642663
return PI_INVALID_WORK_GROUP_SIZE;
26652664
}
26662665

2667-
int blocksPerGrid[3] = {1, 1, 1};
2666+
size_t blocksPerGrid[3] = {1u, 1u, 1u};
26682667

26692668
for (size_t i = 0; i < work_dim; i++) {
26702669
blocksPerGrid[i] =
2671-
static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) /
2672-
threadsPerBlock[i];
2670+
(global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i];
26732671
}
26742672

26752673
std::unique_ptr<_pi_event> retImplEv{nullptr};

0 commit comments

Comments
 (0)