@@ -245,7 +245,7 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) {
245
245
// Determine local work sizes that result in uniform work groups.
246
246
// The default threadsPerBlock only requires handling the first work_dim
247
247
// dimension.
248
- void guessLocalWorkSize (int *threadsPerBlock, const size_t *global_work_size,
248
+ void guessLocalWorkSize (size_t *threadsPerBlock, const size_t *global_work_size,
249
249
const size_t maxThreadsPerBlock[3 ], pi_kernel kernel,
250
250
pi_uint32 local_size) {
251
251
assert (threadsPerBlock != nullptr );
@@ -259,10 +259,9 @@ void guessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size,
259
259
260
260
(void )minGrid; // Not used, avoid warnings
261
261
262
- threadsPerBlock[0 ] =
263
- std::min (static_cast <int >(maxThreadsPerBlock[0 ]),
264
- std::min (static_cast <int >(global_work_size[0 ]),
265
- static_cast <int >(recommendedBlockSize)));
262
+ threadsPerBlock[0 ] = std::min (
263
+ maxThreadsPerBlock[0 ],
264
+ std::min (global_work_size[0 ], static_cast <size_t >(recommendedBlockSize)));
266
265
267
266
// Find a local work group size that is a divisor of the global
268
267
// work group size to produce uniform work groups.
@@ -2613,7 +2612,7 @@ pi_result cuda_piEnqueueKernelLaunch(
2613
2612
2614
2613
// Set the number of threads per block to the number of threads per warp
2615
2614
// by default unless user has provided a better number
2616
- int threadsPerBlock[3 ] = {32 , 1 , 1 };
2615
+ size_t threadsPerBlock[3 ] = {32u , 1u , 1u };
2617
2616
size_t maxWorkGroupSize = 0u ;
2618
2617
size_t maxThreadsPerBlock[3 ] = {};
2619
2618
bool providedLocalWorkGroupSize = (local_work_size != nullptr );
@@ -2644,7 +2643,7 @@ pi_result cuda_piEnqueueKernelLaunch(
2644
2643
return PI_INVALID_WORK_GROUP_SIZE;
2645
2644
if (0u != (global_work_size[dim] % local_work_size[dim]))
2646
2645
return PI_INVALID_WORK_GROUP_SIZE;
2647
- threadsPerBlock[dim] = static_cast < int >( local_work_size[dim]) ;
2646
+ threadsPerBlock[dim] = local_work_size[dim];
2648
2647
return PI_SUCCESS;
2649
2648
};
2650
2649
@@ -2664,12 +2663,11 @@ pi_result cuda_piEnqueueKernelLaunch(
2664
2663
return PI_INVALID_WORK_GROUP_SIZE;
2665
2664
}
2666
2665
2667
- int blocksPerGrid[3 ] = {1 , 1 , 1 };
2666
+ size_t blocksPerGrid[3 ] = {1u , 1u , 1u };
2668
2667
2669
2668
for (size_t i = 0 ; i < work_dim; i++) {
2670
2669
blocksPerGrid[i] =
2671
- static_cast <int >(global_work_size[i] + threadsPerBlock[i] - 1 ) /
2672
- threadsPerBlock[i];
2670
+ (global_work_size[i] + threadsPerBlock[i] - 1 ) / threadsPerBlock[i];
2673
2671
}
2674
2672
2675
2673
std::unique_ptr<_pi_event> retImplEv{nullptr };
0 commit comments