@@ -420,7 +420,7 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
420
420
assert (PtrStorage && " Invalid pointer storage" );
421
421
422
422
*PtrStorage = MemoryManager->allocate (Size, nullptr );
423
- if (*PtrStorage == nullptr )
423
+ if (Size && *PtrStorage == nullptr )
424
424
return Plugin::error (ErrorCode::OUT_OF_RESOURCES,
425
425
" failure to allocate from AMDGPU memory manager" );
426
426
@@ -429,7 +429,8 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
429
429
430
430
/// Release an allocation to be reused.
431
431
Error deallocate (void *Ptr) {
432
- assert (Ptr && " Invalid pointer" );
432
+ if (!Ptr)
433
+ return Plugin::success ();
433
434
434
435
if (MemoryManager->free (Ptr))
435
436
return Plugin::error (ErrorCode::UNKNOWN,
@@ -3365,7 +3366,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3365
3366
KernelLaunchParamsTy LaunchParams,
3366
3367
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3367
3368
if (ArgsSize != LaunchParams.Size &&
3368
- ArgsSize != LaunchParams.Size + getImplicitArgsSize ())
3369
+ ArgsSize > LaunchParams.Size + getImplicitArgsSize ())
3369
3370
return Plugin::error (ErrorCode::INVALID_ARGUMENT,
3370
3371
" mismatch of kernel arguments size" );
3371
3372
@@ -3401,23 +3402,39 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
3401
3402
if (auto Err = AMDGPUDevice.getStream (AsyncInfoWrapper, Stream))
3402
3403
return Err;
3403
3404
3404
- hsa_utils::AMDGPUImplicitArgsTy *ImplArgs = nullptr ;
3405
- if (ArgsSize == LaunchParams.Size + getImplicitArgsSize ()) {
3406
- ImplArgs = reinterpret_cast <hsa_utils::AMDGPUImplicitArgsTy *>(
3407
- utils::advancePtr (AllArgs, LaunchParams.Size ));
3408
-
3409
- // Set the COV5+ implicit arguments to the appropriate values.
3410
- std::memset (ImplArgs, 0 , getImplicitArgsSize ());
3411
- ImplArgs->BlockCountX = NumBlocks[0 ];
3412
- ImplArgs->BlockCountY = NumBlocks[1 ];
3413
- ImplArgs->BlockCountZ = NumBlocks[2 ];
3414
- ImplArgs->GroupSizeX = NumThreads[0 ];
3415
- ImplArgs->GroupSizeY = NumThreads[1 ];
3416
- ImplArgs->GroupSizeZ = NumThreads[2 ];
3417
- ImplArgs->GridDims = NumBlocks[2 ] * NumThreads[2 ] > 1
3418
- ? 3
3419
- : 1 + (NumBlocks[1 ] * NumThreads[1 ] != 1 );
3420
- ImplArgs->DynamicLdsSize = KernelArgs.DynCGroupMem ;
3405
+ uint64_t ImplArgsOffset = utils::roundUp (
3406
+ LaunchParams.Size , alignof (hsa_utils::AMDGPUImplicitArgsTy));
3407
+ if (ArgsSize > ImplArgsOffset) {
3408
+ hsa_utils::AMDGPUImplicitArgsTy *ImplArgs =
3409
+ reinterpret_cast <hsa_utils::AMDGPUImplicitArgsTy *>(
3410
+ utils::advancePtr (AllArgs, ImplArgsOffset));
3411
+
3412
+ // Set the COV5+ implicit arguments to the appropriate values if present.
3413
+ uint64_t ImplArgsSize = ArgsSize - ImplArgsOffset;
3414
+ std::memset (ImplArgs, 0 , ImplArgsSize);
3415
+
3416
+ using ImplArgsTy = hsa_utils::AMDGPUImplicitArgsTy;
3417
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::BlockCountX, ImplArgsSize,
3418
+ NumBlocks[0 ]);
3419
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::BlockCountY, ImplArgsSize,
3420
+ NumBlocks[1 ]);
3421
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::BlockCountZ, ImplArgsSize,
3422
+ NumBlocks[2 ]);
3423
+
3424
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::GroupSizeX, ImplArgsSize,
3425
+ NumThreads[0 ]);
3426
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::GroupSizeY, ImplArgsSize,
3427
+ NumThreads[1 ]);
3428
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::GroupSizeZ, ImplArgsSize,
3429
+ NumThreads[2 ]);
3430
+
3431
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::GridDims, ImplArgsSize,
3432
+ NumBlocks[2 ] * NumThreads[2 ] > 1
3433
+ ? 3
3434
+ : 1 + (NumBlocks[1 ] * NumThreads[1 ] != 1 ));
3435
+
3436
+ hsa_utils::initImplArg (ImplArgs, &ImplArgsTy::DynamicLdsSize, ImplArgsSize,
3437
+ KernelArgs.DynCGroupMem );
3421
3438
}
3422
3439
3423
3440
// Push the kernel launch into the stream.
0 commit comments