diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index e1b209b3c2..5859a2bebd 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -41,17 +41,30 @@ bool IsDriverVersionNewerOrSimilar(ur_context_handle_t Context, (DriverVersionMinor >= VersionMinor) && (DriverVersionBuild >= VersionBuild)); } + +// Default to using compute engine for fill operation, but allow to +// override this with an environment variable. +static bool PreferCopyEngineForFill = [] { + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_FILL"); + const char *PiRet = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); + return (UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0)); +}(); + }; // namespace ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t Context, ur_device_handle_t Device, ze_command_list_handle_t CommandList, ze_command_list_handle_t CommandListResetEvents, + ze_command_list_handle_t CopyCommandList, ZeStruct ZeDesc, + ZeStruct ZeCopyDesc, const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList) : Context(Context), Device(Device), ZeCommandList(CommandList), ZeCommandListResetEvents(CommandListResetEvents), - ZeCommandListDesc(ZeDesc), ZeFencesMap(), ZeActiveFence(nullptr), + ZeCommandListDesc(ZeDesc), ZeCopyCommandList(CopyCommandList), + ZeCopyCommandListDesc(ZeCopyDesc), ZeFencesMap(), ZeActiveFence(nullptr), QueueProperties(), SyncPoints(), NextSyncPoint(0), IsUpdatable(Desc ? Desc->isUpdatable : false), IsProfilingEnabled(Desc ? Desc->enableProfiling : false), @@ -74,6 +87,9 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { if (ZeCommandList) { ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); } + if (UseCopyEngine() && ZeCopyCommandList) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCopyCommandList)); + } // Release the memory allocated to the CommandListResetEvents stored in the // command_buffer @@ -304,7 +320,8 @@ static ur_result_t getEventsFromSyncPoints( // buffer. static ur_result_t enqueueCommandBufferMemCopyHelper( ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, - void *Dst, const void *Src, size_t Size, uint32_t NumSyncPointsInWaitList, + void *Dst, const void *Src, size_t Size, bool PreferCopyEngine, + uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *RetSyncPoint) { if (CommandBuffer->IsInOrderCmdList) { @@ -331,9 +348,18 @@ static ur_result_t enqueueCommandBufferMemCopyHelper( *RetSyncPoint = SyncPoint; } + ze_command_list_handle_t ZeCommandList = CommandBuffer->ZeCommandList; + // If the copy engine available, the command is enqueued in the + // ZeCopyCommandList. + if (PreferCopyEngine && CommandBuffer->UseCopyEngine()) { + ZeCommandList = CommandBuffer->ZeCopyCommandList; + // We indicate that the ZeCopyCommandList contains commands to be + // submitted. + CommandBuffer->MCopyCommandListEmpty = false; + } ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (CommandBuffer->ZeCommandList, Dst, Src, Size, - LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + (ZeCommandList, Dst, Src, Size, LaunchEvent->ZeEvent, + ZeEventList.size(), ZeEventList.data())); logger::debug("calling zeCommandListAppendMemoryCopy() with" " ZeEvent {}", @@ -349,7 +375,7 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( void *Dst, const void *Src, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, size_t SrcSlicePitch, size_t DstSlicePitch, - uint32_t NumSyncPointsInWaitList, + bool PreferCopyEngine, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *RetSyncPoint) { @@ -410,10 +436,20 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( *RetSyncPoint = SyncPoint; } + ze_command_list_handle_t ZeCommandList = CommandBuffer->ZeCommandList; + // If the copy engine available, the command is enqueued in the + // ZeCopyCommandList. + if (PreferCopyEngine && CommandBuffer->UseCopyEngine()) { + ZeCommandList = CommandBuffer->ZeCopyCommandList; + // We indicate that the ZeCopyCommandList contains commands to be + // submitted. + CommandBuffer->MCopyCommandListEmpty = false; + } + ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, - (CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch, - DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch, - LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + (ZeCommandList, Dst, &ZeDstRegion, DstPitch, DstSlicePitch, Src, + &ZeSrcRegion, SrcPitch, SrcSlicePitch, LaunchEvent->ZeEvent, + ZeEventList.size(), ZeEventList.data())); logger::debug("calling zeCommandListAppendMemoryCopyRegion() with" " ZeEvent {}", @@ -427,20 +463,51 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( static ur_result_t enqueueCommandBufferFillHelper( ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, void *Ptr, const void *Pattern, size_t PatternSize, size_t Size, - uint32_t NumSyncPointsInWaitList, + bool PreferCopyEngine, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *RetSyncPoint) { // Pattern size must be a power of two. UR_ASSERT((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0), UR_RESULT_ERROR_INVALID_VALUE); - // Pattern size must fit the compute queue capabilities. - UR_ASSERT( - PatternSize <= - CommandBuffer->Device - ->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeProperties.maxMemoryFillPatternSize, - UR_RESULT_ERROR_INVALID_VALUE); + ze_command_list_handle_t ZeCommandList; + // If the copy engine available, the command is enqueued in the + // ZeCopyCommandList. + if (PreferCopyEngine && CommandBuffer->UseCopyEngine()) { + // Check engine fill capabilty (as this is done for non-graph cmd + // submission) + if (CommandBuffer->Device->hasMainCopyEngine() && + CommandBuffer->Device + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy] + .ZeProperties.maxMemoryFillPatternSize < PatternSize) { + // If the maximum fill pattern size of the copy engine is smaller than the + // current PatternSize, the command is added to the compute command list. + + // Pattern size must fit the compute queue capabilities. + UR_ASSERT( + PatternSize <= + CommandBuffer->Device + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties.maxMemoryFillPatternSize, + UR_RESULT_ERROR_INVALID_VALUE); + + ZeCommandList = CommandBuffer->ZeCommandList; + } else { + ZeCommandList = CommandBuffer->ZeCopyCommandList; + // We indicate that the ZeCopyCommandList contains commands to be + // submitted. + CommandBuffer->MCopyCommandListEmpty = false; + } + } else { + // Pattern size must fit the compute queue capabilities. + UR_ASSERT( + PatternSize <= + CommandBuffer->Device + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties.maxMemoryFillPatternSize, + UR_RESULT_ERROR_INVALID_VALUE); + ZeCommandList = CommandBuffer->ZeCommandList; + } if (CommandBuffer->IsInOrderCmdList) { ZE2UR_CALL(zeCommandListAppendMemoryFill, @@ -468,7 +535,7 @@ static ur_result_t enqueueCommandBufferFillHelper( } ZE2UR_CALL(zeCommandListAppendMemoryFill, - (CommandBuffer->ZeCommandList, Ptr, Pattern, PatternSize, Size, + (ZeCommandList, Ptr, Pattern, PatternSize, Size, LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); logger::debug("calling zeCommandListAppendMemoryFill() with" @@ -490,8 +557,6 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) : false; - // Force compute queue type for now. Copy engine types may be better suited - // for host to device copies. uint32_t QueueGroupOrdinal = Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] .ZeOrdinal; @@ -524,10 +589,38 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, // command-buffers, then reusing them. ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, &ZeCommandListDesc, &ZeCommandList)); + + // Create a list for copy commands. + // Note that to simplify the implementation, the current implementation only + // uses the main copy engine and does not use the link engine even if + // available. + ze_command_list_handle_t ZeCopyCommandList = nullptr; + ZeStruct ZeCopyCommandListDesc; + if (Device->hasMainCopyEngine()) { + uint32_t QueueGroupOrdinalCopy = + Device + ->QueueGroup + [ur_device_handle_t_::queue_group_info_t::type::MainCopy] + .ZeOrdinal; + + ZeCopyCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinalCopy; + // Dependencies between commands are explicitly enforced by sync points when + // enqueuing. Consequently, relax the command ordering in the command list + // can enable the backend to further optimize the workload + ZeCopyCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING; + + // TODO We could optimize this by pooling both Level Zero command-lists and + // UR command-buffers, then reusing them. + ZE2UR_CALL(zeCommandListCreate, + (Context->ZeContext, Device->ZeDevice, &ZeCopyCommandListDesc, + &ZeCopyCommandList)); + } + try { *CommandBuffer = new ur_exp_command_buffer_handle_t_( Context, Device, ZeCommandList, ZeCommandListResetEvents, - ZeCommandListDesc, CommandBufferDesc, IsInOrder); + ZeCopyCommandList, ZeCommandListDesc, ZeCopyCommandListDesc, + CommandBufferDesc, IsInOrder); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -557,6 +650,15 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, ZE2UR_CALL( zeCommandListAppendBarrier, (ZeCommandList, nullptr, PrecondEvents.size(), PrecondEvents.data())); + + if (Device->hasMainCopyEngine()) { + // The copy command-list must be executed once the preconditions have been + // met. We therefore begin this command-list with a barrier on the + // preconditions. + ZE2UR_CALL(zeCommandListAppendBarrier, + (ZeCopyCommandList, nullptr, PrecondEvents.size(), + PrecondEvents.data())); + } return UR_RESULT_SUCCESS; } @@ -618,7 +720,13 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { // Close the command lists and have them ready for dispatch. ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList)); ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandListResetEvents)); + + if (CommandBuffer->UseCopyEngine()) { + ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCopyCommandList)); + } + CommandBuffer->IsFinalized = true; + return UR_RESULT_SUCCESS; } @@ -750,8 +858,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *SyncPoint) { + + bool PreferCopyEngine = !IsDevicePointer(CommandBuffer->Context, Src) || + !IsDevicePointer(CommandBuffer->Context, Dst); + + PreferCopyEngine |= UseCopyEngineForD2DCopy; + return enqueueCommandBufferMemCopyHelper( - UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, + UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, PreferCopyEngine, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } @@ -761,8 +875,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *SyncPoint) { - auto SrcBuffer = ur_cast(SrcMem); - auto DstBuffer = ur_cast(DstMem); + auto SrcBuffer = ur_cast<_ur_buffer *>(SrcMem); + auto DstBuffer = ur_cast<_ur_buffer *>(DstMem); std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); std::scoped_lock, ur_shared_mutex> LockAll( @@ -775,10 +889,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, CommandBuffer->Device)); + bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost); + + PreferCopyEngine |= UseCopyEngineForD2DCopy; + return enqueueCommandBufferMemCopyHelper( UR_COMMAND_MEM_BUFFER_COPY, CommandBuffer, ZeHandleDst + DstOffset, - ZeHandleSrc + SrcOffset, Size, NumSyncPointsInWaitList, SyncPointWaitList, - SyncPoint); + ZeHandleSrc + SrcOffset, Size, PreferCopyEngine, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( @@ -789,8 +907,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *SyncPoint) { - auto SrcBuffer = ur_cast(SrcMem); - auto DstBuffer = ur_cast(DstMem); + auto SrcBuffer = ur_cast<_ur_buffer *>(SrcMem); + auto DstBuffer = ur_cast<_ur_buffer *>(DstMem); std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); std::scoped_lock, ur_shared_mutex> LockAll( @@ -803,10 +921,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, CommandBuffer->Device)); + bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost); + + PreferCopyEngine |= UseCopyEngineForD2DCopy; + return enqueueCommandBufferMemCopyRectHelper( UR_COMMAND_MEM_BUFFER_COPY_RECT, CommandBuffer, ZeHandleDst, ZeHandleSrc, SrcOrigin, DstOrigin, Region, SrcRowPitch, DstRowPitch, SrcSlicePitch, - DstSlicePitch, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + DstSlicePitch, PreferCopyEngine, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( @@ -820,12 +943,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( char *ZeHandleDst = nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, CommandBuffer->Device)); + // Always prefer copy engine for writes + bool PreferCopyEngine = true; return enqueueCommandBufferMemCopyHelper( UR_COMMAND_MEM_BUFFER_WRITE, CommandBuffer, ZeHandleDst + Offset, // dst Src, // src - Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + Size, PreferCopyEngine, NumSyncPointsInWaitList, SyncPointWaitList, + SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( @@ -841,11 +967,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( char *ZeHandleDst = nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, CommandBuffer->Device)); + + // Always prefer copy engine for writes + bool PreferCopyEngine = true; + return enqueueCommandBufferMemCopyRectHelper( UR_COMMAND_MEM_BUFFER_WRITE_RECT, CommandBuffer, ZeHandleDst, const_cast(static_cast(Src)), HostOffset, BufferOffset, Region, HostRowPitch, BufferRowPitch, HostSlicePitch, - BufferSlicePitch, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + BufferSlicePitch, PreferCopyEngine, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( @@ -858,9 +989,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( char *ZeHandleSrc = nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, CommandBuffer->Device)); + + // Always prefer copy engine for reads + bool PreferCopyEngine = true; + return enqueueCommandBufferMemCopyHelper( UR_COMMAND_MEM_BUFFER_READ, CommandBuffer, Dst, ZeHandleSrc + Offset, - Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + Size, PreferCopyEngine, NumSyncPointsInWaitList, SyncPointWaitList, + SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( @@ -876,11 +1012,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( char *ZeHandleSrc; UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, CommandBuffer->Device)); + + // Always prefer copy engine for reads + bool PreferCopyEngine = true; + return enqueueCommandBufferMemCopyRectHelper( UR_COMMAND_MEM_BUFFER_READ_RECT, CommandBuffer, Dst, ZeHandleSrc, BufferOffset, HostOffset, Region, BufferRowPitch, HostRowPitch, - BufferSlicePitch, HostSlicePitch, NumSyncPointsInWaitList, - SyncPointWaitList, SyncPoint); + BufferSlicePitch, HostSlicePitch, PreferCopyEngine, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( @@ -1026,7 +1166,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, ZeHandleDst + Offset, Pattern, // It will be interpreted as an 8-bit value, PatternSize, // which is indicated with this pattern_size==1 - Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + Size, PreferCopyEngineForFill, NumSyncPointsInWaitList, SyncPointWaitList, + SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( @@ -1040,7 +1181,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, Ptr, Pattern, // It will be interpreted as an 8-bit value, PatternSize, // which is indicated with this pattern_size==1 - Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + Size, PreferCopyEngineForFill, NumSyncPointsInWaitList, SyncPointWaitList, + SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( @@ -1118,6 +1260,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ZE2UR_CALL(zeCommandQueueExecuteCommandLists, (ZeCommandQueue, 1, &CommandBuffer->ZeCommandList, ZeFence)); + // The Copy command-list is submitted to the main copy queue if it is not + // empty. + if (!CommandBuffer->MCopyCommandListEmpty) { + auto &QGroupCopy = Queue->getQueueGroup(true); + uint32_t QueueGroupOrdinal; + auto &ZeCopyCommandQueue = QGroupCopy.getZeQueue(&QueueGroupOrdinal); + ZE2UR_CALL( + zeCommandQueueExecuteCommandLists, + (ZeCopyCommandQueue, 1, &CommandBuffer->ZeCopyCommandList, nullptr)); + } + // Execution event for this enqueue of the UR command-buffer ur_event_handle_t RetEvent{}; @@ -1126,6 +1279,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, false, NumEventsInWaitList, EventWaitList, false)); + // Reset the wait-event for the UR command-buffer that is signaled when its // submission dependencies have been satisfied. ZE2UR_CALL(zeCommandListAppendEventReset, diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index 13f08062bf..784ddbf0e8 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -29,7 +29,9 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_context_handle_t Context, ur_device_handle_t Device, ze_command_list_handle_t CommandList, ze_command_list_handle_t CommandListResetEvents, + ze_command_list_handle_t CopyCommandList, ZeStruct ZeDesc, + ZeStruct ZeCopyDesc, const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList); ~ur_exp_command_buffer_handle_t_(); @@ -44,6 +46,9 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { return NextSyncPoint; } + // Indicates if a copy engine is available for use + bool UseCopyEngine() const { return ZeCopyCommandList != nullptr; } + // UR context associated with this command-buffer ur_context_handle_t Context; // Device associated with this command buffer @@ -54,6 +59,13 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ze_command_list_handle_t ZeCommandListResetEvents; // Level Zero command list descriptor ZeStruct ZeCommandListDesc; + // Level Zero Copy command list handle + ze_command_list_handle_t ZeCopyCommandList; + // Level Zero Copy command list descriptor + ZeStruct ZeCopyCommandListDesc; + // This flag is must be set to false if at least one copy command has been + // added to `ZeCopyCommandList` + bool MCopyCommandListEmpty = true; // Level Zero fences for each queue the command-buffer has been enqueued to. // These should be destroyed when the command-buffer is released. std::unordered_map ZeFencesMap;