Skip to content

[UR] Add remaining calls shared with queue in level-zero v2 adapter #17061

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 0 additions & 47 deletions unified-runtime/source/adapters/level_zero/v2/api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,53 +170,6 @@ ur_result_t urBindlessImagesReleaseExternalSemaphoreExp(
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

// Placeholder entry point for appending a USM fill to a command buffer.
// None of the arguments are inspected: the call logs an error that names this
// function and reports the feature as unsupported.
ur_result_t urCommandBufferAppendUSMFillExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory,
    const void *pPattern, size_t patternSize, size_t size,
    uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent,
    ur_exp_command_buffer_command_handle_t *phCommand) {
  const ur_result_t notSupported = UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
  logger::error("{} function not implemented!", __FUNCTION__);
  return notSupported;
}

// Placeholder entry point for appending a buffer fill to a command buffer.
// The arguments are not used; an error naming this function is logged and the
// caller is told the feature is unsupported.
ur_result_t urCommandBufferAppendMemBufferFillExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
    const void *pPattern, size_t patternSize, size_t offset, size_t size,
    uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent,
    ur_exp_command_buffer_command_handle_t *phCommand) {
  const ur_result_t notSupported = UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
  logger::error("{} function not implemented!", __FUNCTION__);
  return notSupported;
}

// Placeholder entry point for appending a USM prefetch to a command buffer.
// The arguments are not used; an error naming this function is logged and the
// caller is told the feature is unsupported.
ur_result_t urCommandBufferAppendUSMPrefetchExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory,
    size_t size, ur_usm_migration_flags_t flags,
    uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent,
    ur_exp_command_buffer_command_handle_t *phCommand) {
  const ur_result_t notSupported = UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
  logger::error("{} function not implemented!", __FUNCTION__);
  return notSupported;
}

// Placeholder entry point for appending USM memory advice to a command buffer.
// The arguments are not used; an error naming this function is logged and the
// caller is told the feature is unsupported.
ur_result_t urCommandBufferAppendUSMAdviseExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory,
    size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent,
    ur_exp_command_buffer_command_handle_t *phCommand) {
  const ur_result_t notSupported = UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
  logger::error("{} function not implemented!", __FUNCTION__);
  return notSupported;
}

ur_result_t urCommandBufferUpdateKernelLaunchExp(
ur_exp_command_buffer_command_handle_t hCommand,
const ur_exp_command_buffer_update_kernel_launch_desc_t
Expand Down
109 changes: 109 additions & 0 deletions unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,115 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp(
return exceptionToResult(std::current_exception());
}

// Records a USM fill into the command buffer by forwarding to the buffer's
// command list manager. Anything thrown while doing so is converted into a
// ur_result_t by the function-level catch handler.
ur_result_t urCommandBufferAppendUSMFillExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory,
    const void *pPattern, size_t patternSize, size_t size,
    uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent,
    ur_exp_command_buffer_command_handle_t *phCommand) try {
  // Event wait list and output event are not consumed here: the same issue as
  // in urCommandBufferAppendKernelLaunchExp.
  std::ignore = std::tie(numEventsInWaitList, phEventWaitList, phEvent);

  // All lists are in-order, so the sync-point mechanism can be ignored.
  std::ignore =
      std::tie(numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);

  // The command handle output is never written.
  std::ignore = phCommand;

  // Appended with an empty wait list and no signal event.
  UR_CALL(hCommandBuffer->commandListManager.appendUSMFill(
      pMemory, patternSize, pPattern, size, 0, nullptr, nullptr));
  return UR_RESULT_SUCCESS;
} catch (...) {
  return exceptionToResult(std::current_exception());
}

// Records a buffer fill into the command buffer by forwarding to the buffer's
// command list manager. Anything thrown while doing so is converted into a
// ur_result_t by the function-level catch handler.
ur_result_t urCommandBufferAppendMemBufferFillExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer,
    const void *pPattern, size_t patternSize, size_t offset, size_t size,
    uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent,
    ur_exp_command_buffer_command_handle_t *phCommand) try {
  // Event wait list and output event are not consumed here: the same issue as
  // in urCommandBufferAppendKernelLaunchExp.
  std::ignore = std::tie(numEventsInWaitList, phEventWaitList, phEvent);

  // All lists are in-order, so the sync-point mechanism can be ignored.
  std::ignore =
      std::tie(numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);

  // The command handle output is never written.
  std::ignore = phCommand;

  // Appended with an empty wait list and no signal event.
  UR_CALL(hCommandBuffer->commandListManager.appendMemBufferFill(
      hBuffer, pPattern, patternSize, offset, size, 0, nullptr, nullptr));
  return UR_RESULT_SUCCESS;
} catch (...) {
  return exceptionToResult(std::current_exception());
}

// Records a USM prefetch into the command buffer by forwarding to the buffer's
// command list manager. Anything thrown while doing so is converted into a
// ur_result_t by the function-level catch handler.
ur_result_t urCommandBufferAppendUSMPrefetchExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory,
    size_t size, ur_usm_migration_flags_t flags,
    uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent,
    ur_exp_command_buffer_command_handle_t *phCommand) try {
  // Event wait list and output event are not consumed here: the same issue as
  // in urCommandBufferAppendKernelLaunchExp.
  std::ignore = std::tie(numEventsInWaitList, phEventWaitList, phEvent);

  // All lists are in-order, so the sync-point mechanism can be ignored.
  std::ignore =
      std::tie(numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);

  // The command handle output is never written.
  std::ignore = phCommand;

  // Appended with an empty wait list and no signal event.
  UR_CALL(hCommandBuffer->commandListManager.appendUSMPrefetch(
      pMemory, size, flags, 0, nullptr, nullptr));

  return UR_RESULT_SUCCESS;
} catch (...) {
  return exceptionToResult(std::current_exception());
}

// Records USM memory advice into the command buffer by forwarding to the
// buffer's command list manager. Anything thrown while doing so is converted
// into a ur_result_t by the function-level catch handler.
ur_result_t urCommandBufferAppendUSMAdviseExp(
    ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory,
    size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent,
    ur_exp_command_buffer_command_handle_t *phCommand) try {
  // Event wait list and output event are not consumed here: the same issue as
  // in urCommandBufferAppendKernelLaunchExp.
  std::ignore = std::tie(numEventsInWaitList, phEventWaitList, phEvent);

  // All lists are in-order, so the sync-point mechanism can be ignored.
  std::ignore =
      std::tie(numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);

  // The command handle output is never written.
  std::ignore = phCommand;

  // Appended without a signal event.
  UR_CALL(hCommandBuffer->commandListManager.appendUSMAdvise(pMemory, size,
                                                             advice, nullptr));

  return UR_RESULT_SUCCESS;
} catch (...) {
  return exceptionToResult(std::current_exception());
}

ur_result_t
urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer,
ur_exp_command_buffer_info_t propName,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,50 @@ ur_command_list_manager::~ur_command_list_manager() {
ur::level_zero::urDeviceRelease(device);
}

// Appends a pattern fill of `size` bytes, starting `offset` bytes into `dst`,
// to this manager's command list.
//
// The "Unlocked" suffix reflects that this function takes no lock itself; the
// callers in this file (appendMemBufferFill / appendUSMFill) acquire the
// mutexes before calling it.
//
// dst                  buffer object whose device pointer is filled
// offset, size         region of `dst` to fill, in bytes
// pPattern/patternSize pattern bytes and their length
// phEventWaitList      events that the first appended command waits on
// phEvent              optional output event, resolved through getSignalEvent
// commandType          command type forwarded to getSignalEvent
//
// Returns UR_RESULT_SUCCESS, or the error produced by a failing ZE2UR_CALL.
// The migration callback uses ZE2UR_CALL_THROWS and therefore reports failure
// by throwing rather than by return value.
ur_result_t ur_command_list_manager::appendGenericFillUnlocked(
    ur_mem_buffer_t *dst, size_t offset, size_t patternSize,
    const void *pPattern, size_t size, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent,
    ur_command_t commandType) {

  auto zeSignalEvent = getSignalEvent(phEvent, commandType);

  auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList);

  // NOTE(review): the destination of a fill is requested with `read_only`
  // access; confirm that this is the intended access mode.
  auto pDst = ur_cast<char *>(dst->getDevicePtr(
      device, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size,
      [&](void *src, void *dst, size_t size) {
        // If getDevicePtr needs a copy, that copy consumes the wait list, so
        // the list is cleared and later commands do not wait on it again.
        ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
                          (zeCommandList.get(), dst, src, size, nullptr,
                           waitListView.num, waitListView.handles));
        waitListView.clear();
      }));

  // PatternSize must be a power of two for zeCommandListAppendMemoryFill.
  // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy.
  if (isPowerOf2(patternSize)) {
    ZE2UR_CALL(zeCommandListAppendMemoryFill,
               (zeCommandList.get(), pDst, pPattern, patternSize, size,
                zeSignalEvent, waitListView.num, waitListView.handles));
  } else {
    // Copy the pattern into every patternSize-wide slot of the destination.
    // The step count and index are size_t: narrowing `size / patternSize` to
    // 32 bits would silently drop steps for very large fills.
    // NOTE(review): when size < patternSize no copy is appended, so
    // zeSignalEvent is never signalled on this path; confirm callers reject
    // such sizes.
    const size_t numOfCopySteps = size / patternSize;
    const void *src = pPattern;

    for (size_t step = 0; step < numOfCopySteps; ++step) {
      void *dst = pDst + step * patternSize;
      // Only the last copy signals the event; only the first one waits on the
      // incoming events, because the view is cleared right after it.
      ZE2UR_CALL(zeCommandListAppendMemoryCopy,
                 (zeCommandList.get(), dst, src, patternSize,
                  step == numOfCopySteps - 1 ? zeSignalEvent : nullptr,
                  waitListView.num, waitListView.handles));
      waitListView.clear();
    }
  }

  return UR_RESULT_SUCCESS;
}

ur_result_t ur_command_list_manager::appendGenericCopyUnlocked(
ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset,
size_t dstOffset, size_t size, uint32_t numEventsInWaitList,
Expand Down Expand Up @@ -209,6 +253,96 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy(
return UR_RESULT_SUCCESS;
}

// Appends a pattern fill of `size` bytes at `offset` within the buffer behind
// `hMem`.
//
// Returns UR_RESULT_ERROR_INVALID_SIZE when the [offset, offset + size) region
// does not fit inside the buffer; otherwise returns whatever
// appendGenericFillUnlocked returns. Both this manager's mutex and the
// buffer's mutex are held while the fill is appended.
ur_result_t ur_command_list_manager::appendMemBufferFill(
    ur_mem_handle_t hMem, const void *pPattern, size_t patternSize,
    size_t offset, size_t size, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferFill");

  auto hBuffer = hMem->getBuffer();

  // Bounds check written so that `offset + size` is never computed: that sum
  // can wrap around in size_t and make an out-of-range region look valid.
  const auto bufferSize = hBuffer->getSize();
  UR_ASSERT(offset <= bufferSize && size <= bufferSize - offset,
            UR_RESULT_ERROR_INVALID_SIZE);

  std::scoped_lock<ur_shared_mutex, ur_shared_mutex> lock(this->Mutex,
                                                          hBuffer->getMutex());

  return appendGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, size,
                                   numEventsInWaitList, phEventWaitList,
                                   phEvent, UR_COMMAND_MEM_BUFFER_FILL);
}

// Appends a pattern fill of `size` bytes over the USM allocation at `pMem`.
// The raw pointer is wrapped in a temporary ur_usm_handle_t so that the shared
// fill routine can be reused; this manager's mutex is held for the whole call.
ur_result_t ur_command_list_manager::appendUSMFill(
    void *pMem, size_t patternSize, const void *pPattern, size_t size,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFill");

  std::scoped_lock<ur_shared_mutex> guard(this->Mutex);

  // The wrapper describes exactly the region being filled, so the fill starts
  // at offset zero within it.
  constexpr size_t fillOffset = 0;
  ur_usm_handle_t usmRegion(context, size, pMem);

  return appendGenericFillUnlocked(&usmRegion, fillOffset, patternSize,
                                   pPattern, size, numEventsInWaitList,
                                   phEventWaitList, phEvent,
                                   UR_COMMAND_USM_FILL);
}

// Appends a prefetch of `size` bytes starting at `pMem` to this manager's
// command list.
//
// `flags` is currently ignored (see the TODO below). Since the prefetch call
// itself takes neither a wait list nor a signal event, waiting and signalling
// are appended as separate commands around it.
//
// Returns UR_RESULT_SUCCESS, or the error produced by a failing ZE2UR_CALL.
ur_result_t ur_command_list_manager::appendUSMPrefetch(
const void *pMem, size_t size, ur_usm_migration_flags_t flags,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch");

std::ignore = flags;

// Held until the function returns.
std::scoped_lock<ur_shared_mutex> lock(this->Mutex);

auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH);

auto [pWaitEvents, numWaitEvents] =
getWaitListView(phEventWaitList, numEventsInWaitList);

// Only append an explicit wait when there is something to wait on.
if (pWaitEvents) {
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
(zeCommandList.get(), numWaitEvents, pWaitEvents));
}
// TODO: figure out how to translate "flags"
ZE2UR_CALL(zeCommandListAppendMemoryPrefetch,
(zeCommandList.get(), pMem, size));
// Signal the output event, if one was requested, after the prefetch.
if (zeSignalEvent) {
ZE2UR_CALL(zeCommandListAppendSignalEvent,
(zeCommandList.get(), zeSignalEvent));
}

return UR_RESULT_SUCCESS;
}

// Appends memory advice for the `size` bytes starting at `pMem` to this
// manager's command list, for this manager's device.
//
// `advice` is converted to ze_memory_advice_t with ur_cast. If `phEvent`
// yields a signal event, a separate signal command is appended after the
// advice.
//
// Returns UR_RESULT_SUCCESS, or the error produced by a failing ZE2UR_CALL.
ur_result_t
ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size,
ur_usm_advice_flags_t advice,
ur_event_handle_t *phEvent) {
TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise");

// Held until the function returns.
std::scoped_lock<ur_shared_mutex> lock(this->Mutex);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't related to this PR in particular but to the whole command_list_manager implementation. I didn't notice this before, but we now have 2 different mutexes: this one (from command_list_manager) and a separate one in the queue_immediate_in_order class.

Some functions now lock this mutex, while others lock the one from queue_immediate_in_order which means there's no synchronization.

This should be fixed, probably by always using lock from ur_command_list_manager (in every queue_immediate_in_order and command_buffer function).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense - should I add it to this PR, or should these changes go in a separate one?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can do that in a separate PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What we are really trying to protect is the state inside of the command list manager. Maybe we should create some sort of scoped locking abstraction for objects (like mutexes in Rust), e.g.,:

class CommandBuffer {
  Mutex<CommandListManager> cmdListMgr;
}

{
  auto mgr = cmdbuf->cmdListMgr.lock();
  mgr->AppendFoo();
}

This way it will be impossible to make the same mistake and we don't have to extend the scope of a lock to the outside of its member functions.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, that's actually a pretty good idea, and it should be enough to just create a structure that holds a reference to the cmdListMgr and implements command_list_manager operator->(), then mgr->whatever() should work automatically.


// Convert the UR advice value to its Level Zero counterpart.
auto zeAdvice = ur_cast<ze_memory_advice_t>(advice);

auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE);

// This function takes no caller wait list, so the view is requested with no
// events.
auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0);

// Only append an explicit wait when the view holds something to wait on.
if (pWaitEvents) {
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
(zeCommandList.get(), numWaitEvents, pWaitEvents));
}

ZE2UR_CALL(zeCommandListAppendMemAdvise,
(zeCommandList.get(), device->ZeDevice, pMem, size, zeAdvice));

// Signal the output event, if one was requested, after the advice.
if (zeSignalEvent) {
ZE2UR_CALL(zeCommandListAppendSignalEvent,
(zeCommandList.get(), zeSignalEvent));
}
return UR_RESULT_SUCCESS;
}

ur_result_t ur_command_list_manager::appendMemBufferRead(
ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size,
void *pDst, uint32_t numEventsInWaitList,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,27 @@ struct ur_command_list_manager : public _ur_object {
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent);

// Appends a pattern fill of `size` bytes at `offset` within `hBuffer`.
// Returns UR_RESULT_ERROR_INVALID_SIZE when the region exceeds the buffer
// size. Takes this manager's mutex and the buffer's mutex.
ur_result_t appendMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern,
size_t patternSize, size_t offset,
size_t size, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent);

// Appends a pattern fill of `size` bytes over the USM allocation at `pMem`.
// Takes this manager's mutex.
ur_result_t appendUSMFill(void *pMem, size_t patternSize,
const void *pPattern, size_t size,
uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent);

// Appends a prefetch of `size` bytes starting at `pMem`. `flags` is currently
// ignored by the implementation. Takes this manager's mutex.
ur_result_t appendUSMPrefetch(const void *pMem, size_t size,
ur_usm_migration_flags_t flags,
uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent);

// Appends memory advice for the `size` bytes starting at `pMem`. Unlike the
// other append* calls declared here, it takes no event wait list. Takes this
// manager's mutex.
ur_result_t appendUSMAdvise(const void *pMem, size_t size,
ur_usm_advice_flags_t advice,
ur_event_handle_t *phEvent);
// Returns a Level Zero command list handle; presumably the one this manager
// wraps (definition not visible here, confirm).
ze_command_list_handle_t getZeCommandList();

wait_list_view getWaitListView(const ur_event_handle_t *phWaitEvents,
Expand All @@ -107,6 +128,12 @@ struct ur_command_list_manager : public _ur_object {
ur_command_t commandType);

private:
// Shared fill implementation behind appendMemBufferFill and appendUSMFill.
// Takes no lock itself: both of those callers acquire the mutexes first.
// Uses a native memory fill when patternSize is a power of two and otherwise
// emulates the fill with one memory copy per pattern-sized slot.
ur_result_t appendGenericFillUnlocked(
ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize,
const void *pPattern, size_t size, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent,
ur_command_t commandType);

ur_result_t appendGenericCopyUnlocked(
ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking,
size_t srcOffset, size_t dstOffset, size_t size,
Expand Down
Loading
Loading