Skip to content

[SYCL] Fix handling of host-side memory in 2D memops #8359

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 103 additions & 12 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <sycl/reduction_forward.hpp>
#include <sycl/sampler.hpp>
#include <sycl/stl.hpp>
#include <sycl/usm/usm_pointer_info.hpp>

#include <functional>
#include <limits>
Expand Down Expand Up @@ -2469,13 +2470,30 @@ class __SYCL_EXPORT handler {
throw sycl::exception(sycl::make_error_code(errc::invalid),
"Source pitch must be greater than or equal "
"to the width specified in 'ext_oneapi_memcpy2d'");
// If the backends supports 2D copy we use that. Otherwise we use a fallback
// kernel.
if (supportsUSMMemcpy2D())

// Get the type of the pointers.
context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
usm::alloc SrcAllocType = get_pointer_type(Src, Ctx);
usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
bool SrcIsHost =
SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
bool DestIsHost = DestAllocType == usm::alloc::unknown ||
DestAllocType == usm::alloc::host;

// Do the following:
// 1. If both are host, use host_task to copy.
// 2. If either pointer is host or of the backend supports native memcpy2d,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: "or OF the backend"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in #8553.

// use special command.
// 3. Otherwise, launch a kernel for copying.
if (SrcIsHost && DestIsHost) {
commonUSMCopy2DFallbackHostTask<T>(Src, SrcPitch, Dest, DestPitch, Width,
Height);
} else if (SrcIsHost || DestIsHost || supportsUSMMemcpy2D()) {
ext_oneapi_memcpy2d_impl(Dest, DestPitch, Src, SrcPitch, Width, Height);
else
} else {
commonUSMCopy2DFallbackKernel<T>(Src, SrcPitch, Dest, DestPitch, Width,
Height);
}
}

/// Copies data from one 2D memory region to another, both pointed by
Expand Down Expand Up @@ -2503,14 +2521,31 @@ class __SYCL_EXPORT handler {
throw sycl::exception(sycl::make_error_code(errc::invalid),
"Source pitch must be greater than or equal "
"to the width specified in 'ext_oneapi_copy2d'");
// If the backends supports 2D copy we use that. Otherwise we use a fallback
// kernel.
if (supportsUSMMemcpy2D())

// Get the type of the pointers.
context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
usm::alloc SrcAllocType = get_pointer_type(Src, Ctx);
usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
bool SrcIsHost =
SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
bool DestIsHost = DestAllocType == usm::alloc::unknown ||
DestAllocType == usm::alloc::host;

// Do the following:
// 1. If both are host, use host_task to copy.
// 2. If either pointer is host or of the backend supports native memcpy2d,
// use special command.
// 3. Otherwise, launch a kernel for copying.
if (SrcIsHost && DestIsHost) {
commonUSMCopy2DFallbackHostTask<T>(Src, SrcPitch, Dest, DestPitch, Width,
Height);
} else if (SrcIsHost || DestIsHost || supportsUSMMemcpy2D()) {
ext_oneapi_memcpy2d_impl(Dest, DestPitch * sizeof(T), Src,
SrcPitch * sizeof(T), Width * sizeof(T), Height);
else
} else {
commonUSMCopy2DFallbackKernel<T>(Src, SrcPitch, Dest, DestPitch, Width,
Height);
}
}

/// Fills the memory pointed by a USM pointer with the value specified.
Expand Down Expand Up @@ -2538,9 +2573,16 @@ class __SYCL_EXPORT handler {
"Destination pitch must be greater than or equal "
"to the width specified in 'ext_oneapi_memset2d'");
T CharVal = static_cast<T>(Value);

context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);

// If the backends supports 2D fill we use that. Otherwise we use a fallback
// kernel.
if (supportsUSMMemset2D())
// kernel. If the target is on host we will always do the operation on host.
if (DestAllocType == usm::alloc::unknown ||
DestAllocType == usm::alloc::host)
commonUSMFill2DFallbackHostTask(Dest, DestPitch, CharVal, Width, Height);
else if (supportsUSMMemset2D())
ext_oneapi_memset2d_impl(Dest, DestPitch, Value, Width, Height);
else
commonUSMFill2DFallbackKernel(Dest, DestPitch, CharVal, Width, Height);
Expand Down Expand Up @@ -2568,9 +2610,16 @@ class __SYCL_EXPORT handler {
throw sycl::exception(sycl::make_error_code(errc::invalid),
"Destination pitch must be greater than or equal "
"to the width specified in 'ext_oneapi_fill2d'");

context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);

// If the backends supports 2D fill we use that. Otherwise we use a fallback
// kernel.
if (supportsUSMFill2D())
// kernel. If the target is on host we will always do the operation on host.
if (DestAllocType == usm::alloc::unknown ||
DestAllocType == usm::alloc::host)
commonUSMFill2DFallbackHostTask(Dest, DestPitch, Pattern, Width, Height);
else if (supportsUSMFill2D())
ext_oneapi_fill2d_impl(Dest, DestPitch, &Pattern, sizeof(T), Width,
Height);
else
Expand Down Expand Up @@ -2792,6 +2841,8 @@ class __SYCL_EXPORT handler {
NumWorkItems, KernelFunc);
}

const std::shared_ptr<detail::context_impl> &getContextImplPtr() const;

// Checks if 2D memory operations are supported by the underlying platform.
bool supportsUSMMemcpy2D();
bool supportsUSMFill2D();
Expand All @@ -2806,6 +2857,8 @@ class __SYCL_EXPORT handler {
void commonUSMCopy2DFallbackKernel(const void *Src, size_t SrcPitch,
void *Dest, size_t DestPitch, size_t Width,
size_t Height) {
// Otherwise the data is accessible on the device so we do the operation
// there instead.
// Limit number of work items to be resistant to big copies.
id<2> Chunk = computeFallbackKernelBounds(Height, Width);
id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
Expand All @@ -2825,12 +2878,33 @@ class __SYCL_EXPORT handler {
});
}

// Common function for launching a 2D USM memcpy host-task to avoid
// redefinitions of the kernel from copy and memcpy.
template <typename T>
void commonUSMCopy2DFallbackHostTask(const void *Src, size_t SrcPitch,
void *Dest, size_t DestPitch,
size_t Width, size_t Height) {
// If both pointers are host USM or unknown (assumed non-USM) we use a
// host-task to satisfy dependencies.
host_task([=] {
const T *CastedSrc = static_cast<const T *>(Src);
T *CastedDest = static_cast<T *>(Dest);
for (size_t I = 0; I < Height; ++I) {
const T *SrcItBegin = CastedSrc + SrcPitch * I;
T *DestItBegin = CastedDest + DestPitch * I;
std::copy(SrcItBegin, SrcItBegin + Width, DestItBegin);
}
});
}

// Common function for launching a 2D USM fill kernel to avoid redefinitions
// of the kernel from memset and fill.
template <typename T>
void commonUSMFill2DFallbackKernel(void *Dest, size_t DestPitch,
const T &Pattern, size_t Width,
size_t Height) {
// Otherwise the data is accessible on the device so we do the operation
// there instead.
// Limit number of work items to be resistant to big fill operations.
id<2> Chunk = computeFallbackKernelBounds(Height, Width);
id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
Expand All @@ -2849,6 +2923,23 @@ class __SYCL_EXPORT handler {
});
}

// Common function for launching a 2D USM fill kernel or host_task to avoid
// redefinitions of the kernel from memset and fill.
template <typename T>
void commonUSMFill2DFallbackHostTask(void *Dest, size_t DestPitch,
const T &Pattern, size_t Width,
size_t Height) {
// If the pointer is host USM or unknown (assumed non-USM) we use a
// host-task to satisfy dependencies.
host_task([=] {
T *CastedDest = static_cast<T *>(Dest);
for (size_t I = 0; I < Height; ++I) {
T *ItBegin = CastedDest + DestPitch * I;
std::fill(ItBegin, ItBegin + Width, Pattern);
}
});
}

// Implementation of ext_oneapi_memcpy2d using command for native 2D memcpy.
void ext_oneapi_memcpy2d_impl(void *Dest, size_t DestPitch, const void *Src,
size_t SrcPitch, size_t Width, size_t Height);
Expand Down
1 change: 1 addition & 0 deletions sycl/include/sycl/sycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
#include <sycl/sub_group.hpp>
#include <sycl/types.hpp>
#include <sycl/usm.hpp>
#include <sycl/usm/usm_pointer_info.hpp>
#include <sycl/version.hpp>
#if SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO
#include <sycl/ext/oneapi/backend/level_zero.hpp>
Expand Down
14 changes: 0 additions & 14 deletions sycl/include/sycl/usm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,19 +277,5 @@ T *aligned_alloc(
Kind, PropList, CodeLoc);
}

// Pointer queries
/// Query the allocation type from a USM pointer
///
/// \param ptr is the USM pointer to query
/// \param ctxt is the sycl context the ptr was allocated in
__SYCL_EXPORT usm::alloc get_pointer_type(const void *ptr, const context &ctxt);

/// Queries the device against which the pointer was allocated
/// Throws an invalid_object_error if ptr is a host allocation.
///
/// \param ptr is the USM pointer to query
/// \param ctxt is the sycl context the ptr was allocated in
__SYCL_EXPORT device get_pointer_device(const void *ptr, const context &ctxt);

} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
35 changes: 35 additions & 0 deletions sycl/include/sycl/usm/usm_pointer_info.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//==---- usm_pointer_info.hpp - SYCL USM pointer info queries --*- C++ -*---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// ===--------------------------------------------------------------------=== //
#pragma once

#include <sycl/detail/common.hpp>
#include <sycl/detail/export.hpp>
#include <sycl/usm/usm_enums.hpp>

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {

class device;
class context;

// Pointer queries
/// Query the allocation type from a USM pointer
///
/// \param ptr is the USM pointer to query
/// \param ctxt is the sycl context the ptr was allocated in
__SYCL_EXPORT usm::alloc get_pointer_type(const void *ptr, const context &ctxt);

/// Queries the device against which the pointer was allocated
/// Throws an invalid_object_error if ptr is a host allocation.
///
/// \param ptr is the USM pointer to query
/// \param ctxt is the sycl context the ptr was allocated in
__SYCL_EXPORT device get_pointer_device(const void *ptr, const context &ctxt);

} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
45 changes: 42 additions & 3 deletions sycl/source/detail/memory_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -932,10 +932,49 @@ void MemoryManager::copy_2d_usm(const void *SrcMem, size_t SrcPitch,
if (!DstMem || !SrcMem)
throw sycl::exception(sycl::make_error_code(errc::invalid),
"NULL pointer argument in 2D memory copy operation.");

const detail::plugin &Plugin = Queue->getPlugin();
Plugin.call<PiApiKind::piextUSMEnqueueMemcpy2D>(
Queue->getHandleRef(), /*blocking=*/PI_FALSE, DstMem, DstPitch, SrcMem,
SrcPitch, Width, Height, DepEvents.size(), DepEvents.data(), OutEvent);

pi_bool SupportsUSMMemcpy2D = false;
Plugin.call<detail::PiApiKind::piContextGetInfo>(
Queue->getContextImplPtr()->getHandleRef(),
PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT, sizeof(pi_bool),
&SupportsUSMMemcpy2D, nullptr);

if (SupportsUSMMemcpy2D) {
// Direct memcpy2D is supported so we use this function.
Plugin.call<PiApiKind::piextUSMEnqueueMemcpy2D>(
Queue->getHandleRef(), /*blocking=*/PI_FALSE, DstMem, DstPitch, SrcMem,
SrcPitch, Width, Height, DepEvents.size(), DepEvents.data(), OutEvent);
return;
}

// Otherwise we allow the special case where the copy is to or from host.
#ifndef NDEBUG
context Ctx = createSyclObjFromImpl<context>(Queue->getContextImplPtr());
usm::alloc SrcAllocType = get_pointer_type(SrcMem, Ctx);
usm::alloc DstAllocType = get_pointer_type(DstMem, Ctx);
bool SrcIsHost =
SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
bool DstIsHost =
DstAllocType == usm::alloc::unknown || DstAllocType == usm::alloc::host;
assert((SrcIsHost || DstIsHost) && "In fallback path for copy_2d_usm either "
"source or destination must be on host.");
#endif // NDEBUG

// The fallback in this case is to insert a copy per row.
std::vector<RT::PiEvent> CopyEvents(Height);
for (size_t I = 0; I < Height; ++I) {
char *DstItBegin = static_cast<char *>(DstMem) + I * DstPitch;
const char *SrcItBegin = static_cast<const char *>(SrcMem) + I * SrcPitch;
Plugin.call<PiApiKind::piextUSMEnqueueMemcpy>(
Queue->getHandleRef(), /* blocking */ PI_FALSE, DstItBegin, SrcItBegin,
Width, DepEvents.size(), DepEvents.data(), CopyEvents.data() + I);
}

// Then insert a wait to coalesce the copy events.
Queue->getPlugin().call<PiApiKind::piEnqueueEventsWait>(
Queue->getHandleRef(), CopyEvents.size(), CopyEvents.data(), OutEvent);
}

void MemoryManager::fill_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch,
Expand Down
5 changes: 5 additions & 0 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -876,5 +876,10 @@ void handler::memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
setType(detail::CG::CopyFromDeviceGlobal);
}

const std::shared_ptr<detail::context_impl> &
handler::getContextImplPtr() const {
return MQueue->getContextImplPtr();
}

} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -4454,6 +4454,7 @@ _ZNK4sycl3_V17context8get_infoINS0_4info7context32atomic_memory_scope_capabiliti
_ZNK4sycl3_V17context8get_infoINS0_4info7context7devicesEEENS0_6detail20is_context_info_descIT_E11return_typeEv
_ZNK4sycl3_V17context8get_infoINS0_4info7context8platformEEENS0_6detail20is_context_info_descIT_E11return_typeEv
_ZNK4sycl3_V17context9getNativeEv
_ZNK4sycl3_V17handler17getContextImplPtrEv
_ZNK4sycl3_V17handler27isStateExplicitKernelBundleEv
_ZNK4sycl3_V17handler30getOrInsertHandlerKernelBundleEb
_ZNK4sycl3_V17sampler12get_propertyINS0_3ext6oneapi4cuda8property7context19use_primary_contextEEET_v
Expand Down
Loading