
Commit

Add DLPack support for XPU backend by mapping to kDLOneAPI in DLPack (pytorch#82867)

## Motivation
The DLPack device type kDLOneAPI denotes Unified Shared Memory (USM) allocated on a oneAPI device; the corresponding PyTorch backend type is XPU.
This change adds support for exporting and importing PyTorch XPU tensors as DLPack tensors with the kDLOneAPI device type.

## Solution
1. Update the bundled dlpack.h header to DLPack protocol v0.7.
2. Add XPU hooks that map between ATen devices and DLPack devices using the data pointer and the device information (see the sketch after this list).
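
The flow this enables can be pictured with a short sketch (illustrative only, not code from this commit; it assumes an XPU-enabled build with the Intel Extension for PyTorch loaded so that the XPU backend and hooks are registered):

```cpp
#include <ATen/ATen.h>
#include <ATen/DLConvertor.h>

// Sketch: round-trip an XPU tensor through DLPack using the paths touched by
// this commit. The function name is illustrative.
void xpu_dlpack_roundtrip() {
  at::Tensor src = at::empty(
      {2, 3}, at::TensorOptions().device(at::Device(at::DeviceType::XPU, 0)));

  // Export: getDLDevice() delegates to getDLPackDeviceFromATenDevice(), so the
  // resulting DLPack tensor reports device_type == kDLOneAPI.
  DLManagedTensor* dlmt = at::toDLPack(src);

  // Import: getATenDevice() now also forwards the data pointer, letting the
  // oneAPI runtime resolve which SYCL device/context owns the USM allocation.
  at::Tensor dst = at::fromDLPack(dlmt);
  TORCH_CHECK(dst.device().type() == at::DeviceType::XPU);
}
```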

Pull Request resolved: pytorch#82867
Approved by: https://github.com/kit1980
chengjunlu authored and pytorchmergebot committed Aug 5, 2022
1 parent bfebf25 commit de0e030
Showing 11 changed files with 208 additions and 19 deletions.
8 changes: 8 additions & 0 deletions aten/src/ATen/Context.h
@@ -9,6 +9,7 @@
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/detail/HIPHooksInterface.h>
#include <ATen/detail/ORTHooksInterface.h>
#include <ATen/detail/XPUHooksInterface.h>
#include <c10/core/QEngine.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/util/CallOnce.h>
@@ -89,6 +90,9 @@ class TORCH_API Context {
static bool hasXLA() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::XLA);
}
static bool hasXPU() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::XPU);
}
static bool hasLazy() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::Lazy);
}
@@ -331,6 +335,10 @@ static inline bool hasORT() {
return globalContext().hasORT();
}

static inline bool hasXPU() {
return globalContext().hasXPU();
}

// Despite its name, this function returns the number of *CUDA* GPUs.
static inline size_t getNumGPUs() {
// WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS
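The new at::hasXPU() helper only reports whether a DeviceGuardImpl has been registered for the XPU device type; a caller might guard XPU-specific paths like this sketch (function name illustrative):

```cpp
#include <ATen/Context.h>
#include <c10/util/Exception.h>

// Sketch: hasXPU() is true only when a backend (e.g. the Intel Extension for
// PyTorch) has registered a device guard for at::DeviceType::XPU.
void require_xpu() {
  TORCH_CHECK(at::hasXPU(), "XPU backend is not available in this process");
}
```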
16 changes: 12 additions & 4 deletions aten/src/ATen/DLConvertor.cpp
@@ -90,13 +90,17 @@ DLDevice getDLDevice(const Tensor& tensor, const int64_t& device_id) {
case DeviceType::HIP:
ctx.device_type = DLDeviceType::kDLROCM;
break;
case DeviceType::XPU:
ctx = at::detail::getXPUHooks().getDLPackDeviceFromATenDevice(
tensor.device(), tensor.data_ptr());
break;
default:
TORCH_CHECK(false, "Cannot pack tensors on " + tensor.device().str());
}
return ctx;
}

static Device getATenDevice(const DLDevice& ctx) {
static Device getATenDevice(const DLDevice& ctx, void* data) {
switch (ctx.device_type) {
case DLDeviceType::kDLCPU:
return at::Device(DeviceType::CPU);
@@ -114,6 +118,8 @@ static Device getATenDevice(const DLDevice& ctx) {
#else
return at::Device(DeviceType::HIP, ctx.device_id);
#endif
case DLDeviceType::kDLOneAPI:
return at::detail::getXPUHooks().getATenDeviceFromDLPackDevice(ctx, data);
default:
TORCH_CHECK(
false, "Unsupported device_type: " + c10::to_string(ctx.device_type));
@@ -238,17 +244,19 @@ DLManagedTensor* toDLPack(const Tensor& src) {
}

Tensor fromDLPack(const DLManagedTensor* src) {
Device device = getATenDevice(src->dl_tensor.device);
Device device = getATenDevice(src->dl_tensor.device, src->dl_tensor.data);
ScalarType stype = toScalarType(src->dl_tensor.dtype);
auto deleter = [src](void* self) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
src->deleter(const_cast<DLManagedTensor*>(src));
};
if (!src->dl_tensor.strides) {
return at::from_blob(src->dl_tensor.data,
return at::from_blob(
src->dl_tensor.data,
IntArrayRef(src->dl_tensor.shape, src->dl_tensor.ndim),
deleter,
at::device(device).dtype(stype));
at::device(device).dtype(stype),
{device});
}
return at::from_blob(
src->dl_tensor.data,
4 changes: 4 additions & 0 deletions aten/src/ATen/Version.cpp
@@ -195,6 +195,10 @@ std::string show_config() {
ss << detail::getORTHooks().showConfig();
}

if (hasXPU()) {
ss << detail::getXPUHooks().showConfig();
}

ss << " - Build settings: ";
for (const auto& pair : caffe2::GetBuildOptions()) {
if (!pair.second.empty()) {
28 changes: 28 additions & 0 deletions aten/src/ATen/detail/XPUHooksInterface.cpp
@@ -0,0 +1,28 @@
#include <ATen/detail/XPUHooksInterface.h>

#include <c10/util/CallOnce.h>

#include <memory>
#include <mutex>

namespace at {
namespace detail {

static XPUHooksInterface *xpu_hooks = nullptr;

const XPUHooksInterface &getXPUHooks() {
static c10::once_flag once;
c10::call_once(once, [] {
xpu_hooks =
XPUHooksRegistry()->Create("XPUHooks", XPUHooksArgs{}).release();
if (!xpu_hooks) {
xpu_hooks = new XPUHooksInterface();
}
});
return *xpu_hooks;
}
} // namespace detail

C10_DEFINE_REGISTRY(XPUHooksRegistry, XPUHooksInterface, XPUHooksArgs)

} // namespace at
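
A small sketch of how the lazy fallback above behaves when nothing is registered (function name illustrative):

```cpp
#include <ATen/detail/XPUHooksInterface.h>

// Sketch: with no class registered under "XPUHooks", getXPUHooks() hands back
// the base XPUHooksInterface, whose queries are safe no-ops but whose
// XPU-specific entry points throw with the XPU_HELP message.
void probe_xpu_hooks() {
  const at::XPUHooksInterface& hooks = at::detail::getXPUHooks();
  if (hooks.hasXPU()) {  // false for the default interface
    hooks.initXPU();     // only reachable once real hooks are registered
  }
}
```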
75 changes: 75 additions & 0 deletions aten/src/ATen/detail/XPUHooksInterface.h
@@ -0,0 +1,75 @@
#pragma once

#include <ATen/dlpack.h>
#include <c10/core/Device.h>
#include <c10/util/Exception.h>

#include <c10/util/Registry.h>

#include <cstddef>
#include <functional>
#include <memory>

namespace at {
class Context;
}

namespace at {

constexpr const char* XPU_HELP =
"The XPU backend requires Intel Extension for Pytorch;"
"this error has occurred because you are trying "
"to use some XPU's functionality, but the Intel Extension for Pytorch has not been "
"loaded for some reason. The Intel Extension for Pytorch MUST "
"be loaded, EVEN IF you don't directly use any symbols from that!";

struct TORCH_API XPUHooksInterface {
virtual ~XPUHooksInterface() {}

virtual void initXPU() const {
TORCH_CHECK(
false,
"Cannot initialize XPU without Intel Extension for Pytorch.",
XPU_HELP);
}

virtual bool hasXPU() const {
return false;
}

virtual std::string showConfig() const {
TORCH_CHECK(
false,
"Cannot query detailed XPU version without Intel Extension for Pytorch. ",
XPU_HELP);
}

virtual Device getATenDeviceFromDLPackDevice(
const DLDevice& dl_device,
void* data) const {
TORCH_CHECK(
false,
"Cannot get XPU device without Intel Extension for Pytorch. ",
XPU_HELP);
};

virtual DLDevice getDLPackDeviceFromATenDevice(
const Device& aten_device,
void* data) const {
TORCH_CHECK(
false,
"Cannot get XPU DL device without Intel Extension for Pytorch. ",
XPU_HELP);
};
};

struct TORCH_API XPUHooksArgs {};

C10_DECLARE_REGISTRY(XPUHooksRegistry, XPUHooksInterface, XPUHooksArgs);
#define REGISTER_XPU_HOOKS(clsname) \
C10_REGISTER_CLASS(XPUHooksRegistry, clsname, clsname)

namespace detail {
TORCH_API const XPUHooksInterface& getXPUHooks();
} // namespace detail
} // namespace at
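
A hypothetical out-of-tree registration, sketched under the assumption that a backend such as the Intel Extension for PyTorch provides the real implementation (names and method bodies below are placeholders, not its actual code). Note that getXPUHooks() creates the hooks via the registry key "XPUHooks", so the registered class must carry that name:

```cpp
#include <ATen/detail/XPUHooksInterface.h>

namespace at {
namespace xpu {
namespace detail {

// Hypothetical hooks implementation; method bodies are placeholders.
struct XPUHooks : public at::XPUHooksInterface {
  explicit XPUHooks(at::XPUHooksArgs) {}

  bool hasXPU() const override {
    return true;
  }

  Device getATenDeviceFromDLPackDevice(
      const DLDevice& dl_device,
      void* data) const override {
    // A real backend would ask the oneAPI/SYCL runtime which device and
    // context own the USM allocation behind `data`.
    (void)data;
    return Device(
        DeviceType::XPU, static_cast<c10::DeviceIndex>(dl_device.device_id));
  }

  DLDevice getDLPackDeviceFromATenDevice(
      const Device& aten_device,
      void* data) const override {
    (void)data;
    return DLDevice{kDLOneAPI, static_cast<int32_t>(aten_device.index())};
  }
};

// getXPUHooks() looks up the registry key "XPUHooks", which matches the
// class name used here.
REGISTER_XPU_HOOKS(XPUHooks);

} // namespace detail
} // namespace xpu
} // namespace at
```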
44 changes: 37 additions & 7 deletions aten/src/ATen/dlpack.h
@@ -6,14 +6,20 @@
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_

/**
* \brief Compatibility with C++
*/
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif

/*! \brief The current version of dlpack */
#define DLPACK_VERSION 60
#define DLPACK_VERSION 70

/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@@ -35,7 +41,11 @@ extern "C" {
/*!
* \brief The device type in DLDevice.
*/
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
/*! \brief CPU device */
kDLCPU = 1,
/*! \brief CUDA GPU device */
@@ -68,6 +78,17 @@ typedef enum {
* \brief CUDA managed/unified memory allocated by cudaMallocManaged
*/
kDLCUDAManaged = 13,
/*!
* \brief Unified shared memory allocated on a oneAPI non-partitioned
* device. Call to oneAPI runtime is required to determine the device
* type, the USM allocation type and the sycl context it is bound to.
*
*/
kDLOneAPI = 14,
/*! \brief GPU support for next generation WebGPU standard. */
kDLWebGPU = 15,
/*! \brief Qualcomm Hexagon DSP */
kDLHexagon = 16,
} DLDeviceType;

/*!
@@ -80,7 +101,7 @@ typedef struct {
* \brief The device index.
* For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
*/
int device_id;
int32_t device_id;
} DLDevice;

/*!
@@ -109,7 +130,9 @@ typedef enum {
} DLDataTypeCode;

/*!
* \brief The data type the tensor can hold.
* \brief The data type the tensor can hold. The data type is assumed to follow
* the native endian-ness. An explicit error message should be raised when
* attempting to export an array with non-native endianness
*
* Examples
* - float: type_code = 2, bits = 32, lanes=1
@@ -137,9 +160,16 @@ typedef struct {
*/
typedef struct {
/*!
* \brief The opaque data pointer points to the allocated data. This will be
* CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
* aligned to 256 bytes as in CUDA.
* \brief The data pointer points to the allocated data. This will be CUDA
* device pointer or cl_mem handle in OpenCL. It may be opaque on some device
* types. This pointer is always aligned to 256 bytes as in CUDA. The
* `byte_offset` field should be used to point to the beginning of the data.
*
* Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
* TVM, perhaps others) do not adhere to this 256 byte alignment requirement
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
* (after which this note will be updated); at the moment it is recommended
* to not rely on the data pointer being correctly aligned.
*
* For given DLTensor, the size of memory required to store the contents of
* data is calculated as follows:
@@ -159,7 +189,7 @@ typedef struct {
/*! \brief The device of the tensor */
DLDevice device;
/*! \brief Number of dimensions */
int ndim;
int32_t ndim;
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
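Purely as an illustration of the new kDLOneAPI entry (a sketch, not part of this diff), a producer describing an existing oneAPI USM buffer would fill the DLTensor header roughly as follows; the byte_offset note above still applies:

```cpp
#include <ATen/dlpack.h>

#include <cstdint>

// Sketch: describe an existing USM allocation `usm_ptr` holding
// shape[0] x shape[1] 32-bit floats on oneAPI device 0. The consumer must
// still query the oneAPI runtime for the USM allocation type and the owning
// SYCL context.
DLTensor describe_usm_buffer(void* usm_ptr, int64_t* shape /* length 2 */) {
  DLTensor t{};
  t.data = usm_ptr;
  t.device = DLDevice{kDLOneAPI, /*device_id=*/0};
  t.ndim = 2;
  t.dtype = DLDataType{kDLFloat, /*bits=*/32, /*lanes=*/1};
  t.shape = shape;
  t.strides = nullptr;  // compact row-major layout
  t.byte_offset = 0;    // see the alignment note above
  return t;
}
```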
4 changes: 3 additions & 1 deletion aten/src/ATen/ops/from_blob.h
@@ -127,10 +127,12 @@ inline Tensor from_blob(
void* data,
IntArrayRef sizes,
const std::function<void(void*)>& deleter,
const TensorOptions& options = {}) {
const TensorOptions& options = {},
const c10::optional<Device> target_device = c10::nullopt) {
return for_blob(data, sizes)
.deleter(deleter)
.options(options)
.target_device(target_device)
.make_tensor();
}

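A brief sketch of the extended overload (helper name and values are illustrative): the new optional target_device argument lets the caller state the resulting tensor's device explicitly, which is what fromDLPack now does for imported buffers.

```cpp
#include <ATen/ATen.h>

// Sketch: wrap an externally owned buffer without copying, passing the target
// device explicitly through the new optional argument.
at::Tensor wrap_external(void* data, at::IntArrayRef sizes, at::Device device) {
  return at::from_blob(
      data,
      sizes,
      /*deleter=*/[](void*) { /* external allocator owns the memory */ },
      at::TensorOptions().dtype(at::kFloat).device(device),
      /*target_device=*/device);
}
```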
1 change: 1 addition & 0 deletions build_variables.bzl
@@ -1062,6 +1062,7 @@ aten_cpu_source_non_codegen_list = [
"aten/src/ATen/detail/CUDAHooksInterface.cpp",
"aten/src/ATen/detail/HIPHooksInterface.cpp",
"aten/src/ATen/detail/ORTHooksInterface.cpp",
"aten/src/ATen/detail/XPUHooksInterface.cpp",
"aten/src/ATen/metal/Context.cpp",
"aten/src/ATen/native/AutogradComposite.cpp",
"aten/src/ATen/native/DispatchStub.cpp",