
Commit

Add DLPack support for XPU backend by mapping to kDLOneAPI in DLPack (pytorch#82867)

## Motivation
The DLPack device type kDLOneAPI denotes Unified Shared Memory (USM) allocated on a oneAPI device; the corresponding PyTorch backend type is XPU.
This change adds support for exporting and importing PyTorch XPU tensors as DLPack tensors with the kDLOneAPI device type.

## Solution
1. Update the bundled dlpack.h header to DLPack protocol v0.7.
2. Add XPU hooks that map between ATen devices and DLPack devices using the data pointer and the device information (see the sketch after this list).
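
The flow this enables can be pictured with a short sketch (illustrative only, not code from this commit; it assumes an XPU-enabled build with the Intel Extension for PyTorch loaded so that the XPU backend and hooks are registered):

```cpp
#include <ATen/ATen.h>
#include <ATen/DLConvertor.h>

// Sketch: round-trip an XPU tensor through DLPack using the paths touched by
// this commit. The function name is illustrative.
void xpu_dlpack_roundtrip() {
  at::Tensor src = at::empty(
      {2, 3}, at::TensorOptions().device(at::Device(at::DeviceType::XPU, 0)));

  // Export: getDLDevice() delegates to getDLPackDeviceFromATenDevice(), so the
  // resulting DLPack tensor reports device_type == kDLOneAPI.
  DLManagedTensor* dlmt = at::toDLPack(src);

  // Import: getATenDevice() now also forwards the data pointer, letting the
  // oneAPI runtime resolve which SYCL device/context owns the USM allocation.
  at::Tensor dst = at::fromDLPack(dlmt);
  TORCH_CHECK(dst.device().type() == at::DeviceType::XPU);
}
```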

Pull Request resolved: pytorch#82867
Approved by: https://github.com/kit1980
chengjunlu authored and pytorchmergebot committed Aug 5, 2022
1 parent bfebf25 commit de0e030
Showing 11 changed files with 208 additions and 19 deletions.
8 changes: 8 additions & 0 deletions aten/src/ATen/Context.h
@@ -9,6 +9,7 @@
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/detail/HIPHooksInterface.h>
#include <ATen/detail/ORTHooksInterface.h>
#include <ATen/detail/XPUHooksInterface.h>
#include <c10/core/QEngine.h>
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/util/CallOnce.h>
@@ -89,6 +90,9 @@ class TORCH_API Context {
static bool hasXLA() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::XLA);
}
static bool hasXPU() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::XPU);
}
static bool hasLazy() {
return c10::impl::hasDeviceGuardImpl(at::DeviceType::Lazy);
}
@@ -331,6 +335,10 @@ static inline bool hasORT() {
return globalContext().hasORT();
}

static inline bool hasXPU() {
return globalContext().hasXPU();
}

// Despite its name, this function returns the number of *CUDA* GPUs.
static inline size_t getNumGPUs() {
// WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS
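The new at::hasXPU() helper only reports whether a DeviceGuardImpl has been registered for the XPU device type; a caller might guard XPU-specific paths like this sketch (function name illustrative):

```cpp
#include <ATen/Context.h>
#include <c10/util/Exception.h>

// Sketch: hasXPU() is true only when a backend (e.g. the Intel Extension for
// PyTorch) has registered a device guard for at::DeviceType::XPU.
void require_xpu() {
  TORCH_CHECK(at::hasXPU(), "XPU backend is not available in this process");
}
```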
16 changes: 12 additions & 4 deletions aten/src/ATen/DLConvertor.cpp
@@ -90,13 +90,17 @@ DLDevice getDLDevice(const Tensor& tensor, const int64_t& device_id) {
case DeviceType::HIP:
ctx.device_type = DLDeviceType::kDLROCM;
break;
case DeviceType::XPU:
ctx = at::detail::getXPUHooks().getDLPackDeviceFromATenDevice(
tensor.device(), tensor.data_ptr());
break;
default:
TORCH_CHECK(false, "Cannot pack tensors on " + tensor.device().str());
}
return ctx;
}

static Device getATenDevice(const DLDevice& ctx) {
static Device getATenDevice(const DLDevice& ctx, void* data) {
switch (ctx.device_type) {
case DLDeviceType::kDLCPU:
return at::Device(DeviceType::CPU);
@@ -114,6 +118,8 @@ static Device getATenDevice(const DLDevice& ctx) {
#else
return at::Device(DeviceType::HIP, ctx.device_id);
#endif
case DLDeviceType::kDLOneAPI:
return at::detail::getXPUHooks().getATenDeviceFromDLPackDevice(ctx, data);
default:
TORCH_CHECK(
false, "Unsupported device_type: " + c10::to_string(ctx.device_type));
@@ -238,17 +244,19 @@ DLManagedTensor* toDLPack(const Tensor& src) {
}

Tensor fromDLPack(const DLManagedTensor* src) {
Device device = getATenDevice(src->dl_tensor.device);
Device device = getATenDevice(src->dl_tensor.device, src->dl_tensor.data);
ScalarType stype = toScalarType(src->dl_tensor.dtype);
auto deleter = [src](void* self) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
src->deleter(const_cast<DLManagedTensor*>(src));
};
if (!src->dl_tensor.strides) {
return at::from_blob(src->dl_tensor.data,
return at::from_blob(
src->dl_tensor.data,
IntArrayRef(src->dl_tensor.shape, src->dl_tensor.ndim),
deleter,
at::device(device).dtype(stype));
at::device(device).dtype(stype),
{device});
}
return at::from_blob(
src->dl_tensor.data,
4 changes: 4 additions & 0 deletions aten/src/ATen/Version.cpp
@@ -195,6 +195,10 @@ std::string show_config() {
ss << detail::getORTHooks().showConfig();
}

if (hasXPU()) {
ss << detail::getXPUHooks().showConfig();
}

ss << " - Build settings: ";
for (const auto& pair : caffe2::GetBuildOptions()) {
if (!pair.second.empty()) {
28 changes: 28 additions & 0 deletions aten/src/ATen/detail/XPUHooksInterface.cpp
@@ -0,0 +1,28 @@
#include <ATen/detail/XPUHooksInterface.h>

#include <c10/util/CallOnce.h>

#include <memory>
#include <mutex>

namespace at {
namespace detail {

static XPUHooksInterface *xpu_hooks = nullptr;

const XPUHooksInterface &getXPUHooks() {
static c10::once_flag once;
c10::call_once(once, [] {
xpu_hooks =
XPUHooksRegistry()->Create("XPUHooks", XPUHooksArgs{}).release();
if (!xpu_hooks) {
xpu_hooks = new XPUHooksInterface();
}
});
return *xpu_hooks;
}
} // namespace detail

C10_DEFINE_REGISTRY(XPUHooksRegistry, XPUHooksInterface, XPUHooksArgs)

} // namespace at
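
A small sketch of how the lazy fallback above behaves when nothing is registered (function name illustrative):

```cpp
#include <ATen/detail/XPUHooksInterface.h>

// Sketch: with no class registered under "XPUHooks", getXPUHooks() hands back
// the base XPUHooksInterface, whose queries are safe no-ops but whose
// XPU-specific entry points throw with the XPU_HELP message.
void probe_xpu_hooks() {
  const at::XPUHooksInterface& hooks = at::detail::getXPUHooks();
  if (hooks.hasXPU()) {  // false for the default interface
    hooks.initXPU();     // only reachable once real hooks are registered
  }
}
```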
75 changes: 75 additions & 0 deletions aten/src/ATen/detail/XPUHooksInterface.h
@@ -0,0 +1,75 @@
#pragma once

#include <ATen/dlpack.h>
#include <c10/core/Device.h>
#include <c10/util/Exception.h>

#include <c10/util/Registry.h>

#include <cstddef>
#include <functional>
#include <memory>

namespace at {
class Context;
}

namespace at {

constexpr const char* XPU_HELP =
"The XPU backend requires Intel Extension for Pytorch;"
"this error has occurred because you are trying "
"to use some XPU's functionality, but the Intel Extension for Pytorch has not been "
"loaded for some reason. The Intel Extension for Pytorch MUST "
"be loaded, EVEN IF you don't directly use any symbols from that!";

struct TORCH_API XPUHooksInterface {
virtual ~XPUHooksInterface() {}

virtual void initXPU() const {
TORCH_CHECK(
false,
"Cannot initialize XPU without Intel Extension for Pytorch.",
XPU_HELP);
}

virtual bool hasXPU() const {
return false;
}

virtual std::string showConfig() const {
TORCH_CHECK(
false,
"Cannot query detailed XPU version without Intel Extension for Pytorch. ",
XPU_HELP);
}

virtual Device getATenDeviceFromDLPackDevice(
const DLDevice& dl_device,
void* data) const {
TORCH_CHECK(
false,
"Cannot get XPU device without Intel Extension for Pytorch. ",
XPU_HELP);
};

virtual DLDevice getDLPackDeviceFromATenDevice(
const Device& aten_device,
void* data) const {
TORCH_CHECK(
false,
"Cannot get XPU DL device without Intel Extension for Pytorch. ",
XPU_HELP);
};
};

struct TORCH_API XPUHooksArgs {};

C10_DECLARE_REGISTRY(XPUHooksRegistry, XPUHooksInterface, XPUHooksArgs);
#define REGISTER_XPU_HOOKS(clsname) \
C10_REGISTER_CLASS(XPUHooksRegistry, clsname, clsname)

namespace detail {
TORCH_API const XPUHooksInterface& getXPUHooks();
} // namespace detail
} // namespace at
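
A hypothetical out-of-tree registration, sketched under the assumption that a backend such as the Intel Extension for PyTorch provides the real implementation (names and method bodies below are placeholders, not its actual code). Note that getXPUHooks() creates the hooks via the registry key "XPUHooks", so the registered class must carry that name:

```cpp
#include <ATen/detail/XPUHooksInterface.h>

namespace at {
namespace xpu {
namespace detail {

// Hypothetical hooks implementation; method bodies are placeholders.
struct XPUHooks : public at::XPUHooksInterface {
  explicit XPUHooks(at::XPUHooksArgs) {}

  bool hasXPU() const override {
    return true;
  }

  Device getATenDeviceFromDLPackDevice(
      const DLDevice& dl_device,
      void* data) const override {
    // A real backend would ask the oneAPI/SYCL runtime which device and
    // context own the USM allocation behind `data`.
    (void)data;
    return Device(
        DeviceType::XPU, static_cast<c10::DeviceIndex>(dl_device.device_id));
  }

  DLDevice getDLPackDeviceFromATenDevice(
      const Device& aten_device,
      void* data) const override {
    (void)data;
    return DLDevice{kDLOneAPI, static_cast<int32_t>(aten_device.index())};
  }
};

// getXPUHooks() looks up the registry key "XPUHooks", which matches the
// class name used here.
REGISTER_XPU_HOOKS(XPUHooks);

} // namespace detail
} // namespace xpu
} // namespace at
```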
44 changes: 37 additions & 7 deletions aten/src/ATen/dlpack.h
@@ -6,14 +6,20 @@
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_

/**
* \brief Compatibility with C++
*/
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif

/*! \brief The current version of dlpack */
#define DLPACK_VERSION 60
#define DLPACK_VERSION 70

/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@@ -35,7 +41,11 @@ extern "C" {
/*!
* \brief The device type in DLDevice.
*/
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
/*! \brief CPU device */
kDLCPU = 1,
/*! \brief CUDA GPU device */
@@ -68,6 +78,17 @@ typedef enum {
* \brief CUDA managed/unified memory allocated by cudaMallocManaged
*/
kDLCUDAManaged = 13,
/*!
* \brief Unified shared memory allocated on a oneAPI non-partitioned
* device. Call to oneAPI runtime is required to determine the device
* type, the USM allocation type and the sycl context it is bound to.
*
*/
kDLOneAPI = 14,
/*! \brief GPU support for next generation WebGPU standard. */
kDLWebGPU = 15,
/*! \brief Qualcomm Hexagon DSP */
kDLHexagon = 16,
} DLDeviceType;

/*!
@@ -80,7 +101,7 @@ typedef struct {
* \brief The device index.
* For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
*/
int device_id;
int32_t device_id;
} DLDevice;

/*!
@@ -109,7 +130,9 @@ typedef enum {
} DLDataTypeCode;

/*!
* \brief The data type the tensor can hold.
* \brief The data type the tensor can hold. The data type is assumed to follow
* the native endian-ness. An explicit error message should be raised when
* attempting to export an array with non-native endianness
*
* Examples
* - float: type_code = 2, bits = 32, lanes=1
@@ -137,9 +160,16 @@ typedef struct {
*/
typedef struct {
/*!
* \brief The opaque data pointer points to the allocated data. This will be
* CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
* aligned to 256 bytes as in CUDA.
* \brief The data pointer points to the allocated data. This will be CUDA
* device pointer or cl_mem handle in OpenCL. It may be opaque on some device
* types. This pointer is always aligned to 256 bytes as in CUDA. The
* `byte_offset` field should be used to point to the beginning of the data.
*
* Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
* TVM, perhaps others) do not adhere to this 256 byte alignment requirement
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
* (after which this note will be updated); at the moment it is recommended
* to not rely on the data pointer being correctly aligned.
*
* For given DLTensor, the size of memory required to store the contents of
* data is calculated as follows:
@@ -159,7 +189,7 @@ typedef struct {
/*! \brief The device of the tensor */
DLDevice device;
/*! \brief Number of dimensions */
int ndim;
int32_t ndim;
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
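Purely as an illustration of the new kDLOneAPI entry (a sketch, not part of this diff), a producer describing an existing oneAPI USM buffer would fill the DLTensor header roughly as follows; the byte_offset note above still applies:

```cpp
#include <ATen/dlpack.h>

#include <cstdint>

// Sketch: describe an existing USM allocation `usm_ptr` holding
// shape[0] x shape[1] 32-bit floats on oneAPI device 0. The consumer must
// still query the oneAPI runtime for the USM allocation type and the owning
// SYCL context.
DLTensor describe_usm_buffer(void* usm_ptr, int64_t* shape /* length 2 */) {
  DLTensor t{};
  t.data = usm_ptr;
  t.device = DLDevice{kDLOneAPI, /*device_id=*/0};
  t.ndim = 2;
  t.dtype = DLDataType{kDLFloat, /*bits=*/32, /*lanes=*/1};
  t.shape = shape;
  t.strides = nullptr;  // compact row-major layout
  t.byte_offset = 0;    // see the alignment note above
  return t;
}
```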
4 changes: 3 additions & 1 deletion aten/src/ATen/ops/from_blob.h
@@ -127,10 +127,12 @@ inline Tensor from_blob(
void* data,
IntArrayRef sizes,
const std::function<void(void*)>& deleter,
const TensorOptions& options = {}) {
const TensorOptions& options = {},
const c10::optional<Device> target_device = c10::nullopt) {
return for_blob(data, sizes)
.deleter(deleter)
.options(options)
.target_device(target_device)
.make_tensor();
}

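A brief sketch of the extended overload (helper name and values are illustrative): the new optional target_device argument lets the caller state the resulting tensor's device explicitly, which is what fromDLPack now does for imported buffers.

```cpp
#include <ATen/ATen.h>

// Sketch: wrap an externally owned buffer without copying, passing the target
// device explicitly through the new optional argument.
at::Tensor wrap_external(void* data, at::IntArrayRef sizes, at::Device device) {
  return at::from_blob(
      data,
      sizes,
      /*deleter=*/[](void*) { /* external allocator owns the memory */ },
      at::TensorOptions().dtype(at::kFloat).device(device),
      /*target_device=*/device);
}
```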
1 change: 1 addition & 0 deletions build_variables.bzl
@@ -1062,6 +1062,7 @@ aten_cpu_source_non_codegen_list = [
"aten/src/ATen/detail/CUDAHooksInterface.cpp",
"aten/src/ATen/detail/HIPHooksInterface.cpp",
"aten/src/ATen/detail/ORTHooksInterface.cpp",
"aten/src/ATen/detail/XPUHooksInterface.cpp",
"aten/src/ATen/metal/Context.cpp",
"aten/src/ATen/native/AutogradComposite.cpp",
"aten/src/ATen/native/DispatchStub.cpp",