Build SYCL kernels for ATen XPU ops on Native Windows (take 2) (pytorch#127390)

Original PR pytorch#126725 was closed due to a bad rebase.

-------
As proposed in pytorch#126719, we are enabling PyTorch XPU on Native Windows on Intel GPU.

This PR enables the XPU build on Windows as the first step of pytorch#126719:

- Enable the `USE_XPU` build on Windows, using MSVC as the host compiler. Using MSVC as the host compiler aligns seamlessly with the existing PyTorch build on Windows.
- Build the oneDNN GPU library on Windows.

Co-authored-by: Yu, Guangye <guangye.yu@intel.com>
Pull Request resolved: pytorch#127390
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/ezyang
min-jean-cho authored and pytorchmergebot committed Jun 6, 2024
1 parent 6adcf21 commit b4a0161
Showing 15 changed files with 110 additions and 20 deletions.
3 changes: 1 addition & 2 deletions CMakeLists.txt
@@ -242,8 +242,7 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_CUDA "Use CUDA" ON)
-cmake_dependent_option(USE_XPU "Use XPU. Only available on Linux." ON "LINUX"
-    OFF)
+option(USE_XPU "Use XPU" ON)
cmake_dependent_option(
BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON
"USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
2 changes: 1 addition & 1 deletion README.md
@@ -189,7 +189,7 @@ Other potentially useful environment variables may be found in `setup.py`.
##### Intel GPU Support
If you want to compile with Intel GPU support, follow these
- [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) instructions.
-- Intel GPU is currently supported only for Linux systems.
+- Intel GPU is supported for Linux and Windows.

If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
Other potentially useful environment variables may be found in `setup.py`.
6 changes: 3 additions & 3 deletions aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h
@@ -12,7 +12,7 @@

namespace at::native::onednn {

-TORCH_API dnnl::memory make_onednn_memory(
+TORCH_XPU_API dnnl::memory make_onednn_memory(
dnnl::memory::desc md,
dnnl::engine& engine,
void* ptr);
@@ -21,7 +21,7 @@ TORCH_API dnnl::memory make_onednn_memory(
bool set_onednn_verbose(int level);

// GpuEngineManager singleton
-struct TORCH_API GpuEngineManager {
+struct TORCH_XPU_API GpuEngineManager {
static GpuEngineManager& Instance(); // Singleton

dnnl::engine& get_engine(const Device& device) {
@@ -51,7 +51,7 @@ struct TORCH_API GpuEngineManager {
};

// GpuStreamManager singleton
-struct TORCH_API GpuStreamManager {
+struct TORCH_XPU_API GpuStreamManager {
static GpuStreamManager& Instance(); // Singleton

dnnl::stream get_stream() {
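These two singletons are how the rest of the XPU backend obtains oneDNN engines and streams, so on Windows their symbols must be exported from the torch_xpu DLL, hence the switch to `TORCH_XPU_API`. A minimal usage sketch grounded in the header above; the `example` wrapper itself is hypothetical, not code from this PR:

```cpp
// Usage sketch for the singletons declared above. The include path and
// singleton APIs come from this header; `example` is a hypothetical caller.
#include <ATen/native/mkldnn/xpu/detail/oneDNNContext.h>

void example() {
  at::Device dev(at::kXPU, /*index=*/0);
  // Engine tied to the given XPU device, shared process-wide.
  dnnl::engine& eng =
      at::native::onednn::GpuEngineManager::Instance().get_engine(dev);
  // Stream on which oneDNN primitives for that device execute.
  dnnl::stream stream =
      at::native::onednn::GpuStreamManager::Instance().get_stream();
  (void)eng;
  (void)stream;
}
```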
2 changes: 1 addition & 1 deletion aten/src/ATen/xpu/XPUGeneratorImpl.h
@@ -4,7 +4,7 @@

namespace at {

-struct TORCH_API XPUGeneratorImpl : public GeneratorImpl {
+struct TORCH_XPU_API XPUGeneratorImpl : public GeneratorImpl {
// Constructors
XPUGeneratorImpl(DeviceIndex device_index = -1);
~XPUGeneratorImpl() override = default;
12 changes: 12 additions & 0 deletions aten/src/ATen/xpu/detail/XPUHooks.cpp
@@ -25,7 +25,13 @@ std::string XPUHooks::showConfig() const {

int32_t XPUHooks::getGlobalIdxFromDevice(const at::Device& device) const {
TORCH_CHECK(device.is_xpu(), "Only the XPU device type is expected.");
+#ifdef _WIN32
+  TORCH_CHECK(
+      false,
+      "Default context is not supported on XPU on Windows. So we can NOT find its global index of the ATen device.");
+#else
   return at::xpu::getGlobalIdxFromDevice(device.index());
+#endif
}

Generator XPUHooks::getXPUGenerator(DeviceIndex device_index) const {
@@ -38,7 +44,13 @@ const Generator& XPUHooks::getDefaultXPUGenerator(
}

Device XPUHooks::getDeviceFromPtr(void* data) const {
+#ifdef _WIN32
+  TORCH_CHECK(
+      false,
+      "Default context is not supported on XPU on Windows. So we can NOT find the ATen device of a pointer.");
+#else
   return at::xpu::getDeviceFromPtr(data);
+#endif
}

c10::DeviceIndex XPUHooks::getNumGPUs() const {
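Both guarded hooks lean on the oneAPI default-context extension: when all Intel GPUs of a platform share one context, a raw USM pointer can be mapped back to its device without knowing which queue allocated it. A sketch of what the Linux path relies on, in plain SYCL; `device_of` is a hypothetical helper and the extension call assumes DPC++:

```cpp
// Sketch of the default-context lookup the Linux branch depends on.
// Assumes the DPC++ ext_oneapi_get_default_context() extension;
// device_of is a hypothetical helper, not part of this PR.
#include <sycl/sycl.hpp>

sycl::device device_of(const void* ptr) {
  // One shared context per platform means one lookup covers every GPU.
  sycl::context ctx = sycl::platform(sycl::gpu_selector_v)
                          .ext_oneapi_get_default_context();
  // Map the USM allocation back to the device it lives on.
  return sycl::get_pointer_device(ptr, ctx);
}
```

Without a per-platform default context on Windows there is no single context to query, which is why both hooks fail loudly instead of guessing.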
6 changes: 6 additions & 0 deletions c10/util/Float8_fnuz_cvt.h
@@ -4,6 +4,10 @@

#include <cstdint>

+#if defined(SYCL_LANGUAGE_VERSION)
+#include <sycl/sycl.hpp>
+#endif

namespace c10::detail {

/*
@@ -33,6 +37,8 @@ inline C10_HOST_DEVICE float fp8_fnuz_to_fp32_value(uint8_t x) {
// guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
uint32_t renorm_shift = __clz(mantissa);
+#elif defined(__SYCL_DEVICE_ONLY__)
+  uint32_t renorm_shift = sycl::clz(mantissa);
#elif defined(_MSC_VER)
unsigned long nonsign_bsr;
_BitScanReverse(&nonsign_bsr, (unsigned long)mantissa);
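The new branch gives SYCL device code the same count-leading-zeros renormalization that the CUDA/HIP branch gets from `__clz`. For intuition, a host-side C++20 sketch of the same computation (illustrative only; the real file stays preprocessor-selected):

```cpp
// Host-side C++20 equivalent of the renorm_shift above: count the
// leading zeros of the subnormal mantissa so it can be shifted into
// normalized position. Produces the same value as __clz / sycl::clz.
#include <bit>
#include <cstdint>

inline uint32_t renorm_shift(uint32_t mantissa) {
  return static_cast<uint32_t>(std::countl_zero(mantissa));
}
```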
10 changes: 10 additions & 0 deletions c10/xpu/CMakeLists.txt
@@ -8,6 +8,12 @@ if(NOT BUILD_LIBTORCHLESS)
find_library(C10_XPU_LIB c10_xpu PATHS $ENV{LIBTORCH_LIB_PATH} NO_DEFAULT_PATH)
endif()

+# ---[ Configure macro file.
+set(C10_XPU_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # used in xpu_cmake_macros.h.in
+configure_file(
+    ${CMAKE_CURRENT_LIST_DIR}/impl/xpu_cmake_macros.h.in
+    ${CMAKE_BINARY_DIR}/c10/xpu/impl/xpu_cmake_macros.h)

set(C10_XPU_SRCS
XPUCachingAllocator.cpp
XPUFunctions.cpp
@@ -50,3 +56,7 @@ foreach(file ${C10_XPU_HEADERS})
get_filename_component(dir ${file} DIRECTORY)
install(FILES ${file} DESTINATION include/c10/xpu/${dir})
endforeach()

+if(MSVC AND C10_XPU_BUILD_SHARED_LIBS)
+  install(FILES $<TARGET_PDB_FILE:c10_xpu> DESTINATION lib OPTIONAL)
+endif()
12 changes: 10 additions & 2 deletions c10/xpu/XPUFunctions.cpp
@@ -2,8 +2,6 @@
#include <c10/util/Exception.h>
#include <c10/xpu/XPUFunctions.h>

-#include <sys/wait.h>
-#include <unistd.h>
#include <vector>

namespace c10::xpu {
@@ -53,10 +51,20 @@ inline void initGlobalDevicePoolState() {
return;
}

+#ifdef _WIN32
+  // default context feature is disabled by default on Windows.
+  std::vector<sycl::device> deviceList;
+  for (auto it = gDevicePool.devices.begin(); it != gDevicePool.devices.end();
+       ++it) {
+    deviceList.push_back(*(*it));
+  }
+  gDevicePool.context = std::make_unique<sycl::context>(deviceList);
+#else
// The default context is utilized for each Intel GPU device, allowing the
// retrieval of the context from any GPU device.
gDevicePool.context = std::make_unique<sycl::context>(
gDevicePool.devices[0]->get_platform().ext_oneapi_get_default_context());
+#endif
}

inline void initDevicePoolCallOnce() {
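On Windows the new branch replaces the unavailable default-context extension with a single explicitly constructed `sycl::context` spanning every discovered device, so downstream code can keep assuming one shared context. The two strategies condensed into plain SYCL; `make_pool_context` is a hypothetical helper, not PR code:

```cpp
// Condensed sketch of the two context strategies above.
#include <sycl/sycl.hpp>

#include <vector>

sycl::context make_pool_context(const std::vector<sycl::device>& gpus) {
#ifdef _WIN32
  // Windows: the default-context feature is disabled by default, so
  // build one context that explicitly spans all discovered devices.
  return sycl::context(gpus);
#else
  // Linux: all devices of a platform share the default context, so any
  // device can hand it back (DPC++ extension).
  return gpus.front().get_platform().ext_oneapi_get_default_context();
#endif
}
```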
14 changes: 14 additions & 0 deletions c10/xpu/XPUMacros.h
@@ -1,15 +1,29 @@
#pragma once

+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+#include <c10/xpu/impl/xpu_cmake_macros.h>
+#endif

// See c10/macros/Export.h for a detailed explanation of what the function
// of these macros are. We need one set of macros for every separate library
// we build.

+#ifdef _WIN32
+#if defined(C10_XPU_BUILD_SHARED_LIBS)
+#define C10_XPU_EXPORT __declspec(dllexport)
+#define C10_XPU_IMPORT __declspec(dllimport)
+#else
+#define C10_XPU_EXPORT
+#define C10_XPU_IMPORT
+#endif
+#else // _WIN32
#if defined(__GNUC__)
#define C10_XPU_EXPORT __attribute__((__visibility__("default")))
#else // defined(__GNUC__)
#define C10_XPU_EXPORT
#endif // defined(__GNUC__)
#define C10_XPU_IMPORT C10_XPU_EXPORT
+#endif // _WIN32

// This one is being used by libc10_xpu.so
#ifdef C10_XPU_BUILD_MAIN_LIB
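On Windows a symbol must be compiled with `__declspec(dllexport)` in the DLL that defines it and seen as `__declspec(dllimport)` everywhere else, while on Linux one default-visibility attribute serves both sides; that asymmetry is exactly what the new branch encodes. A sketch of how the pair presumably resolves through the `C10_XPU_BUILD_MAIN_LIB` switch the header mentions (the declared function is hypothetical):

```cpp
// Sketch: C10_XPU_BUILD_MAIN_LIB is defined only while compiling
// libc10_xpu itself, so one declaration exports from the library and
// imports into every consumer. Mirrors the c10/macros/Export.h scheme.
#ifdef C10_XPU_BUILD_MAIN_LIB
#define C10_XPU_API C10_XPU_EXPORT
#else
#define C10_XPU_API C10_XPU_IMPORT
#endif

C10_XPU_API void example_xpu_entry_point(); // hypothetical declaration
```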
6 changes: 6 additions & 0 deletions c10/xpu/impl/xpu_cmake_macros.h.in
@@ -0,0 +1,6 @@
+#pragma once
+
+// Automatically generated header file for the C10 XPU library. Do not
+// include this file directly. Instead, include c10/xpu/XPUMacros.h
+
+#cmakedefine C10_XPU_BUILD_SHARED_LIBS
11 changes: 9 additions & 2 deletions caffe2/CMakeLists.txt
@@ -1062,8 +1062,15 @@ if(USE_XPU)
message(WARNING "Failed to include ATen XPU implementation target")
else()
target_link_libraries(torch_xpu PRIVATE torch_xpu_ops)
-    target_link_libraries(torch_xpu PRIVATE
-        "-Wl,--whole-archive,\"$<TARGET_FILE:torch_xpu_ops>\" -Wl,--no-whole-archive")
+    if(MSVC)
+      # Windows
+      target_link_libraries(torch_xpu PRIVATE
+          "-WHOLEARCHIVE:\"$<TARGET_FILE:torch_xpu_ops>\"")
+    else()
+      # Linux
+      target_link_libraries(torch_xpu PRIVATE
+          "-Wl,--whole-archive,\"$<TARGET_FILE:torch_xpu_ops>\" -Wl,--no-whole-archive")
+    endif()
endif()
endif()

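Whole-archive linking (`--whole-archive` with GNU ld, `-WHOLEARCHIVE` with MSVC's linker) forces every object in `torch_xpu_ops` into `torch_xpu`, presumably because kernel registrations live in static initializers that an ordinary link would discard as unreferenced. A generic illustration of that pattern; the registrar type and kernel name are invented for the example:

```cpp
// Generic illustration of why whole-archive linking matters: a static
// object whose constructor registers a kernel. If no translation unit
// references this archive member, a normal link drops it and the
// registration never runs; whole-archive linking keeps it alive.
#include <cstdio>

struct KernelRegistrar {
  explicit KernelRegistrar(const char* name) {
    std::printf("registering kernel: %s\n", name); // stand-in for a real registry
  }
};

static KernelRegistrar xpu_add_registrar{"aten::add.Tensor (XPU)"};
```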
4 changes: 2 additions & 2 deletions cmake/Dependencies.cmake
@@ -89,8 +89,8 @@ endif()
if(USE_XPU)
include(${CMAKE_CURRENT_LIST_DIR}/public/xpu.cmake)
if(NOT PYTORCH_FOUND_XPU)
-    # message(WARNING "Not compiling with XPU. Could NOT find SYCL."
-    # "Suppress this warning with -DUSE_XPU=OFF.")
+    message(WARNING "Not compiling with XPU. Could NOT find SYCL."
+        "Suppress this warning with -DUSE_XPU=OFF.")
caffe2_update_option(USE_XPU OFF)
endif()
endif()
19 changes: 12 additions & 7 deletions cmake/Modules/FindMKLDNN.cmake
@@ -21,10 +21,16 @@ IF(NOT MKLDNN_FOUND)

if(USE_XPU) # Build oneDNN GPU library
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-      set(DNNL_HOST_COMPILER "g++")
+      # Linux
+      # g++ is soft linked to /usr/bin/cxx, oneDNN would not treat it as an absolute path
+      set(DNNL_HOST_COMPILER "g++")
+      set(SYCL_CXX_DRIVER "icpx")
+      set(DNNL_LIB_NAME "libdnnl.a")
     else()
-      message(FATAL_ERROR "oneDNN library currently only supports GUN g++ compiler for XPU backend")
+      # Windows
+      set(DNNL_HOST_COMPILER "DEFAULT")
+      set(SYCL_CXX_DRIVER "icx")
+      set(DNNL_LIB_NAME "dnnl.lib")
endif()

set(DNNL_MAKE_COMMAND "cmake" "--build" ".")
@@ -41,8 +47,7 @@
PREFIX ${XPU_MKLDNN_DIR_PREFIX}
BUILD_IN_SOURCE 0
CMAKE_ARGS -DCMAKE_C_COMPILER=icx
-        -DCMAKE_CXX_COMPILER=icpx
-        -DCMAKE_CXX_COMPILER_ID=IntelLLVM
+        -DCMAKE_CXX_COMPILER=${SYCL_CXX_DRIVER}
-DDNNL_GPU_RUNTIME=SYCL
-DDNNL_CPU_RUNTIME=THREADPOOL
-DDNNL_BUILD_TESTS=OFF
@@ -52,20 +57,20 @@
-DDNNL_DPCPP_HOST_COMPILER=${DNNL_HOST_COMPILER} # Use global cxx compiler as host compiler
-G ${CMAKE_GENERATOR} # Align Generator to Torch
BUILD_COMMAND ${DNNL_MAKE_COMMAND}
BUILD_BYPRODUCTS "xpu_mkldnn_proj-prefix/src/xpu_mkldnn_proj-build/src/libdnnl.a"
BUILD_BYPRODUCTS "xpu_mkldnn_proj-prefix/src/xpu_mkldnn_proj-build/src/${DNNL_LIB_NAME}"
INSTALL_COMMAND ""
)

ExternalProject_Get_Property(xpu_mkldnn_proj BINARY_DIR)
set(__XPU_MKLDNN_BUILD_DIR ${BINARY_DIR})
-  set(XPU_MKLDNN_LIBRARIES ${__XPU_MKLDNN_BUILD_DIR}/src/libdnnl.a)
+  set(XPU_MKLDNN_LIBRARIES ${__XPU_MKLDNN_BUILD_DIR}/src/${DNNL_LIB_NAME})
set(XPU_MKLDNN_INCLUDE ${__XPU_MKLDNN_BUILD_DIR}/include)
# This target would be further linked to libtorch_xpu.so.
# The libtorch_xpu.so would contain Conv&GEMM operators that depend on
# oneDNN primitive implementations inside libdnnl.a.
add_library(xpu_mkldnn INTERFACE)
add_dependencies(xpu_mkldnn xpu_mkldnn_proj)
-  target_link_libraries(xpu_mkldnn INTERFACE ${__XPU_MKLDNN_BUILD_DIR}/src/libdnnl.a)
+  target_link_libraries(xpu_mkldnn INTERFACE ${__XPU_MKLDNN_BUILD_DIR}/src/${DNNL_LIB_NAME})
target_include_directories(xpu_mkldnn INTERFACE ${XPU_MKLDNN_INCLUDE})
endif()

17 changes: 17 additions & 0 deletions cmake/Modules/FindSYCLToolkit.cmake
@@ -55,6 +55,23 @@ find_library(
HINTS ${SYCL_LIBRARY_DIR}
NO_DEFAULT_PATH
)
+# On Windows, currently there's no sycl.lib. Only sycl7.lib with version suffix,
+# where the current version of the SYCL runtime is 7.
+# Until oneAPI adds support to sycl.lib without the version suffix,
+# sycl_runtime_version needs to be hardcoded and uplifted when SYCL runtime version uplifts.
+# TODO: remove this when sycl.lib is supported on Windows
+if(WIN32)
+  set(sycl_runtime_version 7)
+  find_library(
+    SYCL_LIBRARY
+    NAMES "sycl${sycl_runtime_version}"
+    HINTS ${SYCL_LIBRARY_DIR}
+    NO_DEFAULT_PATH
+  )
+  if(SYCL_LIBRARY STREQUAL "SYCL_LIBRARY-NOTFOUND")
+    message(FATAL_ERROR "Cannot find a SYCL library on Windows")
+  endif()
+endif()

find_library(
OCL_LIBRARY
6 changes: 6 additions & 0 deletions torch/csrc/xpu/Module.cpp
@@ -11,24 +11,30 @@
#include <torch/csrc/utils/python_numbers.h>
#include <torch/csrc/utils/python_strings.h>

+#ifndef WIN32
 #include <pthread.h>
+#endif

using namespace torch;

static bool in_bad_fork = false; // True for children forked after xpu init

+#ifndef WIN32
// Called in the forked child if xpu has already been initialized
static void forked_child() {
in_bad_fork = true;
torch::utils::set_requires_device_init(at::kXPU, true);
}
+#endif

// Should be called before the first xpu call. It is mainly called in lazy_init.
// Note: This is distinct from initExtension because a stub xpu implementation
// has some working functions (e.g. device_count) but cannot fully initialize.
static void poison_fork() {
+#ifndef WIN32
static c10::once_flag flag;
c10::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
+#endif
}

// XPU management methods
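`pthread_atfork` is POSIX-only and Windows has no `fork`, so the new `WIN32` guards simply compile the poisoning machinery out. A standalone sketch of the pattern; it uses `_WIN32` and `std::call_once` in place of the file's `WIN32` and `c10::call_once`:

```cpp
// Standalone sketch of the poison-fork pattern above: register an
// atfork handler once so a child forked after device init knows its
// device state is invalid.
#ifndef _WIN32
#include <pthread.h>

#include <mutex>

static bool in_bad_fork = false; // true in children forked after init

static void forked_child() {
  in_bad_fork = true; // device handles do not survive fork()
}

static void poison_fork() {
  static std::once_flag flag;
  std::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
}
#endif
```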
