Build SYCL kernels for ATen XPU ops on Native Windows (take 2) (pytorch#127390)

Original PR pytorch#126725 was closed due to a bad rebase.

-------
As proposed in pytorch#126719, we are enabling PyTorch XPU on Native Windows on Intel GPU.

This PR enables the XPU build on Windows as the first step of pytorch#126719:

- Enable the `USE_XPU` build on Windows, using MSVC as the host compiler. Using MSVC as the host compiler aligns seamlessly with the existing PyTorch build on Windows.
- Build the oneDNN GPU library on Windows.

Co-authored-by: Yu, Guangye <guangye.yu@intel.com>
Pull Request resolved: pytorch#127390
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/ezyang
min-jean-cho authored and pytorchmergebot committed Jun 6, 2024
1 parent 6adcf21 commit b4a0161
Showing 15 changed files with 110 additions and 20 deletions.
3 changes: 1 addition & 2 deletions CMakeLists.txt
@@ -242,8 +242,7 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_CUDA "Use CUDA" ON)
-cmake_dependent_option(USE_XPU "Use XPU. Only available on Linux." ON "LINUX"
-    OFF)
+option(USE_XPU "Use XPU" ON)
cmake_dependent_option(
BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON
"USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
2 changes: 1 addition & 1 deletion README.md
@@ -189,7 +189,7 @@ Other potentially useful environment variables may be found in `setup.py`.
##### Intel GPU Support
If you want to compile with Intel GPU support, follow these
- [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) instructions.
-- Intel GPU is currently supported only for Linux systems.
+- Intel GPU is supported for Linux and Windows.

If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
Other potentially useful environment variables may be found in `setup.py`.
6 changes: 3 additions & 3 deletions aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h
@@ -12,7 +12,7 @@

namespace at::native::onednn {

-TORCH_API dnnl::memory make_onednn_memory(
+TORCH_XPU_API dnnl::memory make_onednn_memory(
dnnl::memory::desc md,
dnnl::engine& engine,
void* ptr);
@@ -21,7 +21,7 @@ TORCH_API dnnl::memory make_onednn_memory(
bool set_onednn_verbose(int level);

// GpuEngineManager singleton
-struct TORCH_API GpuEngineManager {
+struct TORCH_XPU_API GpuEngineManager {
static GpuEngineManager& Instance(); // Singleton

dnnl::engine& get_engine(const Device& device) {
@@ -51,7 +51,7 @@ struct TORCH_API GpuEngineManager {
};

// GpuStreamManager singleton
-struct TORCH_API GpuStreamManager {
+struct TORCH_XPU_API GpuStreamManager {
static GpuStreamManager& Instance(); // Singleton

dnnl::stream get_stream() {
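These two singletons are how the rest of the XPU backend obtains oneDNN engines and streams, so on Windows their symbols must be exported from the torch_xpu DLL, hence the switch to `TORCH_XPU_API`. A minimal usage sketch grounded in the header above; the `example` wrapper itself is hypothetical, not code from this PR:

```cpp
// Usage sketch for the singletons declared above. The include path and
// singleton APIs come from this header; `example` is a hypothetical caller.
#include <ATen/native/mkldnn/xpu/detail/oneDNNContext.h>

void example() {
  at::Device dev(at::kXPU, /*index=*/0);
  // Engine tied to the given XPU device, shared process-wide.
  dnnl::engine& eng =
      at::native::onednn::GpuEngineManager::Instance().get_engine(dev);
  // Stream on which oneDNN primitives for that device execute.
  dnnl::stream stream =
      at::native::onednn::GpuStreamManager::Instance().get_stream();
  (void)eng;
  (void)stream;
}
```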
2 changes: 1 addition & 1 deletion aten/src/ATen/xpu/XPUGeneratorImpl.h
@@ -4,7 +4,7 @@

namespace at {

-struct TORCH_API XPUGeneratorImpl : public GeneratorImpl {
+struct TORCH_XPU_API XPUGeneratorImpl : public GeneratorImpl {
// Constructors
XPUGeneratorImpl(DeviceIndex device_index = -1);
~XPUGeneratorImpl() override = default;
12 changes: 12 additions & 0 deletions aten/src/ATen/xpu/detail/XPUHooks.cpp
@@ -25,7 +25,13 @@ std::string XPUHooks::showConfig() const {

int32_t XPUHooks::getGlobalIdxFromDevice(const at::Device& device) const {
TORCH_CHECK(device.is_xpu(), "Only the XPU device type is expected.");
+#ifdef _WIN32
+  TORCH_CHECK(
+      false,
+      "Default context is not supported on XPU on Windows. So we can NOT find its global index of the ATen device.");
+#else
   return at::xpu::getGlobalIdxFromDevice(device.index());
+#endif
}

Generator XPUHooks::getXPUGenerator(DeviceIndex device_index) const {
@@ -38,7 +44,13 @@ const Generator& XPUHooks::getDefaultXPUGenerator(
}

Device XPUHooks::getDeviceFromPtr(void* data) const {
+#ifdef _WIN32
+  TORCH_CHECK(
+      false,
+      "Default context is not supported on XPU on Windows. So we can NOT find the ATen device of a pointer.");
+#else
   return at::xpu::getDeviceFromPtr(data);
+#endif
}

c10::DeviceIndex XPUHooks::getNumGPUs() const {
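Both guarded hooks lean on the oneAPI default-context extension: when all Intel GPUs of a platform share one context, a raw USM pointer can be mapped back to its device without knowing which queue allocated it. A sketch of what the Linux path relies on, in plain SYCL; `device_of` is a hypothetical helper and the extension call assumes DPC++:

```cpp
// Sketch of the default-context lookup the Linux branch depends on.
// Assumes the DPC++ ext_oneapi_get_default_context() extension;
// device_of is a hypothetical helper, not part of this PR.
#include <sycl/sycl.hpp>

sycl::device device_of(const void* ptr) {
  // One shared context per platform means one lookup covers every GPU.
  sycl::context ctx = sycl::platform(sycl::gpu_selector_v)
                          .ext_oneapi_get_default_context();
  // Map the USM allocation back to the device it lives on.
  return sycl::get_pointer_device(ptr, ctx);
}
```

Without a per-platform default context on Windows there is no single context to query, which is why both hooks fail loudly instead of guessing.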
6 changes: 6 additions & 0 deletions c10/util/Float8_fnuz_cvt.h
@@ -4,6 +4,10 @@

#include <cstdint>

+#if defined(SYCL_LANGUAGE_VERSION)
+#include <sycl/sycl.hpp>
+#endif

namespace c10::detail {

/*
@@ -33,6 +37,8 @@ inline C10_HOST_DEVICE float fp8_fnuz_to_fp32_value(uint8_t x) {
// guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
uint32_t renorm_shift = __clz(mantissa);
+#elif defined(__SYCL_DEVICE_ONLY__)
+  uint32_t renorm_shift = sycl::clz(mantissa);
#elif defined(_MSC_VER)
unsigned long nonsign_bsr;
_BitScanReverse(&nonsign_bsr, (unsigned long)mantissa);
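The new branch gives SYCL device code the same count-leading-zeros renormalization that the CUDA/HIP branch gets from `__clz`. For intuition, a host-side C++20 sketch of the same computation (illustrative only; the real file stays preprocessor-selected):

```cpp
// Host-side C++20 equivalent of the renorm_shift above: count the
// leading zeros of the subnormal mantissa so it can be shifted into
// normalized position. Produces the same value as __clz / sycl::clz.
#include <bit>
#include <cstdint>

inline uint32_t renorm_shift(uint32_t mantissa) {
  return static_cast<uint32_t>(std::countl_zero(mantissa));
}
```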
10 changes: 10 additions & 0 deletions c10/xpu/CMakeLists.txt
@@ -8,6 +8,12 @@ if(NOT BUILD_LIBTORCHLESS)
find_library(C10_XPU_LIB c10_xpu PATHS $ENV{LIBTORCH_LIB_PATH} NO_DEFAULT_PATH)
endif()

+# ---[ Configure macro file.
+set(C10_XPU_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # used in xpu_cmake_macros.h.in
+configure_file(
+    ${CMAKE_CURRENT_LIST_DIR}/impl/xpu_cmake_macros.h.in
+    ${CMAKE_BINARY_DIR}/c10/xpu/impl/xpu_cmake_macros.h)

set(C10_XPU_SRCS
XPUCachingAllocator.cpp
XPUFunctions.cpp
@@ -50,3 +56,7 @@ foreach(file ${C10_XPU_HEADERS})
get_filename_component(dir ${file} DIRECTORY)
install(FILES ${file} DESTINATION include/c10/xpu/${dir})
endforeach()

+if(MSVC AND C10_XPU_BUILD_SHARED_LIBS)
+  install(FILES $<TARGET_PDB_FILE:c10_xpu> DESTINATION lib OPTIONAL)
+endif()
12 changes: 10 additions & 2 deletions c10/xpu/XPUFunctions.cpp
@@ -2,8 +2,6 @@
#include <c10/util/Exception.h>
#include <c10/xpu/XPUFunctions.h>

-#include <sys/wait.h>
-#include <unistd.h>
#include <vector>

namespace c10::xpu {
@@ -53,10 +51,20 @@ inline void initGlobalDevicePoolState() {
return;
}

+#ifdef _WIN32
+  // default context feature is disabled by default on Windows.
+  std::vector<sycl::device> deviceList;
+  for (auto it = gDevicePool.devices.begin(); it != gDevicePool.devices.end();
+       ++it) {
+    deviceList.push_back(*(*it));
+  }
+  gDevicePool.context = std::make_unique<sycl::context>(deviceList);
+#else
// The default context is utilized for each Intel GPU device, allowing the
// retrieval of the context from any GPU device.
gDevicePool.context = std::make_unique<sycl::context>(
gDevicePool.devices[0]->get_platform().ext_oneapi_get_default_context());
+#endif
}

inline void initDevicePoolCallOnce() {
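On Windows the new branch replaces the unavailable default-context extension with a single explicitly constructed `sycl::context` spanning every discovered device, so downstream code can keep assuming one shared context. The two strategies condensed into plain SYCL; `make_pool_context` is a hypothetical helper, not PR code:

```cpp
// Condensed sketch of the two context strategies above.
#include <sycl/sycl.hpp>

#include <vector>

sycl::context make_pool_context(const std::vector<sycl::device>& gpus) {
#ifdef _WIN32
  // Windows: the default-context feature is disabled by default, so
  // build one context that explicitly spans all discovered devices.
  return sycl::context(gpus);
#else
  // Linux: all devices of a platform share the default context, so any
  // device can hand it back (DPC++ extension).
  return gpus.front().get_platform().ext_oneapi_get_default_context();
#endif
}
```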
14 changes: 14 additions & 0 deletions c10/xpu/XPUMacros.h
@@ -1,15 +1,29 @@
#pragma once

+#ifndef C10_USING_CUSTOM_GENERATED_MACROS
+#include <c10/xpu/impl/xpu_cmake_macros.h>
+#endif

// See c10/macros/Export.h for a detailed explanation of what the function
// of these macros are. We need one set of macros for every separate library
// we build.

+#ifdef _WIN32
+#if defined(C10_XPU_BUILD_SHARED_LIBS)
+#define C10_XPU_EXPORT __declspec(dllexport)
+#define C10_XPU_IMPORT __declspec(dllimport)
+#else
+#define C10_XPU_EXPORT
+#define C10_XPU_IMPORT
+#endif
+#else // _WIN32
#if defined(__GNUC__)
#define C10_XPU_EXPORT __attribute__((__visibility__("default")))
#else // defined(__GNUC__)
#define C10_XPU_EXPORT
#endif // defined(__GNUC__)
#define C10_XPU_IMPORT C10_XPU_EXPORT
+#endif // _WIN32

// This one is being used by libc10_xpu.so
#ifdef C10_XPU_BUILD_MAIN_LIB
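On Windows a symbol must be compiled with `__declspec(dllexport)` in the DLL that defines it and seen as `__declspec(dllimport)` everywhere else, while on Linux one default-visibility attribute serves both sides; that asymmetry is exactly what the new branch encodes. A sketch of how the pair presumably resolves through the `C10_XPU_BUILD_MAIN_LIB` switch the header mentions (the declared function is hypothetical):

```cpp
// Sketch: C10_XPU_BUILD_MAIN_LIB is defined only while compiling
// libc10_xpu itself, so one declaration exports from the library and
// imports into every consumer. Mirrors the c10/macros/Export.h scheme.
#ifdef C10_XPU_BUILD_MAIN_LIB
#define C10_XPU_API C10_XPU_EXPORT
#else
#define C10_XPU_API C10_XPU_IMPORT
#endif

C10_XPU_API void example_xpu_entry_point(); // hypothetical declaration
```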
6 changes: 6 additions & 0 deletions c10/xpu/impl/xpu_cmake_macros.h.in
@@ -0,0 +1,6 @@
+#pragma once
+
+// Automatically generated header file for the C10 XPU library. Do not
+// include this file directly. Instead, include c10/xpu/XPUMacros.h
+
+#cmakedefine C10_XPU_BUILD_SHARED_LIBS
11 changes: 9 additions & 2 deletions caffe2/CMakeLists.txt
@@ -1062,8 +1062,15 @@ if(USE_XPU)
message(WARNING "Failed to include ATen XPU implementation target")
else()
target_link_libraries(torch_xpu PRIVATE torch_xpu_ops)
-    target_link_libraries(torch_xpu PRIVATE
-        "-Wl,--whole-archive,\"$<TARGET_FILE:torch_xpu_ops>\" -Wl,--no-whole-archive")
+    if(MSVC)
+      # Windows
+      target_link_libraries(torch_xpu PRIVATE
+          "-WHOLEARCHIVE:\"$<TARGET_FILE:torch_xpu_ops>\"")
+    else()
+      # Linux
+      target_link_libraries(torch_xpu PRIVATE
+          "-Wl,--whole-archive,\"$<TARGET_FILE:torch_xpu_ops>\" -Wl,--no-whole-archive")
+    endif()
endif()
endif()

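Whole-archive linking (`--whole-archive` with GNU ld, `-WHOLEARCHIVE` with MSVC's linker) forces every object in `torch_xpu_ops` into `torch_xpu`, presumably because kernel registrations live in static initializers that an ordinary link would discard as unreferenced. A generic illustration of that pattern; the registrar type and kernel name are invented for the example:

```cpp
// Generic illustration of why whole-archive linking matters: a static
// object whose constructor registers a kernel. If no translation unit
// references this archive member, a normal link drops it and the
// registration never runs; whole-archive linking keeps it alive.
#include <cstdio>

struct KernelRegistrar {
  explicit KernelRegistrar(const char* name) {
    std::printf("registering kernel: %s\n", name); // stand-in for a real registry
  }
};

static KernelRegistrar xpu_add_registrar{"aten::add.Tensor (XPU)"};
```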
4 changes: 2 additions & 2 deletions cmake/Dependencies.cmake
@@ -89,8 +89,8 @@ endif()
if(USE_XPU)
include(${CMAKE_CURRENT_LIST_DIR}/public/xpu.cmake)
if(NOT PYTORCH_FOUND_XPU)
-    # message(WARNING "Not compiling with XPU. Could NOT find SYCL."
-    # "Suppress this warning with -DUSE_XPU=OFF.")
+    message(WARNING "Not compiling with XPU. Could NOT find SYCL."
+        "Suppress this warning with -DUSE_XPU=OFF.")
caffe2_update_option(USE_XPU OFF)
endif()
endif()
19 changes: 12 additions & 7 deletions cmake/Modules/FindMKLDNN.cmake
@@ -21,10 +21,16 @@ IF(NOT MKLDNN_FOUND)

if(USE_XPU) # Build oneDNN GPU library
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-      set(DNNL_HOST_COMPILER "g++")
+      # Linux
+      # g++ is soft linked to /usr/bin/cxx, oneDNN would not treat it as an absolute path
+      set(DNNL_HOST_COMPILER "g++")
+      set(SYCL_CXX_DRIVER "icpx")
+      set(DNNL_LIB_NAME "libdnnl.a")
     else()
-      message(FATAL_ERROR "oneDNN library currently only supports GUN g++ compiler for XPU backend")
+      # Windows
+      set(DNNL_HOST_COMPILER "DEFAULT")
+      set(SYCL_CXX_DRIVER "icx")
+      set(DNNL_LIB_NAME "dnnl.lib")
endif()

set(DNNL_MAKE_COMMAND "cmake" "--build" ".")
@@ -41,8 +47,7 @@
PREFIX ${XPU_MKLDNN_DIR_PREFIX}
BUILD_IN_SOURCE 0
CMAKE_ARGS -DCMAKE_C_COMPILER=icx
-        -DCMAKE_CXX_COMPILER=icpx
-        -DCMAKE_CXX_COMPILER_ID=IntelLLVM
+        -DCMAKE_CXX_COMPILER=${SYCL_CXX_DRIVER}
-DDNNL_GPU_RUNTIME=SYCL
-DDNNL_CPU_RUNTIME=THREADPOOL
-DDNNL_BUILD_TESTS=OFF
@@ -52,20 +57,20 @@
-DDNNL_DPCPP_HOST_COMPILER=${DNNL_HOST_COMPILER} # Use global cxx compiler as host compiler
-G ${CMAKE_GENERATOR} # Align Generator to Torch
BUILD_COMMAND ${DNNL_MAKE_COMMAND}
BUILD_BYPRODUCTS "xpu_mkldnn_proj-prefix/src/xpu_mkldnn_proj-build/src/libdnnl.a"
BUILD_BYPRODUCTS "xpu_mkldnn_proj-prefix/src/xpu_mkldnn_proj-build/src/${DNNL_LIB_NAME}"
INSTALL_COMMAND ""
)

ExternalProject_Get_Property(xpu_mkldnn_proj BINARY_DIR)
set(__XPU_MKLDNN_BUILD_DIR ${BINARY_DIR})
-  set(XPU_MKLDNN_LIBRARIES ${__XPU_MKLDNN_BUILD_DIR}/src/libdnnl.a)
+  set(XPU_MKLDNN_LIBRARIES ${__XPU_MKLDNN_BUILD_DIR}/src/${DNNL_LIB_NAME})
set(XPU_MKLDNN_INCLUDE ${__XPU_MKLDNN_BUILD_DIR}/include)
# This target would be further linked to libtorch_xpu.so.
# The libtorch_xpu.so would contain Conv&GEMM operators that depend on
# oneDNN primitive implementations inside libdnnl.a.
add_library(xpu_mkldnn INTERFACE)
add_dependencies(xpu_mkldnn xpu_mkldnn_proj)
-  target_link_libraries(xpu_mkldnn INTERFACE ${__XPU_MKLDNN_BUILD_DIR}/src/libdnnl.a)
+  target_link_libraries(xpu_mkldnn INTERFACE ${__XPU_MKLDNN_BUILD_DIR}/src/${DNNL_LIB_NAME})
target_include_directories(xpu_mkldnn INTERFACE ${XPU_MKLDNN_INCLUDE})
endif()

17 changes: 17 additions & 0 deletions cmake/Modules/FindSYCLToolkit.cmake
@@ -55,6 +55,23 @@ find_library(
HINTS ${SYCL_LIBRARY_DIR}
NO_DEFAULT_PATH
)
+# On Windows, currently there's no sycl.lib. Only sycl7.lib with version suffix,
+# where the current version of the SYCL runtime is 7.
+# Until oneAPI adds support to sycl.lib without the version suffix,
+# sycl_runtime_version needs to be hardcoded and uplifted when SYCL runtime version uplifts.
+# TODO: remove this when sycl.lib is supported on Windows
+if(WIN32)
+  set(sycl_runtime_version 7)
+  find_library(
+    SYCL_LIBRARY
+    NAMES "sycl${sycl_runtime_version}"
+    HINTS ${SYCL_LIBRARY_DIR}
+    NO_DEFAULT_PATH
+  )
+  if(SYCL_LIBRARY STREQUAL "SYCL_LIBRARY-NOTFOUND")
+    message(FATAL_ERROR "Cannot find a SYCL library on Windows")
+  endif()
+endif()

find_library(
OCL_LIBRARY
6 changes: 6 additions & 0 deletions torch/csrc/xpu/Module.cpp
@@ -11,24 +11,30 @@
#include <torch/csrc/utils/python_numbers.h>
#include <torch/csrc/utils/python_strings.h>

+#ifndef WIN32
 #include <pthread.h>
+#endif

using namespace torch;

static bool in_bad_fork = false; // True for children forked after xpu init

+#ifndef WIN32
// Called in the forked child if xpu has already been initialized
static void forked_child() {
in_bad_fork = true;
torch::utils::set_requires_device_init(at::kXPU, true);
}
+#endif

// Should be called before the first xpu call. It is mainly called in lazy_init.
// Note: This is distinct from initExtension because a stub xpu implementation
// has some working functions (e.g. device_count) but cannot fully initialize.
static void poison_fork() {
+#ifndef WIN32
static c10::once_flag flag;
c10::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
+#endif
}

// XPU management methods
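`pthread_atfork` is POSIX-only and Windows has no `fork`, so the new `WIN32` guards simply compile the poisoning machinery out. A standalone sketch of the pattern; it uses `_WIN32` and `std::call_once` in place of the file's `WIN32` and `c10::call_once`:

```cpp
// Standalone sketch of the poison-fork pattern above: register an
// atfork handler once so a child forked after device init knows its
// device state is invalid.
#ifndef _WIN32
#include <pthread.h>

#include <mutex>

static bool in_bad_fork = false; // true in children forked after init

static void forked_child() {
  in_bad_fork = true; // device handles do not survive fork()
}

static void poison_fork() {
  static std::once_flag flag;
  std::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
}
#endif
```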
