-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[Offload] Add MPI Proxy Plugin #114574
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[Offload] Add MPI Proxy Plugin #114574
Conversation
@llvm/pr-subscribers-offload Author: Jhonatan Cléto (cl3to) Changes: This patch introduces a new Offload plugin built on the existing interface to enable the offloading of computational tasks to remote accelerator devices via an MPI Proxy Layer. It improves the efficiency of kernel launches and data transfers by utilizing an event-driven architecture with non-blocking MPI communications and C++20 coroutines, facilitating asynchronous operations. With this new MPI Plugin, users can offload OpenMP target regions to remote devices seamlessly, as if they were local. Any remote device compatible with an Offload Plugin can be used with the MPI Plugin. Currently, we have tested this plugin with X86_64 and CUDA devices, but it is expected to work with AMD GPUs as well. Currently, the plugin lacks support for the following features:
Programs using the MPI Plugin are compiled like standard OpenMP target programs with clang, as shown in this example: `clang -fopenmp -fopenmp-targets=nvptx64 -o app app.c`. The MPI Plugin uses a binary, `llvm-offload-mpi-proxy-device`, to execute target operations on the remote device; the program must therefore be launched with an MPI launcher, e.g.: `mpirun -np N llvm-offload-mpi-proxy-device : -np 1 ./app`. Note: only one instance of the OpenMP program (`-np 1 ./app`) should be created. At runtime, the number of devices returned by the `omp_get_num_devices()` call will be the sum of local devices and all devices available in each `llvm-offload-mpi-proxy-device` instance. To compile the plugin and run the test suite, an environment with an installed MPI implementation (such as OpenMPI or MPICH) is required. We currently lack resources to add a dedicated Buildbot for this plugin, so we request that existing Buildbots be updated to support it. Patch is 165.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114574.diff 33 Files Affected:
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 9b771d1116ee38..e01070cca652df 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -139,7 +139,7 @@ if(DEFINED LIBOMPTARGET_BUILD_CUDA_PLUGIN OR
message(WARNING "Option removed, use 'LIBOMPTARGET_PLUGINS_TO_BUILD' instead")
endif()
-set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host)
+set(LIBOMPTARGET_ALL_PLUGIN_TARGETS mpi amdgpu cuda host)
set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
"Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
@@ -194,8 +194,10 @@ set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-g
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu-LTO")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-unknown-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-unknown-linux-gnu-LTO")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-unknown-linux-gnu-mpi")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-LTO")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-mpi")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-JIT-LTO")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} s390x-ibm-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} s390x-ibm-linux-gnu-LTO")
@@ -341,6 +343,8 @@ set(LIBOMPTARGET_LLVM_LIBRARY_DIR "${LLVM_LIBRARY_DIR}" CACHE STRING
set(LIBOMPTARGET_LLVM_LIBRARY_INTDIR "${LIBOMPTARGET_INTDIR}" CACHE STRING
"Path to folder where intermediate libraries will be output")
+set(LIBOMPTARGET_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
+
# Build offloading plugins and device RTLs if they are available.
add_subdirectory(plugins-nextgen)
add_subdirectory(DeviceRTL)
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 41cc0f286a581f..75fec516de9b88 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1208,130 +1208,141 @@ struct GenericPluginTy {
/// Returns non-zero if the \p Image is compatible with the plugin. This
/// function does not require the plugin to be initialized before use.
- int32_t is_plugin_compatible(__tgt_device_image *Image);
+ virtual int32_t is_plugin_compatible(__tgt_device_image *Image);
/// Returns non-zero if the \p Image is compatible with the device.
- int32_t is_device_compatible(int32_t DeviceId, __tgt_device_image *Image);
+ virtual int32_t is_device_compatible(int32_t DeviceId,
+ __tgt_device_image *Image);
/// Returns non-zero if the plugin device has been initialized.
- int32_t is_device_initialized(int32_t DeviceId) const;
+ virtual int32_t is_device_initialized(int32_t DeviceId) const;
/// Initialize the device inside of the plugin.
- int32_t init_device(int32_t DeviceId);
+ virtual int32_t init_device(int32_t DeviceId);
/// Return the number of devices this plugin can support.
- int32_t number_of_devices();
+ virtual int32_t number_of_devices();
/// Returns non-zero if the data can be exchanged between the two devices.
- int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId);
+ virtual int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId);
/// Initializes the record and replay mechanism inside the plugin.
- int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
- void *VAddr, bool isRecord, bool SaveOutput,
- uint64_t &ReqPtrArgOffset);
+ virtual int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
+ void *VAddr, bool isRecord,
+ bool SaveOutput,
+ uint64_t &ReqPtrArgOffset);
/// Loads the associated binary into the plugin and returns a handle to it.
- int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
- __tgt_device_binary *Binary);
+ virtual int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
+ __tgt_device_binary *Binary);
/// Allocates memory that is accessively to the given device.
- void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, int32_t Kind);
+ virtual void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
+ int32_t Kind);
/// Deallocates memory on the given device.
- int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind);
+ virtual int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind);
/// Locks / pins host memory using the plugin runtime.
- int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
- void **LockedPtr);
+ virtual int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
+ void **LockedPtr);
/// Unlocks / unpins host memory using the plugin runtime.
- int32_t data_unlock(int32_t DeviceId, void *Ptr);
+ virtual int32_t data_unlock(int32_t DeviceId, void *Ptr);
/// Notify the runtime about a new mapping that has been created outside.
- int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size);
+ virtual int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr,
+ int64_t Size);
/// Notify the runtime about a mapping that has been deleted.
- int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr);
+ virtual int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr);
/// Copy data to the given device.
- int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
- int64_t Size);
+ virtual int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
+ int64_t Size);
/// Copy data to the given device asynchronously.
- int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr,
- int64_t Size, __tgt_async_info *AsyncInfoPtr);
+ virtual int32_t data_submit_async(int32_t DeviceId, void *TgtPtr,
+ void *HstPtr, int64_t Size,
+ __tgt_async_info *AsyncInfoPtr);
/// Copy data from the given device.
- int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
- int64_t Size);
+ virtual int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
+ int64_t Size);
/// Copy data from the given device asynchornously.
- int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr,
- int64_t Size, __tgt_async_info *AsyncInfoPtr);
+ virtual int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr,
+ void *TgtPtr, int64_t Size,
+ __tgt_async_info *AsyncInfoPtr);
/// Exchange memory addresses between two devices.
- int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr, int32_t DstDeviceId,
- void *DstPtr, int64_t Size);
+ virtual int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr,
+ int32_t DstDeviceId, void *DstPtr,
+ int64_t Size);
/// Exchange memory addresses between two devices asynchronously.
- int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
- int DstDeviceId, void *DstPtr, int64_t Size,
- __tgt_async_info *AsyncInfo);
+ virtual int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
+ int DstDeviceId, void *DstPtr,
+ int64_t Size,
+ __tgt_async_info *AsyncInfo);
/// Begin executing a kernel on the given device.
- int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
- ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
- __tgt_async_info *AsyncInfoPtr);
+ virtual int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
+ void **TgtArgs, ptrdiff_t *TgtOffsets,
+ KernelArgsTy *KernelArgs,
+ __tgt_async_info *AsyncInfoPtr);
/// Synchronize an asyncrhonous queue with the plugin runtime.
- int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
+ virtual int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
/// Query the current state of an asynchronous queue.
- int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
+ virtual int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
/// Prints information about the given devices supported by the plugin.
- void print_device_info(int32_t DeviceId);
+ virtual void print_device_info(int32_t DeviceId);
/// Creates an event in the given plugin if supported.
- int32_t create_event(int32_t DeviceId, void **EventPtr);
+ virtual int32_t create_event(int32_t DeviceId, void **EventPtr);
/// Records an event that has occurred.
- int32_t record_event(int32_t DeviceId, void *EventPtr,
- __tgt_async_info *AsyncInfoPtr);
+ virtual int32_t record_event(int32_t DeviceId, void *EventPtr,
+ __tgt_async_info *AsyncInfoPtr);
/// Wait until an event has occurred.
- int32_t wait_event(int32_t DeviceId, void *EventPtr,
- __tgt_async_info *AsyncInfoPtr);
+ virtual int32_t wait_event(int32_t DeviceId, void *EventPtr,
+ __tgt_async_info *AsyncInfoPtr);
/// Syncrhonize execution until an event is done.
- int32_t sync_event(int32_t DeviceId, void *EventPtr);
+ virtual int32_t sync_event(int32_t DeviceId, void *EventPtr);
/// Remove the event from the plugin.
- int32_t destroy_event(int32_t DeviceId, void *EventPtr);
+ virtual int32_t destroy_event(int32_t DeviceId, void *EventPtr);
/// Remove the event from the plugin.
void set_info_flag(uint32_t NewInfoLevel);
/// Creates an asynchronous queue for the given plugin.
- int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr);
+ virtual int32_t init_async_info(int32_t DeviceId,
+ __tgt_async_info **AsyncInfoPtr);
/// Creates device information to be used for diagnostics.
- int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo,
- const char **ErrStr);
+ virtual int32_t init_device_info(int32_t DeviceId,
+ __tgt_device_info *DeviceInfo,
+ const char **ErrStr);
/// Sets the offset into the devices for use by OMPT.
int32_t set_device_identifier(int32_t UserId, int32_t DeviceId);
/// Returns if the plugin can support auotmatic copy.
- int32_t use_auto_zero_copy(int32_t DeviceId);
+ virtual int32_t use_auto_zero_copy(int32_t DeviceId);
/// Look up a global symbol in the given binary.
- int32_t get_global(__tgt_device_binary Binary, uint64_t Size,
- const char *Name, void **DevicePtr);
+ virtual int32_t get_global(__tgt_device_binary Binary, uint64_t Size,
+ const char *Name, void **DevicePtr);
/// Look up a kernel function in the given binary.
- int32_t get_function(__tgt_device_binary Binary, const char *Name,
- void **KernelPtr);
+ virtual int32_t get_function(__tgt_device_binary Binary, const char *Name,
+ void **KernelPtr);
private:
/// Indicates if the platform runtime has been fully initialized.
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index fe296b77c7d557..c72a0770af23cf 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -43,7 +43,7 @@
#endif
// The number of devices in this plugin.
-#define NUM_DEVICES 4
+#define NUM_DEVICES 1
namespace llvm {
namespace omp {
diff --git a/offload/plugins-nextgen/mpi/CMakeLists.txt b/offload/plugins-nextgen/mpi/CMakeLists.txt
new file mode 100644
index 00000000000000..b64b2218048aa8
--- /dev/null
+++ b/offload/plugins-nextgen/mpi/CMakeLists.txt
@@ -0,0 +1,134 @@
+# Looking for MPI...
+find_package(MPI QUIET)
+
+if(NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+ message(STATUS "Not building MPI offloading plugin: only support MPI in Linux x86_64 or ppc64le hosts.")
+ return()
+elseif(NOT MPI_CXX_FOUND)
+ message(STATUS "Not building MPI offloading plugin: MPI not found in system.")
+ return()
+endif()
+
+message(STATUS "Building MPI Proxy offloading plugin.")
+
+# Event System
+add_subdirectory(event_system)
+
+# MPI Plugin
+
+# Create the library and add the default arguments.
+add_target_library(omptarget.rtl.mpi MPI)
+
+target_sources(omptarget.rtl.mpi PRIVATE
+ src/rtl.cpp
+)
+
+target_link_libraries(omptarget.rtl.mpi PRIVATE
+ EventSystem
+)
+
+# Add include directories
+target_include_directories(omptarget.rtl.mpi PRIVATE
+ ${LIBOMPTARGET_INCLUDE_DIR})
+
+# Set C++20 as the target standard for this plugin.
+set_target_properties(omptarget.rtl.mpi
+ PROPERTIES
+ CXX_STANDARD 20
+ CXX_STANDARD_REQUIRED ON)
+
+
+# Configure testing for the MPI plugin.
+list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.mpi")
+# Report to the parent scope that we are building a plugin for MPI.
+set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
+
+# Define the target specific triples and ELF machine values.
+set(LIBOMPTARGET_SYSTEM_TARGETS
+ "${LIBOMPTARGET_SYSTEM_TARGETS} x86_64-pc-linux-gnu-mpi nvptx64-nvidia-cuda-mpi" PARENT_SCOPE)
+
+# Remote Plugin Manager
+message(STATUS "Building the llvm-offload-mpi-proxy-device")
+
+set(LIBOMPTARGET_ALL_REMOTE_PLUGIN_TARGETS amdgpu cuda host)
+set(LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD "all" CACHE STRING
+ "Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
+
+if(LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD STREQUAL "all")
+ set(LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_REMOTE_PLUGIN_TARGETS})
+endif()
+
+if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux" AND
+ "host" IN_LIST LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
+ message(STATUS "Not building remote host plugin: only Linux systems are supported")
+ list(REMOVE_ITEM LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD "host")
+endif()
+if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$"
+ AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+ if("amdgpu" IN_LIST LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
+ message(STATUS "Not building remote AMDGPU plugin: only support AMDGPU in "
+ "Linux x86_64, ppc64le, or aarch64 hosts")
+ list(REMOVE_ITEM LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD "amdgpu")
+ endif()
+ if("cuda" IN_LIST LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
+ message(STATUS "Not building remote CUDA plugin: only support CUDA in "
+ "Linux x86_64, ppc64le, or aarch64 hosts")
+ list(REMOVE_ITEM LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD "cuda")
+ endif()
+endif()
+if("mpi" IN_LIST LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
+ message(STATUS "It is not possible to build the mpi plugin inside "
+ "the remote proxy device")
+ list(REMOVE_ITEM LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD "mpi")
+endif()
+
+message(STATUS "Building the MPI Plugin with support for remote offloading to "
+ "the \"${LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD}\" plugins")
+
+set(REMOTE_MPI_ENUM_PLUGIN_TARGETS "")
+foreach(plugin IN LISTS LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
+ set(REMOTE_MPI_ENUM_PLUGIN_TARGETS
+ "${REMOTE_MPI_ENUM_PLUGIN_TARGETS}PLUGIN_TARGET(${plugin})\n")
+endforeach()
+string(STRIP ${REMOTE_MPI_ENUM_PLUGIN_TARGETS} REMOTE_MPI_ENUM_PLUGIN_TARGETS)
+configure_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/RemoteTargets.def.in
+ ${LIBOMPTARGET_BINARY_INCLUDE_DIR}/Shared/RemoteTargets.def
+)
+
+llvm_add_tool(OPENMP llvm-offload-mpi-proxy-device
+ src/ProxyDevice.cpp
+ src/RemotePluginManager.cpp
+ ${LIBOMPTARGET_SRC_DIR}/OpenMP/OMPT/Callback.cpp
+)
+
+llvm_update_compile_flags(llvm-offload-mpi-proxy-device)
+
+target_link_libraries(llvm-offload-mpi-proxy-device PRIVATE
+ EventSystem
+ LLVMSupport
+ omp
+)
+
+add_dependencies(llvm-offload-mpi-proxy-device omp)
+
+target_include_directories(llvm-offload-mpi-proxy-device PRIVATE
+ ${LIBOMPTARGET_INCLUDE_DIR}
+ ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+ ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
+)
+
+foreach(plugin IN LISTS LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
+ target_link_libraries(llvm-offload-mpi-proxy-device PRIVATE omptarget.rtl.${plugin})
+ add_dependencies(llvm-offload-mpi-proxy-device omptarget.rtl.${plugin})
+endforeach()
+
+# Set C++20 as the target standard for this plugin.
+set_target_properties(llvm-offload-mpi-proxy-device
+ PROPERTIES
+ CXX_STANDARD 20
+ CXX_STANDARD_REQUIRED ON)
+
+target_compile_definitions(llvm-offload-mpi-proxy-device PRIVATE
+ TARGET_NAME=llvm-offload-mpi-proxy-device
+ DEBUG_PREFIX="MPIProxyDevice")
diff --git a/offload/plugins-nextgen/mpi/event_system/CMakeLists.txt b/offload/plugins-nextgen/mpi/event_system/CMakeLists.txt
new file mode 100644
index 00000000000000..32a9f9b79423e1
--- /dev/null
+++ b/offload/plugins-nextgen/mpi/event_system/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Build EventSystem
+add_library(EventSystem OBJECT
+ EventSystem.cpp
+)
+
+target_include_directories(EventSystem PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
+ ${LIBOMPTARGET_INCLUDE_DIR}
+)
+
+target_link_libraries(EventSystem PRIVATE
+ MPI::MPI_CXX
+ LLVMSupport
+)
+
+target_compile_options(EventSystem PUBLIC ${offload_compile_flags})
+target_link_options(EventSystem PUBLIC ${offload_link_flags})
+
+set_target_properties(EventSystem PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+# Set C++20 as the target standard for this plugin.
+set_target_properties(EventSystem
+ PROPERTIES
+ CXX_STANDARD 20
+ CXX_STANDARD_REQUIRED ON)
+
+target_compile_definitions(EventSystem PRIVATE
+ DEBUG_PREFIX="EventSystem")
\ No newline at end of file
diff --git a/offload/plugins-nextgen/mpi/event_system/EventSystem.cpp b/offload/plugins-nextgen/mpi/event_system/EventSystem.cpp
new file mode 100644
index 00000000000000..ab59e3da837fa5
--- /dev/null
+++ b/offload/plugins-nextgen/mpi/event_system/EventSystem.cpp
@@ -0,0 +1,848 @@
+//===------ event_system.cpp - Concurrent MPI communication -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the MPI Event System used by the MPI
+// target runtime for concurrent communication.
+//
+//===----------------------------------------------------------------------===//
+
+#include "EventSystem.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <memory>
+
+#include <mpi.h>
+#include <unistd.h>
+
+#include "Shared/APITypes.h"
+#include "Shared/Debug.h"
+#include "Shared/EnvironmentVar.h"
+#include "Shared/Utils.h"
+#include "omptarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Error.h"
+
+#include "llvm/Support/DynamicLibrary.h"
+
+#define CHECK(expr, msg, ...) ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is obviously a huge patch, I don't know what would be required to run a bot on this. Maybe @jplehr can chime in.
@@ -6,6 +6,7 @@
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: nvptx64-nvidia-cuda
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
// UNSUPPORTED: nvptx64-nvidia-cuda-mpi
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know it's building off of existing stuff, but this really isn't the way. We need a single feature called `mpi`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we could have the `mpi` feature, but in cases where only one or a few of the MPI targets lack support for the test, wouldn't we run into the same problem?
Please precommit the format lines that changed in a separate NFC. |
We are gonna need someone with MPI expertise. @lyu Could you please help to take a look? |
We won't be able to provide coverage for an MPI plugin. So, if we want this tested, then someone else would need to chime-in and provide a bot for it. |
Yes, we know... We are working on a solution with @jdoerfert and @efwright |
4d61a9b
to
310f525
Compare
@@ -194,8 +194,10 @@
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu-LTO")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-unknown-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-unknown-linux-gnu-LTO")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-unknown-linux-gnu-mpi")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need to know both architectures for this? I figured that would be opaque since it's just a separate plugin.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm still curious why the MPI plugin needs to target a GPU triple.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We're not using this as a compilation triple; it's only used to add new tests that invoke `mpirun` in LIT, and only if the MPI plugin was built. For reference, see this line in `lit.cfg`. Do you have any suggestion for a cleaner or more appropriate way to handle this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm confused what this is testing. The target regions get lowered to MPI calls? Where does a GPU come in? The tests basically just make sure that the target region executes properly, we can just make that agnostic to the underlying device that it actually executes on. Honestly a lot of this stuff should be reworked but that's time consuming.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
MPI is only used for the communication between the host and the devices but the target region can be lowered either to x86_64 and executed on remote CPUs or lowered to nvptx64 and executed on remote GPUs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So we are testing the support of the x86 and CUDA plugins with remote CPU/GPU using MPI as the communication layer
310f525
to
09dcc5b
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
Co-authored-by: Guilherme Valarini <guilherme.a.valarini@gmail.com>
09dcc5b
to
6f7ca4c
Compare
This patch introduces a new Offload plugin built on the existing interface to enable the offloading of computational tasks to remote accelerator devices via an MPI Proxy Layer. It improves the efficiency of kernel launches and data transfers by utilizing an event-driven architecture with non-blocking MPI communications and C++20 coroutines, facilitating asynchronous operations.
With this new MPI Plugin, users can offload OpenMP target regions to remote devices seamlessly, as if they were local. Any remote device compatible with an Offload Plugin can be used with the MPI Plugin. Currently, we have tested this plugin with X86_64 and CUDA devices, but it is expected to work with AMD GPUs as well.
Currently, the plugin lacks support for the following features:
Programs using the MPI Plugin are compiled like standard OpenMP target programs with clang, as shown in this example:
The MPI Plugin uses a binary, `llvm-offload-mpi-proxy-device`, to execute target operations on the remote device. Thus, to offload tasks to an MPI device, the program must be executed with the Single Program Multiple Data (SPMD) model of an MPI launcher, as shown here: `mpirun -np N llvm-offload-mpi-proxy-device : -np 1 ./app`
Note: Only one instance of the OpenMP program (`-np 1 ./app`) should be created. If multiple instances are launched, the plugin will not function correctly. Additionally, due to a design constraint, the host process (`app`) must have the rank `WorldSize - 1` for MPI communication to work correctly. Consequently, it's essential to execute the `mpirun` command in the order shown in the previous example.
At runtime, the number of devices returned by the `omp_get_num_devices()` call will be the sum of local devices and all devices available in each `llvm-offload-mpi-proxy-device` instance.
To compile the plugin and run the test suite, an environment with an installed MPI implementation (such as OpenMPI or MPICH) is required.
We currently lack resources to add a dedicated Buildbot for this plugin, so we request that existing Buildbots be updated to support it.
This patch supersedes #90890, which can now be closed. Unlike the previous version, which implemented a plugin to offload tasks to remote CPUs using MPI, this version functions as a proxy for other plugins.