llvm · fabianmcg · Aug 20, 2024 · Aug 8, 2024 · Aug 19, 2024
diff --git a/llvm/include/llvm/Frontend/Offloading/Utility.h b/llvm/include/llvm/Frontend/Offloading/Utility.h
@@ -9,8 +9,14 @@
 #ifndef LLVM_FRONTEND_OFFLOADING_UTILITY_H
 #define LLVM_FRONTEND_OFFLOADING_UTILITY_H
 
+#include <cstdint>
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Object/OffloadBinary.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBufferRef.h"
 
 namespace llvm {
 namespace offloading {
@@ -73,6 +79,60 @@ getOffloadingEntryInitializer(Module &M, Constant *Addr, StringRef Name,
 std::pair<GlobalVariable *, GlobalVariable *>
 getOffloadEntryArray(Module &M, StringRef SectionName);
 
+namespace amdgpu {
+/// Check if an image is compatible with current system's environment. The
+/// system environment is given as a 'target-id' which has the form:
+///
+/// <target-id> := <processor> ( ":" <target-feature> ( "+" | "-" ) )*
+///
+/// If a feature is not specific as '+' or '-' it is assumed to be in an 'any'
+/// and is compatible with either '+' or '-'. The HSA runtime returns this
+/// information using the target-id, while we use the ELF header to determine
+/// these features.
+bool isImageCompatibleWithEnv(StringRef ImageArch, uint32_t ImageFlags,
+                              StringRef EnvTargetID);
+
+/// Struct for holding metadata related to AMDGPU kernels, for more information
+/// about the metadata and its meaning see:
+/// https://llvm.org/docs/AMDGPUUsage.html#code-object-v3
+struct AMDGPUKernelMetaData {
+  /// Constant indicating that a value is invalid.
+  static constexpr uint32_t KInvalidValue =
+      std::numeric_limits<uint32_t>::max();
+  /// The amount of group segment memory required by a work-group in bytes.
+  uint32_t GroupSegmentList = KInvalidValue;
+  /// The amount of fixed private address space memory required for a work-item
+  /// in bytes.
+  uint32_t PrivateSegmentSize = KInvalidValue;
+  /// Number of scalar registers required by a wavefront.
+  uint32_t SGPRCount = KInvalidValue;
+  /// Number of vector registers required by each work-item.
+  uint32_t VGPRCount = KInvalidValue;
+  /// Number of stores from a scalar register to a register allocator created
+  /// spill location.
+  uint32_t SGPRSpillCount = KInvalidValue;
+  /// Number of stores from a vector register to a register allocator created
+  /// spill location.
+  uint32_t VGPRSpillCount = KInvalidValue;
+  /// Number of accumulator registers required by each work-item.
+  uint32_t AGPRCount = KInvalidValue;
+  /// Corresponds to the OpenCL reqd_work_group_size attribute.
+  uint32_t RequestedWorkgroupSize[3] = {KInvalidValue, KInvalidValue,
+                                        KInvalidValue};
+  /// Corresponds to the OpenCL work_group_size_hint attribute.
+  uint32_t WorkgroupSizeHint[3] = {KInvalidValue, KInvalidValue, KInvalidValue};
+  /// Wavefront size.
+  uint32_t WavefrontSize = KInvalidValue;
+  /// Maximum flat work-group size supported by the kernel in work-items.
+  uint32_t MaxFlatWorkgroupSize = KInvalidValue;
+};
+
+/// Reads AMDGPU specific metadata from the ELF file and propagates the
+/// KernelInfoMap.
+Error getAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
+                                 StringMap<AMDGPUKernelMetaData> &KernelInfoMap,
+                                 uint16_t &ELFABIVersion);
+} // namespace amdgpu
 } // namespace offloading
 } // namespace llvm
 

diff --git a/llvm/lib/Frontend/Offloading/CMakeLists.txt b/llvm/lib/Frontend/Offloading/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMFrontendOffloading
   LINK_COMPONENTS
   Core
   BinaryFormat
+  Object
   Support
   TransformUtils
   TargetParser

diff --git a/llvm/lib/Frontend/Offloading/Utility.cpp b/llvm/lib/Frontend/Offloading/Utility.cpp
@@ -7,10 +7,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/MemoryBufferRef.h"
+#include "llvm/Support/YAMLTraits.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 
 using namespace llvm;
@@ -126,3 +132,229 @@ offloading::getOffloadEntryArray(Module &M, StringRef SectionName) {
 
   return std::make_pair(EntriesB, EntriesE);
 }
+
+bool llvm::offloading::amdgpu::isImageCompatibleWithEnv(StringRef ImageArch,
+                                                        uint32_t ImageFlags,
+                                                        StringRef EnvTargetID) {
+  using namespace llvm::ELF;
+  StringRef EnvArch = EnvTargetID.split(":").first;
+
+  // Trivial check if the base processors match.
+  if (EnvArch != ImageArch)
+    return false;
+
+  // Check if the image is requesting xnack on or off.
+  switch (ImageFlags & EF_AMDGPU_FEATURE_XNACK_V4) {
+  case EF_AMDGPU_FEATURE_XNACK_OFF_V4:
+    // The image is 'xnack-' so the environment must be 'xnack-'.
+    if (!EnvTargetID.contains("xnack-"))
+      return false;
+    break;
+  case EF_AMDGPU_FEATURE_XNACK_ON_V4:
+    // The image is 'xnack+' so the environment must be 'xnack+'.
+    if (!EnvTargetID.contains("xnack+"))
+      return false;
+    break;
+  case EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4:
+  case EF_AMDGPU_FEATURE_XNACK_ANY_V4:
+  default:
+    break;
+  }
+
+  // Check if the image is requesting sramecc on or off.
+  switch (ImageFlags & EF_AMDGPU_FEATURE_SRAMECC_V4) {
+  case EF_AMDGPU_FEATURE_SRAMECC_OFF_V4:
+    // The image is 'sramecc-' so the environment must be 'sramecc-'.
+    if (!EnvTargetID.contains("sramecc-"))
+      return false;
+    break;
+  case EF_AMDGPU_FEATURE_SRAMECC_ON_V4:
+    // The image is 'sramecc+' so the environment must be 'sramecc+'.
+    if (!EnvTargetID.contains("sramecc+"))
+      return false;
+    break;
+  case EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4:
+  case EF_AMDGPU_FEATURE_SRAMECC_ANY_V4:
+    break;
+  }
+
+  return true;
+}
+
+namespace {
+/// Reads the AMDGPU specific per-kernel-metadata from an image.
+class KernelInfoReader {
+public:
+  KernelInfoReader(StringMap<offloading::amdgpu::AMDGPUKernelMetaData> &KIM)
+      : KernelInfoMap(KIM) {}
+
+  /// Process ELF note to read AMDGPU metadata from respective information
+  /// fields.
+  Error processNote(const llvm::object::ELF64LE::Note &Note, size_t Align) {
+    if (Note.getName() != "AMDGPU")
+      return Error::success(); // We are not interested in other things
+
+    assert(Note.getType() == ELF::NT_AMDGPU_METADATA &&
+           "Parse AMDGPU MetaData");
+    auto Desc = Note.getDesc(Align);
+    StringRef MsgPackString =
+        StringRef(reinterpret_cast<const char *>(Desc.data()), Desc.size());
+    msgpack::Document MsgPackDoc;
+    if (!MsgPackDoc.readFromBlob(MsgPackString, /*Multi=*/false))
+      return Error::success();
+
+    AMDGPU::HSAMD::V3::MetadataVerifier Verifier(true);
+    if (!Verifier.verify(MsgPackDoc.getRoot()))
+      return Error::success();
+
+    auto RootMap = MsgPackDoc.getRoot().getMap(true);
+
+    if (auto Err = iterateAMDKernels(RootMap))
+      return Err;
+
+    return Error::success();
+  }
+
+private:
+  /// Extracts the relevant information via simple string look-up in the msgpack
+  /// document elements.
+  Error
+  extractKernelData(msgpack::MapDocNode::MapTy::value_type V,
+                    std::string &KernelName,
+                    offloading::amdgpu::AMDGPUKernelMetaData &KernelData) {
+    if (!V.first.isString())
+      return Error::success();
+
+    const auto IsKey = [](const msgpack::DocNode &DK, StringRef SK) {
+      return DK.getString() == SK;
+    };
+
+    const auto GetSequenceOfThreeInts = [](msgpack::DocNode &DN,
+                                           uint32_t *Vals) {
+      assert(DN.isArray() && "MsgPack DocNode is an array node");
+      auto DNA = DN.getArray();
+      assert(DNA.size() == 3 && "ArrayNode has at most three elements");
+
+      int I = 0;
+      for (auto DNABegin = DNA.begin(), DNAEnd = DNA.end(); DNABegin != DNAEnd;
+           ++DNABegin) {
+        Vals[I++] = DNABegin->getUInt();
+      }
+    };
+
+    if (IsKey(V.first, ".name")) {
+      KernelName = V.second.toString();
+    } else if (IsKey(V.first, ".sgpr_count")) {
+      KernelData.SGPRCount = V.second.getUInt();
+    } else if (IsKey(V.first, ".sgpr_spill_count")) {
+      KernelData.SGPRSpillCount = V.second.getUInt();
+    } else if (IsKey(V.first, ".vgpr_count")) {
+      KernelData.VGPRCount = V.second.getUInt();
+    } else if (IsKey(V.first, ".vgpr_spill_count")) {
+      KernelData.VGPRSpillCount = V.second.getUInt();
+    } else if (IsKey(V.first, ".agpr_count")) {
+      KernelData.AGPRCount = V.second.getUInt();
+    } else if (IsKey(V.first, ".private_segment_fixed_size")) {
+      KernelData.PrivateSegmentSize = V.second.getUInt();
+    } else if (IsKey(V.first, ".group_segment_fixed_size")) {
+      KernelData.GroupSegmentList = V.second.getUInt();
+    } else if (IsKey(V.first, ".reqd_workgroup_size")) {
+      GetSequenceOfThreeInts(V.second, KernelData.RequestedWorkgroupSize);
+    } else if (IsKey(V.first, ".workgroup_size_hint")) {
+      GetSequenceOfThreeInts(V.second, KernelData.WorkgroupSizeHint);
+    } else if (IsKey(V.first, ".wavefront_size")) {
+      KernelData.WavefrontSize = V.second.getUInt();
+    } else if (IsKey(V.first, ".max_flat_workgroup_size")) {
+      KernelData.MaxFlatWorkgroupSize = V.second.getUInt();
+    }
+
+    return Error::success();
+  }
+
+  /// Get the "amdhsa.kernels" element from the msgpack Document
+  Expected<msgpack::ArrayDocNode> getAMDKernelsArray(msgpack::MapDocNode &MDN) {
+    auto Res = MDN.find("amdhsa.kernels");
+    if (Res == MDN.end())
+      return createStringError(inconvertibleErrorCode(),
+                               "Could not find amdhsa.kernels key");
+
+    auto Pair = *Res;
+    assert(Pair.second.isArray() &&
+           "AMDGPU kernel entries are arrays of entries");
+
+    return Pair.second.getArray();
+  }
+
+  /// Iterate all entries for one "amdhsa.kernels" entry. Each entry is a
+  /// MapDocNode that either maps a string to a single value (most of them) or
+  /// to another array of things. Currently, we only handle the case that maps
+  /// to scalar value.
+  Error generateKernelInfo(msgpack::ArrayDocNode::ArrayTy::iterator It) {
+    offloading::amdgpu::AMDGPUKernelMetaData KernelData;
+    std::string KernelName;
+    auto Entry = (*It).getMap();
+    for (auto MI = Entry.begin(), E = Entry.end(); MI != E; ++MI)
+      if (auto Err = extractKernelData(*MI, KernelName, KernelData))
+        return Err;
+
+    KernelInfoMap.insert({KernelName, KernelData});
+    return Error::success();
+  }
+
+  /// Go over the list of AMD kernels in the "amdhsa.kernels" entry
+  Error iterateAMDKernels(msgpack::MapDocNode &MDN) {
+    auto KernelsOrErr = getAMDKernelsArray(MDN);
+    if (auto Err = KernelsOrErr.takeError())
+      return Err;
+
+    auto KernelsArr = *KernelsOrErr;
+    for (auto It = KernelsArr.begin(), E = KernelsArr.end(); It != E; ++It) {
+      if (!It->isMap())
+        continue; // we expect <key,value> pairs
+
+      // Obtain the value for the different entries. Each array entry is a
+      // MapDocNode
+      if (auto Err = generateKernelInfo(It))
+        return Err;
+    }
+    return Error::success();
+  }
+
+  // Kernel names are the keys
+  StringMap<offloading::amdgpu::AMDGPUKernelMetaData> &KernelInfoMap;
+};
+} // namespace
+
+Error llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
+    MemoryBufferRef MemBuffer,
+    StringMap<offloading::amdgpu::AMDGPUKernelMetaData> &KernelInfoMap,
+    uint16_t &ELFABIVersion) {
+  Error Err = Error::success(); // Used later as out-parameter
+
+  auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
+  if (auto Err = ELFOrError.takeError())
+    return Err;
+
+  const object::ELF64LEFile ELFObj = ELFOrError.get();
+  Expected<ArrayRef<object::ELF64LE::Shdr>> Sections = ELFObj.sections();
+  if (!Sections)
+    return Sections.takeError();
+  KernelInfoReader Reader(KernelInfoMap);
+
+  // Read the code object version from ELF image header
+  auto Header = ELFObj.getHeader();
+  ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
+  for (const auto &S : *Sections) {
+    if (S.sh_type != ELF::SHT_NOTE)
+      continue;
+
+    for (const auto N : ELFObj.notes(S, Err)) {
+      if (Err)
+        return Err;
+      // Fills the KernelInfoTabel entries in the reader
+      if ((Err = Reader.processNote(N, S.sh_addralign)))
+        return Err;
+    }
+  }
+  return Error::success();
+}
diff --git a/offload/plugins-nextgen/amdgpu/CMakeLists.txt b/offload/plugins-nextgen/amdgpu/CMakeLists.txt
@@ -10,11 +10,12 @@ target_include_directories(omptarget.rtl.amdgpu PRIVATE
 
 if(hsa-runtime64_FOUND AND NOT "amdgpu" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
   message(STATUS "Building AMDGPU plugin linked against libhsa")
-  target_link_libraries(omptarget.rtl.amdgpu PRIVATE hsa-runtime64::hsa-runtime64)
+  target_link_libraries(omptarget.rtl.amdgpu PRIVATE hsa-runtime64::hsa-runtime64 LLVMFrontendOffloading)
 else()
   message(STATUS "Building AMDGPU plugin for dlopened libhsa")
   target_include_directories(omptarget.rtl.amdgpu PRIVATE dynamic_hsa)
   target_sources(omptarget.rtl.amdgpu PRIVATE dynamic_hsa/hsa.cpp)
+  target_link_libraries(omptarget.rtl.amdgpu PRIVATE LLVMFrontendOffloading)
 endif()
 
 # Configure testing for the AMDGPU plugin. We will build tests if we could a

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -485,7 +485,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
   findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
 
   /// Get additional info for kernel, e.g., register spill counts
-  std::optional<utils::KernelMetaDataTy>
+  std::optional<offloading::amdgpu::AMDGPUKernelMetaData>
   getKernelInfo(StringRef Identifier) const {
     auto It = KernelInfoMap.find(Identifier);
 
@@ -499,7 +499,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
   /// The exectuable loaded on the agent.
   hsa_executable_t Executable;
   hsa_code_object_t CodeObject;
-  StringMap<utils::KernelMetaDataTy> KernelInfoMap;
+  StringMap<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfoMap;
   uint16_t ELFABIVersion;
 };
 
@@ -600,7 +600,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
   uint32_t ImplicitArgsSize;
 
   /// Additional Info for the AMD GPU Kernel
-  std::optional<utils::KernelMetaDataTy> KernelInfo;
+  std::optional<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfo;
 };
 
 /// Class representing an HSA signal. Signals are used to define dependencies
@@ -3188,9 +3188,9 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
         utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId));
     if (!TargeTripleAndFeaturesOrError)
       return TargeTripleAndFeaturesOrError.takeError();
-    return utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
-                                           ElfOrErr->getPlatformFlags(),
-                                           *TargeTripleAndFeaturesOrError);
+    return offloading::amdgpu::isImageCompatibleWithEnv(
+        Processor ? *Processor : "", ElfOrErr->getPlatformFlags(),
+        *TargeTripleAndFeaturesOrError);
   }
 
   bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {