intel · asudarsa · Nov 11, 2022 · Dec 15, 2022 · Dec 21, 2022 · Dec 21, 2022
@@ -9,7 +9,6 @@
 // This coordinates the per-function state used while generating code.
 //
 //===----------------------------------------------------------------------===//
-
 #include "CodeGenFunction.h"
 #include "CGBlocks.h"
 #include "CGCUDARuntime.h"
@@ -583,11 +582,24 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
     && !FD->hasAttr<SYCLDeviceAttr>())
     return;
 
-  // TODO Module identifier is not reliable for this purpose since two modules
-  // can have the same ID, needs improvement
-  if (getLangOpts().SYCLIsDevice)
+
+  if (getLangOpts().SYCLIsDevice) {
+    // TODO Module identifier is not reliable for this purpose since two modules
+    // can have the same ID, needs improvement
     Fn->addFnAttr("sycl-module-id", Fn->getParent()->getModuleIdentifier());
-
+    int SYCLDeviceCompileOptLevel = -1;
+    switch (CGM.getCodeGenOpts().OptimizationLevel) {
+    default:
+      llvm_unreachable("Invalid optimization level!");
+    case 0:
+    case 1:
+    case 2:
+    case 3:
+      SYCLDeviceCompileOptLevel = CGM.getCodeGenOpts().OptimizationLevel;
+    }
-    int SYCLDeviceCompileOptLevel = -1;
-    switch (CGM.getCodeGenOpts().OptimizationLevel) {
-    default:
-      llvm_unreachable("Invalid optimization level!");
-    case 0:
-    case 1:
-    case 2:
-    case 3:
-      SYCLDeviceCompileOptLevel = CGM.getCodeGenOpts().OptimizationLevel;
-    }
+    int SYCLDeviceCompileOptLevel = CGM.getCodeGenOpts().OptimizationLevel;
+    if (SYCLDeviceCompileOptLevel < 0) ...;
-    int SYCLDeviceCompileOptLevel = -1;
-    switch (CGM.getCodeGenOpts().OptimizationLevel) {
-    default:
-      llvm_unreachable("Invalid optimization level!");
-    case 0:
-    case 1:
-    case 2:
-    case 3:
-      SYCLDeviceCompileOptLevel = CGM.getCodeGenOpts().OptimizationLevel;
-    }
+    int SYCLDeviceCompileOptLevel = CGM.getCodeGenOpts().OptimizationLevel;
+    if (SYCLDeviceCompileOptLevel < 0) ...;
+    if (SYCLDeviceCompileOptLevel >= 0)
+      Fn->addFnAttr("sycl-device-compile-optlevel", std::to_string(SYCLDeviceCompileOptLevel));
+  }
   llvm::LLVMContext &Context = getLLVMContext();
 
   if (FD->hasAttr<OpenCLKernelAttr>() || FD->hasAttr<CUDAGlobalAttr>())

@@ -43,6 +43,9 @@ constexpr char ESIMD_MARKER_MD[] = "sycl_explicit_simd";
 
 constexpr char ATTR_SYCL_MODULE_ID[] = "sycl-module-id";
 
+// Similar copy in sycl-post-link.cpp
+constexpr char ATTR_OPT_LEVEL[] = "sycl-device-compile-optlevel";
+
 bool hasIndirectFunctionsOrCalls(const Module &M) {
   for (const auto &F : M.functions()) {
     // There are functions marked with [[intel::device_indirectly_callable]]
@@ -261,6 +264,44 @@ EntryPointGroupVec groupEntryPointsByScope(ModuleDesc &MD,
   return EntryPointGroups;
 }
 
+template <class EntryPoinGroupFunc>
+EntryPointGroupVec
+groupEntryPointsByOptLevel(ModuleDesc &MD, StringRef AttrName,
+                            bool EmitOnlyKernelsAsEntryPoints,
+                            EntryPoinGroupFunc F) {
+  EntryPointGroupVec EntryPointGroups{};
+  std::map<StringRef, EntryPointSet> EntryPointMap;
+  Module &M = MD.getModule();
+
+  // Only process module entry points:
+  for (auto &F : M.functions()) {
+    if (!isEntryPoint(F, EmitOnlyKernelsAsEntryPoints) ||
+        !MD.isEntryPointCandidate(F)) {
+      continue;
+    }
+    if (F.hasFnAttribute(AttrName)) {
+      SmallString<16> stringConstName;
+      StringRef OptLevelStr = F.getFnAttribute(AttrName).getValueAsString();
+      EntryPointMap[OptLevelStr].insert(&F);
+    } else {
+      EntryPointMap["-1"].insert(&F);
+    }
+  }
+  if (!EntryPointMap.empty()) {
+    EntryPointGroups.reserve(EntryPointMap.size());
+    for (auto &EPG : EntryPointMap) {
+      EntryPointGroups.emplace_back(EntryPointGroup{
+          EPG.first, std::move(EPG.second), MD.getEntryPointGroup().Props});
+      F(EntryPointGroups.back());
+    }
+  } else {
+    // No entry points met, record this.
+    EntryPointGroups.push_back({GLOBAL_SCOPE_NAME, {}});
+    F(EntryPointGroups.back());
+  }
+  return EntryPointGroups;
+}
+
 // Represents a call graph between functions in a module. Nodes are functions,
 // edges are "calls" relation.
 class CallGraph {
@@ -861,5 +902,28 @@ getSplitterByOptionalFeatures(ModuleDesc &&MD,
     return std::make_unique<ModuleCopier>(std::move(MD), std::move(Groups));
 }
 
+std::unique_ptr<ModuleSplitterBase>
+getOptLevelSplitter(ModuleDesc &&MD, bool EmitOnlyKernelsAsEntryPoints) {
+  EntryPointGroupVec Groups = groupEntryPointsByOptLevel(
+      MD, ATTR_OPT_LEVEL, EmitOnlyKernelsAsEntryPoints,
+      [](EntryPointGroup &G) {
+        if (G.GroupId == "3")
+          G.Props.OptLevel = 3;
+        else if (G.GroupId == "2")
+          G.Props.OptLevel = 2;
+        else if (G.GroupId == "1")
+          G.Props.OptLevel = 1;
+        else if (G.GroupId == "0")
+          G.Props.OptLevel = 0;
+      });
+  assert(!Groups.empty() && "At least one group is expected");
+  assert(Groups.size() <= 2 && "At most 2 groups are expected");
+
+  if (Groups.size() > 1)
+    return std::make_unique<ModuleSplitter>(std::move(MD), std::move(Groups));
+  else
+    return std::make_unique<ModuleCopier>(std::move(MD), std::move(Groups));
+}
+
 } // namespace module_split
 } // namespace llvm
@@ -60,12 +60,17 @@ struct EntryPointGroup {
     // Scope represented by EPs in a group
     EntryPointsGroupScope Scope = Scope_Global;
 
+    // opt level
+    int OptLevel = -1;
+
     Properties merge(const Properties &Other) const {
       Properties Res;
       Res.HasESIMD = HasESIMD == Other.HasESIMD
                          ? HasESIMD
                          : SyclEsimdSplitStatus::SYCL_AND_ESIMD;
       Res.UsesLargeGRF = UsesLargeGRF || Other.UsesLargeGRF;
+      // TODO What do we do about optimization levels while merging?
+      // Opt Level remains at '-1'
       // Scope remains global
       return Res;
     }
@@ -93,6 +98,9 @@ struct EntryPointGroup {
   // Tells if some entry points use large GRF mode.
   bool isLargeGRF() const { return Props.UsesLargeGRF; }
 
+  // Get opt level.
+  int getOptLevel() const { return Props.OptLevel; }
+
   void saveNames(std::vector<std::string> &Dest) const;
   void rebuildFromNames(const std::vector<std::string> &Names, const Module &M);
   void rebuild(const Module &M);
@@ -147,6 +155,7 @@ class ModuleDesc {
   bool isESIMD() const { return EntryPoints.isEsimd(); }
   bool isSYCL() const { return EntryPoints.isSycl(); }
   bool isLargeGRF() const { return EntryPoints.isLargeGRF(); }
+  int getOptLevel() const { return EntryPoints.getOptLevel(); }
 
   const EntryPointSet &entries() const { return EntryPoints.Functions; }
   const EntryPointGroup &getEntryPointGroup() const { return EntryPoints; }
@@ -256,6 +265,9 @@ std::unique_ptr<ModuleSplitterBase>
 getSplitterByOptionalFeatures(ModuleDesc &&MD,
                               bool EmitOnlyKernelsAsEntryPoints);
 
+std::unique_ptr<ModuleSplitterBase>
+getOptLevelSplitter(ModuleDesc &&MD, bool EmitOnlyKernelsAsEntryPoints);
+
 #ifndef NDEBUG
 void dumpEntryPoints(const EntryPointSet &C, const char *msg = "", int Tab = 0);
 void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false,

@@ -91,6 +91,9 @@ constexpr char COL_CODE[] = "Code";
 constexpr char COL_SYM[] = "Symbols";
 constexpr char COL_PROPS[] = "Properties";
 
+// Similar copy in ModuleSplitter.cpp
+constexpr char ATTR_OPT_LEVEL[] = "sycl-device-compile-optlevel";
+
 // InputFilename - The filename to read from.
 cl::opt<std::string> InputFilename{cl::Positional,
                                    cl::desc("<input bitcode file>"),
@@ -449,6 +452,11 @@ std::string saveModuleProperties(module_split::ModuleDesc &MD,
       ProgramMetadata.insert({MetadataNames.back(), GV.getName()});
     }
   }
+
+  auto OptLevel = MD.getOptLevel();
+  if (OptLevel >= 0)
+    PropSet[PropSetRegTy::SYCL_MISC_PROP].insert({"OptLevel", OptLevel});
+
   if (MD.isESIMD()) {
     PropSet[PropSetRegTy::SYCL_MISC_PROP].insert({"isEsimdImage", true});
   }
@@ -775,6 +783,7 @@ processInputModule(std::unique_ptr<Module> M) {
 
   const bool SplitByScope = ScopedSplitter->remainingSplits() > 1;
   bool SplitByOptionalFeatures = false;
+  bool SplitByOptLevel = false;
 
   while (ScopedSplitter->hasMoreSplits()) {
     module_split::ModuleDesc MD = ScopedSplitter->nextSplit();
@@ -797,12 +806,22 @@ processInputModule(std::unique_ptr<Module> M) {
     SplitByOptionalFeatures |= OptionalFeaturesSplitter->remainingSplits() > 1;
 
     while (OptionalFeaturesSplitter->hasMoreSplits()) {
-      TopLevelModules.emplace_back(OptionalFeaturesSplitter->nextSplit());
+      // Here, we perform third-level splitting based on optimization level.
+      // This step is mandatory, as optimization level is at module level.
+      module_split::ModuleDesc MDesc = OptionalFeaturesSplitter->nextSplit();
+      std::unique_ptr<module_split::ModuleSplitterBase> OptLevelSplitter =
+        module_split::getOptLevelSplitter(std::move(MDesc),
+                                          EmitOnlyKernelsAsEntryPoints);
+      SplitByOptLevel |= OptLevelSplitter->remainingSplits() > 1;
+      while (OptLevelSplitter->hasMoreSplits()) {
+        TopLevelModules.emplace_back(OptLevelSplitter->nextSplit());
+      }
     }
   }
 
   Modified |= SplitByScope;
   Modified |= SplitByOptionalFeatures;
+  Modified |= SplitByOptLevel;
 
   // TODO this nested splitting scheme will not scale well when other split
   // "dimensions" will be added. Some infra/"split manager" needs to be

@@ -0,0 +1,71 @@
+# Propagation of optimization levels used by front-end compiler to linker and backend compiler
+
+In order to ease the process of debugging, there is a user requirement to compile different modules with different levels of optimization. This document proposes a compiler flow that will enable propagation of compiler options specified for front-end to the linker and runtimes and eventually to the backend. Currently, only O0/O1/O2/O3 options are handled.
+
+**NOTE**: This is not a final version. The document is still in progress.
+
+## Background
+
+When building an application with several source and object files, it should be possible to specify the optimization parameters individually for each source file/object file (for each invocation of the DPCPP compiler). The linker should pass the original optimization options (e.g. -O0 or -O2) used when building an object file to the device backend compiler (IGC compiler). This will improve the debugging experience by selectively disabling/enabling optimizations for each source file, and therefore achieving better debuggability and better performance as needed.
+
+The current behavior, is that the device backend optimization options are determined by the linker optimization options. If the -O0 option is specified for linker, the linker will pass -cl-opt-disable option to IGC for {*}all kernels{*}, essentially disabling optimizations globally. Otherwise, if the -O0 option is not specified for linker, it will not pass -cl-opt-disable option at all, therefore making the kernels mostly undebuggable, regardless of the original front-end compiler options.
+
+Here is an example that demonstrates this pain point:
+
+```
+icx -c -fsycl test1.c -o test1
+icx -c -O0 -fsycl test2.c -o test2
+icx -fsycl -o test test1.o test2.o
+```
+
+In this scenario, the fat binary is 'test' and there are no compilation flags sent across to the backend compiler. Though the user wanted to have full debuggability with test2.c module, some of the debuggablity is lost.
+
+Another scenario is shown below:
+
+```
+icpx -c -O0 -fsycl -g test.cpp -o test.o
+icpx -fsycl test.o
+```
+
+In this scenario, the fat binary is 'test' and there are no compilation flags sent across to the backend compiler. Though the user wanted to have full debuggability with test.cpp module, some of the debuggablity is lost. The user was not able to set a breakpoint inside device code.
+
+## Requirements
+
+In order to support module-level debuggability, the user will compile different module files with different levels of optimization. These optimization levels must be preserved and made use of during every stage of compilation. Following are the requirements for this feature.
+- If the user specifies '-Ox' as a front-end compile option for a particular module, this option must be preserved during compilation, linking, AOT compilation as well as JIT compilation.
+- If the user specifies '-Ox' option as a front-end linker option, this option will override any front-end compile options and the linker option will be preserved during AOT and JIT compilation.
+- If the user specifies '-O0' option, we need to pass '-cl-opt-disable' to AOT and JIT compilation stages.
+
+## Use case
+
+Following is a possible use case:
+
+```
+A list of modules:
+test1.cpp
+test2.cpp
+test3.cpp
+```
+
+```
+Following are the compilation steps:
+# compiling
+icpx -c -O0 -fsycl test1.cpp -o test1.o
+icpx -c -O3 -fsycl test2.cpp -o test2.o
+icpx -c -fsycl test3.cpp -o test3.o
+# linking
+icpx -o test -fsycl test1.o test2.o test3.o
+# JIT compilation (For GPU backends, this calls igc-standalone compiler in the background)
+./test
+```
+
+Since we have three modules with three different compiler options, we will need to end up with three device binaries, each with their own compiler option specified.
+
+## Proposed design
+
+Following are changes required in various stages of the compilation pipeline:
+- Front-end code generation: For each SYCL kernel, identify the compilation option. Add an appropriate attribute to that kernel. Name of that attribute is 'sycl-device-compile-optlevel'.
+- During the llvm-link stage, all modules are linked into a single module. This is an existing behavior.
+- During sycl-post-link stage, we first split the kernels into multiple modules based on their optimization level. For each split module, an entry corresponding to its optimization level is made in its .props file.
+- During ocloc call generation, the .props file will be parsed and appropriate option will be added to the list of compiler options.
+- In SYCL runtime, logic will be added to program manager to parse the .props file, extract the optimization level, and add '-cl-opt-disable' if the optimization level is 0. Otherwise, we do nothing.
@@ -44,6 +44,7 @@ Design Documents for the oneAPI DPC++ Compiler
    design/CompileTimeProperties
    design/ESIMDStatelesAccessors
    design/DeviceIf
+   design/PropagateCompilerFlagsToLinkerAndRuntime
    New OpenCL Extensions <https://github.com/intel/llvm/tree/sycl/sycl/doc/design/opencl-extensions>
    New SPIR-V Extensions <https://github.com/intel/llvm/tree/sycl/sycl/doc/design/spirv-extensions>
 

@@ -407,6 +407,14 @@ static void appendCompileOptionsFromImage(std::string &CompileOpts,
   // TODO: Remove isDoubleGRF check in next ABI break
   bool isLargeGRF = getUint32PropAsBool(Img, "isLargeGRF") ||
                     getUint32PropAsBool(Img, "isDoubleGRF");
+  pi_device_binary_property Prop = Img.getProperty("OptLevel");
+  int OptLevel = Prop ? DeviceBinaryProperty(Prop).asUint32() : -1;
+  std::string OptLevelStr = "";
+  // Currently, we do not do anything for other opt levels
+  // TODO: Figure out a way to send some info across for other opt levels.
+  if (OptLevel == 0)
+    OptLevelStr = "-cl-opt-disable";
+
   // The -vc-codegen option is always preserved for ESIMD kernels, regardless
   // of the contents SYCL_PROGRAM_COMPILE_OPTIONS environment variable.
   if (isEsimdImage) {
@@ -425,6 +433,11 @@ static void appendCompileOptionsFromImage(std::string &CompileOpts,
     // is fixed
     CompileOpts += isEsimdImage ? "-doubleGRF" : "-ze-opt-large-register-file";
   }
+  if (!OptLevelStr.empty()) {
+    if (!CompileOpts.empty())
+      CompileOpts += " ";
+    CompileOpts += OptLevelStr;
+  }
 }
 
 static void applyOptionsFromImage(std::string &CompileOpts,

@@ -0,0 +1,32 @@
+// RUN: %clangxx %s -O0 -S -o %t.ll -fsycl-device-only
+// RUN: FileCheck %s --input-file %t.ll -check-prefixes=CHECK-IR
+// CHECK-IR: define weak_odr dso_local spir_kernel void @{{.*}}main{{.*}}sycl{{.*}}handler{{.*}}() #[[ATTR:[0-9]+]]
+// CHECK-IR: attributes #[[ATTR]] = { {{.*}} "sycl-device-compile-optlevel"="0" {{.*}}}
+
+// RUN: %clangxx %s -O0 -o %t.bc -fsycl-device-only
+// RUN: sycl-post-link -split=source -symbols -S %t.bc -o %t.table
+// RUN: FileCheck %s -input-file=%t.table
+// RUN: FileCheck %s -input-file=%t_0.prop --check-prefixes CHECK-OPT-LEVEL-PROP
+
+// CHECK: [Code|Properties|Symbols]
+// CHECK: {{.*}}_0.ll|{{.*}}_0.prop|{{.*}}_0.sym
+
+// CHECK-OPT-LEVEL-PROP: OptLevel=1|0
+
+// This test checks adding of the attribute 'sycl-device-compile-optlevel'
+// by the clang front-end
+// This test also checks parsing of the attribute 'sycl-device-compile-optlevel'
+// by the sycl-post-link-tool:
+// Splitting happens as usual.
+// - sycl-post-link adds 'OptLevel' property to the device binary
+
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue q;
+  q.submit([&](sycl::handler &h) {
+    h.single_task([=]() {});
+  });
+  return 0;
+}
+