From c92eaf8a12808a356489a46608c469ac7215f06a Mon Sep 17 00:00:00 2001
From: Cloud Han <cloudhan@outlook.com>
Date: Wed, 8 Jun 2022 22:06:38 +0800
Subject: [PATCH] Add a wrapper implementation for device link

---
 cuda/private/actions/dlink_wrapper.bzl       | 125 +++++++++++++++++++
 cuda/private/rules/cuda_library.bzl          |   2 +-
 cuda/private/toolchain_configs/nvcc.bzl      |   4 +
 cuda/private/toolchain_configs/nvcc_msvc.bzl |   4 +
 4 files changed, 134 insertions(+), 1 deletion(-)
 create mode 100644 cuda/private/actions/dlink_wrapper.bzl

diff --git a/cuda/private/actions/dlink_wrapper.bzl b/cuda/private/actions/dlink_wrapper.bzl
new file mode 100644
index 00000000..a1c9444c
--- /dev/null
+++ b/cuda/private/actions/dlink_wrapper.bzl
@@ -0,0 +1,125 @@
+""
+
+load("//cuda/private:action_names.bzl", "ACTION_NAMES")
+load("//cuda/private:actions/compile.bzl", "compile")
+load("//cuda/private:actions/dlink.bzl", compiler_device_link = "device_link")
+load("//cuda/private:cuda_helper.bzl", "cuda_helper")
+load("//cuda/private:toolchain.bzl", "find_cuda_toolkit")
+
+def device_link(
+        ctx,
+        cuda_toolchain,
+        cc_toolchain,
+        objects,
+        common,
+        pic = False,
+        rdc = False,
+        dlto = False):
+    """perform device link, return a dlink-ed object file"""
+    if not rdc:
+        fail("device link is only meaningful on building relocatable device code")
+
+    cuda_toolkit = find_cuda_toolkit(ctx)
+    cuda_feature_config = cuda_helper.configure_features(ctx, cuda_toolchain, requested_features = [ACTION_NAMES.device_link])
+    if cuda_helper.is_enabled(cuda_feature_config, "supports_compiler_device_link"):
+        return compiler_device_link(ctx, cuda_toolchain, cc_toolchain, objects, common, pic = pic, rdc = rdc, dlto = dlto)
+
+    if not cuda_helper.is_enabled(cuda_feature_config, "supports_wrapper_device_link"):
+        fail("toolchain is not configured to enable wrapper device link.")
+
+    actions = ctx.actions
+    pic_suffix = "_pic" if pic else ""
+
+    # Device-link to cubins for each gpu architecture. The stage1 compiled PTX is embeded in the object files.
+    # We don't need to do any thing about it, presumably.
+    register_h = None
+    cubins = []
+    images = []
+    obj_args = actions.args()
+    obj_args.add_all(objects)
+    for arch_spec in common.cuda_archs_info.arch_specs:
+        for stage2_arch in arch_spec.stage2_archs:
+            if stage2_arch.gpu:
+                arch = "sm_" + stage2_arch.arch
+            elif stage2_arch.lto:
+                arch = "lto_" + stage2_arch.arch
+            else:
+                # PTX is JIT-linked at runtime
+                continue
+
+            register_h = ctx.actions.declare_file("_dlink{suffix}/{0}/{0}_register_{1}.h".format(ctx.attr.name, arch, suffix = pic_suffix))
+            cubin = ctx.actions.declare_file("_dlink{suffix}/{0}/{0}_{1}.cubin".format(ctx.attr.name, arch, suffix = pic_suffix))
+            ctx.actions.run(
+                outputs = [register_h, cubin],
+                inputs = objects,
+                executable = cuda_toolkit.nvlink,
+                arguments = [
+                    "--arch=" + arch,
+                    "--register-link-binaries=" + register_h.path,
+                    "--output-file=" + cubin.path,
+                    obj_args,
+                ],
+                mnemonic = "nvlink",
+            )
+            cubins.append(cubin)
+            images.append("--image=profile={},file={}".format(arch, cubin.path))
+
+    # Generate fatbin header from all cubins.
+    fatbin = ctx.actions.declare_file("_dlink{suffix}/{0}/{0}.fatbin".format(ctx.attr.name, suffix = pic_suffix))
+    fatbin_h = ctx.actions.declare_file("_dlink{suffix}/{0}/{0}_fatbin.h".format(ctx.attr.name, suffix = pic_suffix))
+
+    arguments = [
+        "-64",
+        "--cmdline=--compile-only",
+        "--link",
+        "--compress-all",
+        "--create=" + fatbin.path,
+        "--embedded-fatbin=" + fatbin_h.path,
+    ]
+    bin2c = cuda_toolkit.bin2c
+    if (cuda_toolkit.version_major, cuda_toolkit.version_minor) <= (10, 1):
+        arguments.append("--bin2c-path=%s" % bin2c.dirname)
+    ctx.actions.run(
+        outputs = [fatbin, fatbin_h],
+        inputs = cubins,
+        executable = cuda_toolkit.fatbinary,
+        arguments = arguments + images,
+        tools = [bin2c],
+        mnemonic = "fatbinary",
+    )
+
+    # Generate the source file #including the headers generated above.
+    fatbin_c = ctx.actions.declare_file("_dlink{suffix}/{0}/{0}.cu".format(ctx.attr.name, suffix = pic_suffix))
+    ctx.actions.expand_template(
+        output = fatbin_c,
+        template = cuda_toolkit.link_stub,
+        substitutions = {
+            "REGISTERLINKBINARYFILE": '"{}"'.format(register_h.short_path),
+            "FATBINFILE": '"{}"'.format(fatbin_h.short_path),
+        },
+    )
+
+    # cc_common.compile will cause file conflict for pic and non-pic objects,
+    # and it accepts only one set of src files. But pic fatbin_c and non-pic
+    # fatbin_c have different compilation trajectories. This make me feel bad.
+    # Just avoid cc_common.compile at all.
+    compile_common = cuda_helper.create_common_info(
+        # this is useless
+        cuda_archs_info = common.cuda_archs_info,
+        headers = [fatbin_h],
+        defines = [
+            # Silence warning about including internal header.
+            "__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__",
+            # Macros that need to be defined starting with CUDA 10.
+            "__NV_EXTRA_INITIALIZATION=",
+            "__NV_EXTRA_FINALIZATION=",
+        ],
+        # TODO: avoid the hardcode path
+        includes = common.includes + ["external/local_cuda/cuda/include"],
+        system_includes = common.system_includes,
+        quote_includes = common.quote_includes,
+        # suppress cuda mode as c++ mode
+        host_compile_flags = common.host_compile_flags + ["-x", "c++"],
+    )
+    ret = compile(ctx, cuda_toolchain, cc_toolchain, srcs = [fatbin_c], common = compile_common, pic = pic, rdc = rdc)
+    return ret[0]
diff --git a/cuda/private/rules/cuda_library.bzl b/cuda/private/rules/cuda_library.bzl
index b7229927..9f92b9c7 100644
--- a/cuda/private/rules/cuda_library.bzl
+++ b/cuda/private/rules/cuda_library.bzl
@@ -3,7 +3,7 @@ load("//cuda/private:cuda_helper.bzl", "cuda_helper")
 load("//cuda/private:providers.bzl", "CudaInfo")
 load("//cuda/private:toolchain.bzl", "find_cuda_toolchain", "use_cpp_toolchain", "use_cuda_toolchain")
 load("//cuda/private:actions/compile.bzl", "compile")
-load("//cuda/private:actions/dlink.bzl", "device_link")
+load("//cuda/private:actions/dlink_wrapper.bzl", "device_link")
 load("//cuda/private:rules/common.bzl", "ALLOW_CUDA_HDRS", "ALLOW_CUDA_SRCS")
 
 def _cuda_library_impl(ctx):
diff --git a/cuda/private/toolchain_configs/nvcc.bzl b/cuda/private/toolchain_configs/nvcc.bzl
index 28671400..360c7ce4 100644
--- a/cuda/private/toolchain_configs/nvcc.bzl
+++ b/cuda/private/toolchain_configs/nvcc.bzl
@@ -115,6 +115,8 @@ def _impl(ctx):
         ],
     )
 
+    supports_compiler_device_link_feature = feature(name = "supports_compiler_device_link")
+
     cuda_device_link_action = action_config(
         action_name = ACTION_NAMES.device_link,
         flag_sets = [
@@ -150,6 +152,7 @@ def _impl(ctx):
             # "linker_input_flags",
             "compiler_output_flags",
             "nvcc_device_link_env",
+            "supports_compiler_device_link",
         ],
     )
 
@@ -396,6 +399,7 @@ def _impl(ctx):
         arch_native_feature,
         pic_feature,
         host_compiler_feature,
+        supports_compiler_device_link_feature,
         include_paths_feature,
         defines_feature,
         host_defines_feature,
diff --git a/cuda/private/toolchain_configs/nvcc_msvc.bzl b/cuda/private/toolchain_configs/nvcc_msvc.bzl
index 88427892..c4b59914 100644
--- a/cuda/private/toolchain_configs/nvcc_msvc.bzl
+++ b/cuda/private/toolchain_configs/nvcc_msvc.bzl
@@ -114,6 +114,8 @@ def _impl(ctx):
         ],
     )
 
+    supports_compiler_device_link_feature = feature(name = "supports_compiler_device_link")
+
     cuda_device_link_action = action_config(
         action_name = ACTION_NAMES.device_link,
         flag_sets = [
@@ -149,6 +151,7 @@ def _impl(ctx):
             # "linker_input_flags",
             "compiler_output_flags",
             "nvcc_compile_env",
+            "supports_compiler_device_link",
         ],
     )
 
@@ -476,6 +479,7 @@ def _impl(ctx):
     features = [
         nvcc_compile_env_feature,
         host_compiler_feature,
+        supports_compiler_device_link_feature,
         use_local_env_feature,
         arch_native_feature,
         default_compile_flags_feature,