
Commit bd0f318

masahi authored and vinx13 committed
[BYOC] Add CUTLASS backend (apache#380)
* Add CUTLASS backend
* fix
* Wrap and annotate in FuseOpsByPattern optionally
* fix
* black
* add test for FuseOpsByPattern change
* black
* ignore 3rd party in pylint
* fix test
* another unused var warning
* Update include/tvm/relax/transform.h

Co-authored-by: Wuwei Lin <vincentl13x@gmail.com>

* fix for v3
* fix for int8 test in relay byoc
* more fix for cutlass update
* fix residual block fusion offload
* fix test

---------

Co-authored-by: Wuwei Lin <vincentl13x@gmail.com>
1 parent cdc73ec commit bd0f318

File tree

17 files changed (+1021, -199 lines)


cmake/modules/contrib/CUTLASS.cmake

Lines changed: 2 additions & 2 deletions
@@ -16,8 +16,8 @@
 # under the License.

 if(USE_CUDA AND USE_CUTLASS)
-  tvm_file_glob(GLOB CUTLASS_RELAY_CONTRIB_SRC src/relay/backend/contrib/cutlass/*.cc)
-  list(APPEND COMPILER_SRCS ${CUTLASS_RELAY_CONTRIB_SRC})
+  tvm_file_glob(GLOB CUTLASS_CONTRIB_SRC src/relay/backend/contrib/cutlass/*.cc src/relax/backend/contrib/cutlass/*.cc)
+  list(APPEND COMPILER_SRCS ${CUTLASS_CONTRIB_SRC})

   message(STATUS "Build with CUTLASS")
 endif()
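With USE_CUDA and USE_CUTLASS both ON, the glob now also picks up the new Relax sources under src/relax/backend/contrib/cutlass/. A quick sanity check of the resulting build, a minimal sketch using the existing has_cutlass() helper from tvm.contrib.cutlass:

# Minimal sketch: verify that this TVM build was configured with CUTLASS support.
from tvm.contrib.cutlass import has_cutlass

assert has_cutlass(), "rebuild TVM with USE_CUDA=ON and USE_CUTLASS=ON"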

gallery/how_to/work_with_relay/using_pipeline_executor.py

Lines changed: 1 addition & 7 deletions
@@ -29,12 +29,7 @@
 from tvm import relay
 from tvm.relay import testing
 import tvm.testing
-from tvm.contrib.cutlass import (
-    has_cutlass,
-    num_cutlass_partitions,
-    finalize_modules,
-    finalize_modules_vm,
-)
+from tvm.contrib.cutlass import finalize_modules

 img_size = 8
 #######################################################################
@@ -50,7 +45,6 @@ def get_network():
         "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), "float16")
     )
     weight = relay.var("weight")
-    second_weight = relay.var("second_weight")
     bn_gamma = relay.var("bn_gamma")
     bn_beta = relay.var("bn_beta")
     bn_mmean = relay.var("bn_mean")

include/tvm/relax/transform.h

Lines changed: 7 additions & 6 deletions
@@ -183,21 +183,22 @@ TVM_DLL Pass FuseOps(int fuse_opt_level = -1);
  * of a fused function after successful matching.
  * \param patterns The patterns to detect. The order of the patterns determines the order
  * of priority in which they are matched. Higher-priority patterns should come earlier in the list.
+ * \param annotate_codegen If true, wrap each created composite function with another function,
+ * whose body consists only of a call to the composite function, and annotate the outer function
+ * with kCodegen and kGlobalSymbol attributes. The kCodegen attribute is set as the prefix of the
+ * corresponding pattern name. For example, "dnnl" if the pattern name is "dnnl.conv2d_relu".
+ * This must be True if the created composite functions are intended to be offloaded to
+ * an external backend without using the MergeCompositeFunctions pass.
  * \return The Pass.
  */
 TVM_DLL Pass FuseOpsByPattern(const tvm::Array<runtime::String>& pattern_names,
-                              const tvm::Array<DFPattern>& patterns);
+                              const tvm::Array<DFPattern>& patterns, bool annotate_codegen = false);

 /*!
  * \brief Group one or multiple composite functions created by FuseOpsByPattern into a new
  * function. The new function will be annotated with kCodegen and GlobalSymbol attributes,
  * and it is intented to be offloaded to an external backend.
  *
- * Even if there is only one composite function, or a backend does not benefit from receiving
- * larger subgraphs, this pass is required to run for offloading (BYOC) since a composite function
- * needs to be wrapped by an outer function that are annotated with "Codegen" and "global_symbol"
- * attributes.
- *
  * \return The Pass.
  */
 TVM_DLL Pass MergeCompositeFunctions();
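A rough illustration of what the new annotate_codegen parameter produces, exercised through the Python binding updated later in this commit. The pattern name "dnnl.conv2d_relu" and the variables mod and pattern are hypothetical placeholders, not code from the commit:

# Hypothetical sketch: with annotate_codegen=True, every composite function is
# wrapped by an outer function carrying the attributes BYOC offload requires.
from tvm import relax

mod = relax.transform.FuseOpsByPattern(
    [("dnnl.conv2d_relu", pattern)], annotate_codegen=True
)(mod)

for gvar, func in mod.functions.items():
    if "Codegen" in func.attrs:
        # Codegen holds the pattern-name prefix ("dnnl"); the wrapper's body is
        # a single call into the inner Composite="dnnl.conv2d_relu" function.
        print(gvar, func.attrs["Codegen"], func.attrs["global_symbol"])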

python/tvm/contrib/cutlass/build.py

Lines changed: 191 additions & 14 deletions
@@ -14,13 +14,13 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=invalid-name, dangerous-default-value
+# pylint: disable=invalid-name, dangerous-default-value, arguments-differ
 """Driver for partitioning and building a Relay module for CUTLASS offload."""
 import logging
 import os
 import multiprocessing
 import tvm
-from tvm import runtime, relay
+from tvm import runtime, relay, relax
 from tvm.contrib.nvcc import get_cuda_version
 from tvm._ffi.registry import register_func
 from .gen_gemm import CutlassGemmProfiler
@@ -516,6 +516,167 @@ def tune_cutlass_function(
     )


+def _extract_relax_function_info(f):
+    signature = {}
+
+    for i, arg in enumerate(f.params):
+        sinfo = arg.struct_info
+        signature["arg%d_shape" % i] = list(sinfo.shape)
+        signature["arg%d_dtype" % i] = sinfo.dtype
+
+    ret_sinfo = f.ret_struct_info
+    signature["ret_shape"] = list(ret_sinfo.shape)
+    signature["ret_dtype"] = ret_sinfo.dtype
+
+    op_attrs = {}
+
+    def fvisit(e):
+        nonlocal op_attrs
+        if isinstance(e, relax.Call) and str(e.op) in ["relax.nn.conv2d"]:
+            op_attrs = e.attrs
+
+    relax.analysis.post_order_visit(f.body, fvisit)
+
+    return signature, op_attrs
+
+
+@relax.expr_functor.mutator
+class CutlassRelaxFunctionAnnotator(relax.PyExprMutator):
+    """A Relax function mutator that tunes and annotates CUTLASS composite functions
+    with shape, dtype and generated templates.
+    """
+
+    def __init__(self, mod, conv2d_profiler, options):
+        super().__init__(mod)
+        self.options = options
+        self.conv2d_profiler = conv2d_profiler
+
+    def handle_conv2d(self, f, op_type):
+        """Tune and annotate a conv2d op."""
+        signature, op_attrs = _extract_relax_function_info(f)
+
+        d_shape = signature["arg0_shape"]
+        w_shape = signature["arg1_shape"]
+        out_shape = signature["ret_shape"]
+        data_dtype = signature["arg0_dtype"]
+        weight_dtype = signature["arg1_dtype"]
+        out_dtype = signature["ret_dtype"]
+        padding = op_attrs["padding"]
+        strides = op_attrs["strides"]
+        dilation = op_attrs["dilation"]
+        conv_kind = ConvKind.Fprop
+
+        use_3xtf32 = self.options.get("use_3xtf32", False)
+        profile_all_alignments = self.options.get("profile_all_alignments", False)
+        find_first_valid = self.options.get("find_first_valid", True)
+        use_multiprocessing = self.options.get("use_multiprocessing", True)
+        split_k_slices = self.options.get("split_k_slices", [1])
+
+        op_name, op_def, _ = self.conv2d_profiler.profile(
+            op_type,
+            d_shape,
+            w_shape,
+            padding,
+            strides,
+            dilation,
+            out_dtype,
+            data_dtype,
+            weight_dtype,
+            use_3xtf32,
+            conv_kind,
+            split_k_slices,
+            profile_all_alignments,
+            find_first_valid=find_first_valid,
+            use_multiprocessing=use_multiprocessing,
+        )
+
+        return f.with_attrs(
+            {
+                "op_type": op_type,
+                "arg0_dtype": data_dtype,
+                "arg1_dtype": weight_dtype,
+                "ret_dtype": out_dtype,
+                "arg0_shape": d_shape,
+                "arg1_shape": w_shape,
+                "ret_shape": out_shape,
+                "strides": strides,
+                "padding": padding,
+                "dilation": dilation,
+                "cutlass_op_name": op_name,
+                "cutlass_op_def": op_def,
+            }
+        )
+
+    def visit_function_(self, f):
+        if "Composite" not in f.attrs:
+            body = super().visit_expr(f.body)
+            return relax.Function(f.params, body, f.ret_struct_info, f.attrs, f.span)
+
+        op_type = f.attrs["Composite"]
+
+        if "conv2d" in op_type:
+            return self.handle_conv2d(f, op_type)
+
+        raise ValueError("Unsupported composite {}".format(op_type))
+
+    def visit_span(self, span):
+        return span
+
+
+@register_func("contrib.cutlass.tune_relax_function")
+def profile_relax_function(functions, options):
+    """Tune and annotate CUTLASS composite functions with shape, dtype and generated templates."""
+    tmp_dir = options.get("tmp_dir", "./tmp")
+    sm = options.get("sm", 80)
+    conv2d_profiler = CutlassConv2DProfiler(sm, _get_cutlass_path(), tmp_dir)
+
+    annotated_functions = []
+
+    for f in functions:
+        annotator = CutlassRelaxFunctionAnnotator(
+            tvm.IRModule.from_expr(f), conv2d_profiler, options
+        )
+        annotated_functions.append(annotator.visit_expr(f))
+
+    return annotated_functions
+
+
+@register_func("contrib.cutlass.compile")
+def compile_cutlass_module(c_source_module, options):
+    """Compile all CUTLASS kernels in the given C-source module.
+
+    Parameters
+    ----------
+    c_source_module: runtime.Module
+        A C-source module containing CUTLASS kernels.
+
+    options: dict
+        Compilation options. Currently recognizes
+          "sm": The target architecture (compute capability), for example 75 or 80 (default: 80)
+          "threads": The number of threads to use in NVCC parallel compilation (default:
+                     use all logical cores)
+          "use_fast_math": Whether or not to use faster but approximate arithmetic in some
+                           CUTLASS epilogues (default: False)
+
+    Returns
+    -------
+    rt_mod : runtime.Module
+        A runtime module where all cutlass kernels have been compiled.
+    """
+    tmp_dir = options.get("tmp_dir", "./tmp")
+    defaults = {"sm": 80, "threads": -1, "use_fast_math": False}
+    compile_config = {key: options.get(key, val) for key, val in defaults.items()}
+
+    function_names = c_source_module.get_function("get_func_names")()
+    compile_options = _get_cutlass_compile_options(**compile_config)
+    lib_path = os.path.join(tmp_dir, "cutlass.o")
+    logger.info("Compiling generated CUTLASS code")
+    c_source_module.export_library(lib_path, workspace_dir=tmp_dir, **compile_options)
+
+    # Recover static library
+    return tvm.runtime.load_static_library(lib_path, function_names)
+
+
 @register_func("relay.ext.cutlass.compile_for_cutlass")
 def compile_for_cutlass(mod, cutlass_target):
     """Given an IRModule with at least one Compiler='cutlass' Relay function, return a
@@ -549,6 +710,7 @@ def compile_for_cutlass(mod, cutlass_target):
         key: cutlass_target.attrs.get(key) for key in ["sm", "threads", "use_fast_math"]
     }
     tmp_dir = cutlass_target.attrs.get("tmp_dir")
+    compile_config["tmp_dir"] = tmp_dir

     # Tune
     logger.info("Tuning for CUTLASS")
@@ -558,18 +720,7 @@ def compile_for_cutlass(mod, cutlass_target):
     logger.info("Creating CSource module for CUTLASS")
     create_c_source_module = tvm._ffi.get_global_func("relay.ext.cutlass.create_c_source_module")
     c_module = create_c_source_module(mod)
-    function_names = c_module.get_function("get_func_names")()
-    compile_options = _get_cutlass_compile_options(**compile_config)
-    lib_path = os.path.join(tmp_dir, "cutlass.o")
-    logger.info("Compiling generated CUTLASS code")
-    c_module.export_library(lib_path, workspace_dir=tmp_dir, **compile_options)
-
-    # Recover static library
-    logger.info("Loading compiled CUTLASS code")
-    final_mod = tvm.runtime.load_static_library(lib_path, function_names)
-
-    logger.info("Done with CUTLASS compilation")
-    return final_mod
+    return compile_cutlass_module(c_module, compile_config)


 def finalize_modules(lib, lib_path="compile.so", tmp_dir="./tmp"):
@@ -633,3 +784,29 @@ def finalize_modules_vm(vm_exec, lib_path="compile.so", vmcode_path="vmcode.ro",
         fo.write(code)
     lib = tvm.runtime.load_module(lib_path)
     return tvm.runtime.vm.Executable.load_exec(code, lib)
+
+
+def finalize_modules_relax(vm_exec, lib_path="compile.so", tmp_dir="./tmp"):
+    """finalize_modules_vm equivalent for Relax VM.
+
+    Parameters
+    ----------
+    vm_exec : vm.Executable
+        The output from relax.vm.build containing compiled host code and kernels.
+
+    lib_path : string
+        The path to a shared library which will be generated as the result of the build process.
+
+    tmp_dir : string
+        A temporary directory where intermediate compiled artifacts will be stored.
+
+    Returns
+    -------
+    updated_vm_exec : relax.vm.Executable
+        The updated VM executable with all compilation and linking completed.
+    """
+    lib_path = os.path.join(tmp_dir, lib_path)
+    vm_exec.mod.export_library(lib_path, workspace_dir=tmp_dir, cc="nvcc")
+    lib = tvm.runtime.load_module(lib_path)
+
+    return relax.vm.Executable(lib)
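Taken together, the additions above suggest the following end-to-end Relax flow. This is a hedged sketch rather than code from the commit: mod and patterns are assumed to exist, and relax.vm.build is the builder named in the finalize_modules_relax docstring:

# Sketch of the Relax CUTLASS offload path (assumed inputs: `mod`, `patterns`).
import tvm
from tvm import relax
from tvm.contrib.cutlass import finalize_modules_relax

# Group CUTLASS-supported ops into annotated composite functions.
mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)

# Building for CUDA is expected to trigger the hooks registered above
# ("contrib.cutlass.tune_relax_function" and "contrib.cutlass.compile").
exe = relax.vm.build(mod, target="cuda")

# Link the generated kernels into a shared library and reload the executable.
exe = finalize_modules_relax(exe)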

python/tvm/contrib/cutlass/conv2d_profiler.py

Lines changed: 3 additions & 3 deletions
@@ -35,15 +35,15 @@ def __init__(self):
       cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size),
       {
         reinterpret_cast<ImplicitGemm::ElementC*> (workspace.get()),
-        ReductionStrideIndex(tensor_c.stride()[ImplicitGemm::ImplicitGemmKernel::kTensorCStrideIdx])
+        ReductionStrideIndex(tensor_c.stride()[ImplicitGemm::UnderlyingKernel::kTensorCStrideIdx])
       },
       {
         tensor_d.device_data(),
-        ReductionStrideIndex(tensor_d.stride()[ImplicitGemm::ImplicitGemmKernel::kTensorCStrideIdx])
+        ReductionStrideIndex(tensor_d.stride()[ImplicitGemm::UnderlyingKernel::kTensorCStrideIdx])
       },
       {
         tensor_c.device_data(),
-        ReductionStrideIndex(tensor_c.stride()[ImplicitGemm::ImplicitGemmKernel::kTensorCStrideIdx])
+        ReductionStrideIndex(tensor_c.stride()[ImplicitGemm::UnderlyingKernel::kTensorCStrideIdx])
       },
       {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}
     );

python/tvm/contrib/cutlass/gen_tensor_op.py

Lines changed: 1 addition & 1 deletion
@@ -317,7 +317,7 @@ def __init__(self, cuda_arch, cutlass_path, binary_prefix):
         self.cuda_arch = cuda_arch
         self.binary_prefix = binary_prefix
         self.cutlass = cutlass_path
-        self.cflags = "-I{cutlass}/include -I{cutlass}/tools/util/include -O3 -std=c++11".format(
+        self.cflags = "-I{cutlass}/include -I{cutlass}/tools/util/include -O3 -std=c++17".format(
             cutlass=cutlass_path
         )
         self.cflags += " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1"

python/tvm/relax/transform/transform.py

Lines changed: 13 additions & 7 deletions
@@ -300,7 +300,9 @@ def FuseOps(fuse_opt_level=-1) -> tvm.ir.transform.Pass:
     return _ffi_api.FuseOps(fuse_opt_level)  # type: ignore


-def FuseOpsByPattern(patterns: List[Tuple]) -> tvm.ir.transform.Pass:
+def FuseOpsByPattern(
+    patterns: List[Tuple], annotate_codegen: bool = False
+) -> tvm.ir.transform.Pass:
     """Apply pattern matching to each function in the given module, and group matched expressions
     into a new function.

@@ -314,26 +316,30 @@ def FuseOpsByPattern(patterns: List[Tuple]) -> tvm.ir.transform.Pass:
         The string is the name of the corresponding pattern. It becomes the value of the kComposite
         attribute of a fused function after a successful matching.

+    annotate_codegen : bool
+        If True, wrap each created composite function with another function, whose body consists
+        only of a call to the composite function, and annotate the outer function with "Codegen"
+        and "global_symbol" attributes. The "Codegen" attribute is set as the prefix of the
+        corresponding pattern name. For example, "dnnl" if the pattern name is "dnnl.conv2d_relu".
+
+        This must be True if the created composite functions are intended to be offloaded to
+        an external backend without using the MergeCompositeFunctions pass.
+
     Returns
     -------
     ret : tvm.transform.Pass
         The registered pass for pattern-based fusion.

     """
     pattern_names, df_patterns = zip(*patterns)
-    return _ffi_api.FuseOpsByPattern(pattern_names, df_patterns)  # type: ignore
+    return _ffi_api.FuseOpsByPattern(pattern_names, df_patterns, annotate_codegen)  # type: ignore


 def MergeCompositeFunctions() -> tvm.ir.transform.Pass:
     """Group one or multiple composite functions created by FuseOpsByPattern into a new function.
     The new function will be annotated with "Codegen" and "global_symbol" attributes, and it
     is intented to be offloaded to an external backend.

-    Even if there is only one composite function, or a backend does not benefit from receiving
-    larger subgraphs, this pass is required to run for offloading (BYOC) since a composite function
-    needs to be wrapped by an outer function that are annotated with "Codegen" and "global_symbol"
-    attributes.
-
     Returns
     -------
     ret : tvm.transform.Pass
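With this change there are two routes to producing offloadable functions, per the updated docstrings; a brief contrast, assuming mod and patterns as in the earlier sketches:

from tvm import relax

# Route 1: one step; wrap and annotate during fusion.
mod1 = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)

# Route 2: two steps; fuse first, then group and annotate. Useful when a
# backend benefits from receiving larger, merged subgraphs.
mod2 = relax.transform.FuseOpsByPattern(patterns)(mod)
mod2 = relax.transform.MergeCompositeFunctions()(mod2)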
