
Commit 257178c

ArinaJJH and zhzhcookie authored

[BACKEND] Add passes to metax (#47)

Co-authored-by: zhengyang <zhengyang@baai.ac.cn>
1 parent c03aac4 commit 257178c

File tree

3 files changed: +75, -14 lines

  third_party/metax/backend/compiler.py
  third_party/metax/backend/driver.c
  third_party/metax/backend/driver.py


third_party/metax/backend/compiler.py

Lines changed: 34 additions & 6 deletions
@@ -212,6 +212,7 @@ def make_ttgir(mod, metadata, opt, capability):
 
     if opt.pipeline == "cpasync":
         disable_prefetch = True
+    metax.passes.ttgpuir.add_tritonmetaxgpu_change_layout_for_int8_pass(pm, opt.num_stages, opt.pipeline)
     metax.passes.ttgpuir.add_accelerate_matmul(pm, opt.num_stages, disable_prefetch, store_coalesce, "c500")
     passes.ttgpuir.add_remove_layout_conversions(pm)
     if store_coalesce:
@@ -236,8 +237,11 @@ def make_ttgir(mod, metadata, opt, capability):
             metax.passes.ttgpuir.add_pipeline_async_tt(pm, opt.num_stages)
             metax.passes.ttgpuir.add_pipeline_async_base(pm, opt.num_stages, fullstage)
         elif mla and opt.num_stages == 2 and opt.pipeline == "cpasync":
-            metax.passes.ttgpuir.add_pipeline_async_multidot_mla(pm, opt.num_stages, fullstage,
-                                                                 opt.pipeline_load_num)
+            metax.passes.ttgpuir.add_pipeline_async_multidot_mla_mixed(pm, opt.num_stages, fullstage,
+                                                                       opt.pipeline_load_num, single_shm, True)
+        elif mla and opt.num_stages == 2 and opt.pipeline == "mixed":
+            metax.passes.ttgpuir.add_pipeline_async_multidot_mla_mixed(pm, opt.num_stages, fullstage,
+                                                                       opt.pipeline_load_num, single_shm, False)
         else:
             print("no avalilable pipeline for maca")
     else:
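Note on the hunk above: the dedicated cpasync MLA pass is replaced by a single add_pipeline_async_multidot_mla_mixed entry point that also serves the new "mixed" pipeline. A minimal sketch of the equivalent dispatch, written as a standalone function for readability; the wrapper name is illustrative, and mla, fullstage, and single_shm are assumed to be computed earlier in make_ttgir exactly as in the surrounding code:

def schedule_mla_pipeline(metax, pm, opt, mla, fullstage, single_shm):
    # Illustrative wrapper only; the commit inlines this in make_ttgir.
    # The trailing boolean appears to distinguish cpasync (True) from "mixed" (False).
    if mla and opt.num_stages == 2 and opt.pipeline in ("cpasync", "mixed"):
        metax.passes.ttgpuir.add_pipeline_async_multidot_mla_mixed(
            pm, opt.num_stages, fullstage, opt.pipeline_load_num,
            single_shm, opt.pipeline == "cpasync")
    else:
        print("no available pipeline for maca")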
@@ -252,7 +256,7 @@ def make_ttgir(mod, metadata, opt, capability):
         passes.ttgpuir.add_reorder_instructions(pm)
     if os.getenv("TRITON_ENABLE_MACA_OPT_MOVE_DOT_OPERANDS_OUT_LOOP"):
         metax.passes.ttgpuir.add_tritonmetaxgpu_move_dot_operands_out_loop_pass(pm)
-    if os.getenv("TRITON_ENABLE_MACA_MERGE_EQUAL_SHARED_LAYOUT"):
+    if not os.getenv("TRITON_DISABLE_MACA_MERGE_EQUAL_SHARED_LAYOUT"):
         metax.passes.ttgpuir.add_tritonmetaxgpu_merge_equal_shared_layout_pass(pm)
     passes.common.add_cse(pm)
     passes.common.add_symbol_dce(pm)
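This hunk flips the merge-equal-shared-layout pass from opt-in to opt-out: it now runs unless TRITON_DISABLE_MACA_MERGE_EQUAL_SHARED_LAYOUT is set. A hedged usage sketch; any non-empty value disables the pass, matching the os.getenv() truthiness check above:

import os

# Assumed usage: export before compiling a Triton kernel to skip the pass.
# Leaving the variable unset (the default) keeps the pass enabled.
os.environ["TRITON_DISABLE_MACA_MERGE_EQUAL_SHARED_LAYOUT"] = "1"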
@@ -322,14 +326,38 @@ def make_mcfatbin(src, metadata, opt, capability):
         if "roll" not in scenarios:
             compile_options += " -mllvm -metaxgpu-mma-unroll-count=" + str(opt.num_stages) + " "
     elif opt.pipeline == "cpasync" and "mla" not in scenarios:
-        compile_options = " -mllvm -metaxgpu-sched-regpressure=true -mllvm -metaxgpu-sinkload=false -mllvm -metaxgpu-vectorize-slp=true \
-            -mllvm -metaxgpu-igroup -mllvm -metaxgpu-aggressive-4g-addr-opt=true -mllvm -metaxgpu-shl-add-combine=false \
-            -mllvm -misched-postra=true -mllvm -enable-post-misched=true "
+        compile_options = " -mllvm -metaxgpu-sched-regpressure=true "
+        compile_options += " -mllvm -metaxgpu-sinkload=false -mllvm -metaxgpu-vectorize-slp=true -mllvm -metaxgpu-igroup -mllvm -metaxgpu-aggressive-4g-addr-opt=true \
+            -mllvm -metaxgpu-shl-add-combine=false -mllvm -misched-postra=true -mllvm -enable-post-misched=true "
 
         if os.getenv("TRITON_ENABLE_MACA_COMPILER_INT8_OPT"):
             compile_options += " -mllvm -metaxgpu-slp-vectorize-i8=true"
+
         if "unroll" in scenarios:
             compile_options += " -mllvm -metaxgpu-mma-unroll-count=" + str(opt.num_stages) + " "
+    if "flashattn-fwd" in scenarios:
+        compile_options = " -mllvm -metaxgpu-mma-sched=true -mllvm -metaxgpu-sched-select=metaxgpu-minreg -mllvm -map-use-pk-fma=1 "
+    elif "flashattn-bwd" in scenarios:
+        compile_options = " -mllvm -metaxgpu-sched-regpressure=true "
+        compile_options += " -mllvm -metaxgpu-sinkload=false -mllvm -metaxgpu-vectorize-slp=true "
+    if "mla" in scenarios:
+        # maybe will change the compile options in mla later
+        if opt.num_stages == 2:
+            if opt.pipeline == "cpasync":
+                compile_options = " -mllvm -metaxgpu-sched-regpressure=true "
+                compile_options += " -mllvm -metaxgpu-sinkload=false -mllvm -metaxgpu-vectorize-slp=true -mllvm -metaxgpu-igroup -mllvm -metaxgpu-aggressive-4g-addr-opt=true \
+                    -mllvm -metaxgpu-shl-add-combine=false -mllvm -misched-postra=true -mllvm -enable-post-misched=true "
+
+                if "unroll" in scenarios:
+                    compile_options += " -mllvm -metaxgpu-mma-unroll-count=" + str(opt.num_stages) + " "
+            elif opt.pipeline == "basic" or opt.pipeline == "mixed":
+                compile_options = " -mllvm -metaxgpu-mma-sched=true -mllvm -map-use-pk-fma=1 -mllvm -metaxgpu-split-regalloc=true -mllvm -metaxgpu-aggressive-fold=true \
+                    -mllvm -metaxgpu-disable-licm=true "
+
+            else:
+                assert False, "Please set pipeline for mla!"
+        else:
+            compile_options = " -mllvm -metaxgpu-mma-sched=true -mllvm -map-use-pk-fma=1 -mllvm -metaxgpu-split-regalloc=true -mllvm -metaxgpu-aggressive-fold=true "
     if opt.extra_options != "":
         compile_options = opt.extra_options
     return metax.translate_llvmir_to_mcfatbin(src, mxcc_arch, os.environ.get('MACA_PATH'), compile_options)
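The nested MLA branch in the make_mcfatbin hunk is easier to follow outside diff form. A condensed sketch, assuming opt and scenarios carry the same meaning as above; the helper name and the exact spacing of the option strings are illustrative, not part of the commit:

def _mla_compile_options(opt, scenarios):
    """Illustrative helper: the commit inlines this under `if "mla" in scenarios:`."""
    if opt.num_stages != 2:
        return (" -mllvm -metaxgpu-mma-sched=true -mllvm -map-use-pk-fma=1"
                " -mllvm -metaxgpu-split-regalloc=true -mllvm -metaxgpu-aggressive-fold=true ")
    if opt.pipeline == "cpasync":
        opts = (" -mllvm -metaxgpu-sched-regpressure=true"
                " -mllvm -metaxgpu-sinkload=false -mllvm -metaxgpu-vectorize-slp=true"
                " -mllvm -metaxgpu-igroup -mllvm -metaxgpu-aggressive-4g-addr-opt=true"
                " -mllvm -metaxgpu-shl-add-combine=false -mllvm -misched-postra=true"
                " -mllvm -enable-post-misched=true ")
        if "unroll" in scenarios:
            opts += " -mllvm -metaxgpu-mma-unroll-count=" + str(opt.num_stages) + " "
        return opts
    if opt.pipeline in ("basic", "mixed"):
        return (" -mllvm -metaxgpu-mma-sched=true -mllvm -map-use-pk-fma=1"
                " -mllvm -metaxgpu-split-regalloc=true -mllvm -metaxgpu-aggressive-fold=true"
                " -mllvm -metaxgpu-disable-licm=true ")
    # The commit uses `assert False` at this point.
    raise AssertionError("Please set pipeline for mla!")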

third_party/metax/backend/driver.c

Lines changed: 31 additions & 1 deletion
@@ -106,6 +106,7 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) {
   MCcontext pctx = 0;
 
   Py_BEGIN_ALLOW_THREADS;
+  // TODO: MCcontext implement not found
   MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxGetCurrent(&pctx));
   if (!pctx) {
     MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
@@ -121,7 +122,6 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) {
   MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
       mcFuncGetAttribute(&n_spills, MC_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
   n_spills /= 4;
-
   Py_END_ALLOW_THREADS;
 
   if (PyErr_Occurred()) {
@@ -141,6 +141,36 @@ static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
     return NULL;
   }
 
+  Py_BEGIN_ALLOW_THREADS;
+
+  // Ensure we have an active context.
+  // MCcontext ctx = NULL;
+  // TODO: CU_LIMIT_PRINTF_FIFO_SIZE implement not found
+  // MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxGetCurrent(&ctx));
+  // if (!ctx) {
+  //   MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+  //       mcDevicePrimaryCtxRetain(&ctx, /*device=*/0));
+  //   MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(mcCtxSetCurrent(ctx));
+  // }
+
+  // // We can't set the fifo size after running a kernel that calls printf.
+  // This
+  // // is true even if the set() call is a nop and the new size is the same as
+  // the
+  // // old size.
+  // //
+  // // This is unfriendly, so check if the old size matches the new size, and
+  // skip
+  // // the set() call if so.
+  // size_t oldSize = 0;
+  // MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+  //     mcCtxGetLimit(&oldSize, CU_LIMIT_PRINTF_FIFO_SIZE));
+  // if (oldSize != size) {
+  //   MACA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+  //       mcCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, size));
+  // }
+
+  Py_END_ALLOW_THREADS;
   return Py_None;
 }
 

third_party/metax/backend/driver.py

Lines changed: 10 additions & 7 deletions
@@ -143,6 +143,7 @@ def format_of(ty):
 #include <stdbool.h>
 #include <Python.h>
 #include <dlfcn.h>
+#include <stdlib.h> // MACA: for getenv
 
 static inline void gpuAssert(mcError_t code, const char *file, int line)
 {{
@@ -201,14 +202,16 @@ def format_of(ty):
     ptr_info.dev_ptr = (mcDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
     if(!ptr_info.dev_ptr)
       return ptr_info;
-    uint64_t dev_ptr;
-    int status = mcPointerGetAttribute(&dev_ptr, mcPointerAttributeDevicePointer, ptr_info.dev_ptr);
-    if (status == mcErrorInvalidValue) {{
-        PyErr_Format(PyExc_ValueError,
-                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
-        ptr_info.valid = false;
+    if (getenv("TRITON_DISABLE_DEVICE_POINTER_ATTR_CHECK") == NULL) {{
+      uint64_t dev_ptr;
+      int status = mcPointerGetAttribute(&dev_ptr, mcPointerAttributeDevicePointer, ptr_info.dev_ptr);
+      if (status == mcErrorInvalidValue) {{
+          PyErr_Format(PyExc_ValueError,
+                       "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
+          ptr_info.valid = false;
+      }}
+      ptr_info.dev_ptr = (mcDeviceptr_t)dev_ptr;
     }}
-    ptr_info.dev_ptr = (mcDeviceptr_t)dev_ptr;
     Py_DECREF(ret); // Thanks ChatGPT!
     return ptr_info;
 }}
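With this hunk, the generated launcher skips the mcPointerGetAttribute device-pointer check when TRITON_DISABLE_DEVICE_POINTER_ATTR_CHECK is set (read via getenv() in the generated C above). A hedged usage sketch from the Python side:

import os

# Assumed usage: set before JIT-compiling/launching a Triton kernel so the
# generated launcher's getenv() call sees it; leaving it unset (the default)
# keeps the "cpu tensor?" pointer validation.
os.environ["TRITON_DISABLE_DEVICE_POINTER_ATTR_CHECK"] = "1"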
