
Commit 0d4e26b

Add GeGLU support to trtllm-gen NVFP4 Fused MoE Kernel (#1525)
## πŸ“Œ Description

Added GeGLU support to the trtllm-gen NVFP4 Fused MoE kernels. Also added TopK routing.

## πŸš€ Pull Request Checklist

### βœ… Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

## πŸ§ͺ Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Collaborator

@azhurkevich
1 parent 669ff33 commit 0d4e26b

23 files changed (+497 / -229 lines)

benchmarks/README.md

Lines changed: 4 additions & 0 deletions

@@ -45,6 +45,9 @@ python3 flashinfer_benchmark.py --routine mm_fp4 --m 8192 --n 4096 --k 16384 --o
 # MOE FP4 Block Scale (DeepSeekV3 routing)
 python3 flashinfer_benchmark.py --routine trtllm_fp4_block_scale_moe --num_tokens 1024 --hidden_size 1024 --intermediate_size 1024 --num_experts 128 --top_k 8 --n_group 8 --topk_group 4 --routed_scaling_factor 2.5 --use_routing_bias --routing_method deepseek_v3 --use_shuffled_weight --verbose --generate_repro_command

+# MOE FP4 Block Scale (topk routing, GeGlu gated act)
+python3 flashinfer_benchmark.py --routine trtllm_fp4_block_scale_moe --num_tokens 1024 --hidden_size 1024 --intermediate_size 1024 --num_experts 128 --top_k 8 --routing_method topk --use_shuffled_weight --gated_act geglu --verbose --generate_repro_command
+
 # MOE FP8 Block Scale with DeepSeekV3 routing
 python3 flashinfer_benchmark.py --routine trtllm_fp8_block_scale_moe --num_tokens 1024 --hidden_size 1024 --intermediate_size 1024 --num_experts 128 --top_k 8 --n_group 8 --topk_group 4 --routed_scaling_factor 2.5 --use_routing_bias --routing_method deepseek_v3 --use_shuffled_weight --verbose --generate_repro_command

@@ -148,6 +151,7 @@ The output CSV will contain detailed metrics including:
 | `--tp_rank` | Tensor-parallel rank |
 | `--ep_size` | Expert-parallel world size |
 | `--ep_rank` | Expert-parallel rank |
+| `--gated_act` | Gated activation function: `swiglu` (default) or `geglu` |

 ### MOE Routing Method Compatibility

benchmarks/bench_trtllm_gen_fused_moe_autotuner.py

Lines changed: 4 additions & 1 deletion

@@ -3,6 +3,8 @@
 import torch
 import numpy as np
 from flashinfer import (
+    RoutingMethodType,
+    GatedActType,
     fp4_quantize,
     mxfp8_quantize,
     next_positive_power_of_2,

@@ -156,9 +158,10 @@ def bench_trtllm_gen_fused_moe_autotuner(
         num_experts,
         None,  # routed_scaling_factor
         tile_tokens_dim,
-        1,
+        RoutingMethodType.Renormalize.value[0],
         True,
         enable_pdl,
+        GatedActType.SwiGlu.value,  # gated_act_type
         None,
         num_tokens if tune_max_num_tokens is None else tune_max_num_tokens,
     )

benchmarks/routines/flashinfer_benchmark_utils.py

Lines changed: 1 addition & 0 deletions

@@ -59,6 +59,7 @@
     "use_routing_scales_on_input",
     "input_dtype",
     "weight_dtype",
+    "gated_act",
     # CUTLASS fused MoE specific
     "cutlass_variant",
     "quantized_input",

benchmarks/routines/moe.py

Lines changed: 25 additions & 5 deletions

@@ -132,9 +132,10 @@ def parse_moe_args(line, parser):
             "deepseek_v3",
             "llama4",
             "renormalize_naive",
+            "topk",
         ],
         help=(
-            "Routing method: renormalize | deepseek_v3 | llama4 | renormalize_naive."
+            "Routing method: renormalize | deepseek_v3 | llama4 | renormalize_naive | topk."
         ),
     )
     parser.add_argument(

@@ -177,6 +178,14 @@ def parse_moe_args(line, parser):
         default="bfloat16",
         help="Data type of the weights (before quantization).",
     )
+    parser.add_argument(
+        "--gated_act",
+        type=str,
+        required=False,
+        default="swiglu",
+        choices=["swiglu", "geglu"],
+        help="Type of gated activation function: swiglu | geglu.",
+    )

     # CUTLASS fused MoE specific
     parser.add_argument(

@@ -225,13 +234,22 @@ def parse_moe_args(line, parser):
     args = parser.parse_args(line)

     # Normalize routing method (map string to internal int expected by kernels)
-    name_to_type = {
+    routing_method_name_to_type = {
         "renormalize": 1,
         "deepseek_v3": 2,
         "llama4": 3,
         "renormalize_naive": 4,
+        "topk": 5,
     }
-    args.routing_method_type = name_to_type[args.routing_method]
+    args.routing_method_type = routing_method_name_to_type[args.routing_method]
+
+    # Normalize gated act type (map string to internal int expected by kernels)
+    gated_act_name_to_type = {
+        "swiglu": 0,
+        "geglu": 1,
+    }
+    args.gated_act_type = gated_act_name_to_type[args.gated_act]
+
     if args.verbose >= 1:
         print(f"[INFO] {args = }")
     return args

@@ -451,8 +469,7 @@ def get_effective_bytes(dtype: torch.dtype, fmt: Optional[str]) -> float:
     if active_experts is not None:
         num_active_experts = active_experts
     else:
-        # CUTLASS MoE does not support active_experts, so we return -1
-        return -1
+        num_active_experts = min(num_experts, top_k * num_tokens)
     weight_bytes = num_active_experts * weight_bytes_per_expert

     # Output memory (typically full precision)

@@ -539,6 +556,7 @@ def testTrtllmFp4BlockScaleMoe(args):
     use_shuffled_weight = args.use_shuffled_weight
     weight_layout = args.weight_layout
     is_cuda_graph_compatible = not args.no_cuda_graph
+    gated_act_type = args.gated_act_type

     if args.verbose >= 1:
         print(

@@ -669,6 +687,7 @@ def run_fp4_moe():
             routed_scaling_factor=routed_scaling_factor,
             tile_tokens_dim=tile_tokens_dim,
             routing_method_type=routing_method_type,
+            gated_act_type=gated_act_type,
             do_finalize=True,
         )

@@ -745,6 +764,7 @@ def run_fp4_moe():
     cur_res["use_routing_scales_on_input"] = args.use_routing_scales_on_input
     cur_res["input_dtype"] = input_dtype
     cur_res["weight_dtype"] = weight_dtype
+    cur_res["gated_act"] = args.gated_act
     res.append(cur_res)

     return res
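For readers skimming the benchmark changes, the new plumbing reduces to two string-to-integer lookup tables plus a simple active-expert estimate. A minimal standalone sketch of that logic (names and layout are illustrative, not the benchmark's actual structure):

```python
# Sketch of the benchmark-side normalization added above (illustrative only).
ROUTING_METHOD_NAME_TO_TYPE = {
    "renormalize": 1,
    "deepseek_v3": 2,
    "llama4": 3,
    "renormalize_naive": 4,
    "topk": 5,  # new in this change
}
GATED_ACT_NAME_TO_TYPE = {"swiglu": 0, "geglu": 1}


def estimate_active_experts(num_experts: int, top_k: int, num_tokens: int) -> int:
    # Each token activates top_k experts, so at most top_k * num_tokens distinct
    # experts receive work, capped by the total expert count.
    return min(num_experts, top_k * num_tokens)


assert ROUTING_METHOD_NAME_TO_TYPE["topk"] == 5
assert GATED_ACT_NAME_TO_TYPE["geglu"] == 1
assert estimate_active_experts(num_experts=128, top_k=8, num_tokens=4) == 32
```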

csrc/trtllm_batched_gemm_runner.cu

Lines changed: 6 additions & 4 deletions

@@ -103,8 +103,10 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(
         tileSize == mOptions.tileSize &&
         options.mUseShuffledMatrixA == mOptions.useShuffledMatrixA &&
         options.mLayoutA == mOptions.weightLayout) {
-      // FIXME: Disable split-k for now.
-      if (options.mClusterDimZ != 1) {
+      // FIXME: Disable split-k for swiglu for now.
+      if (static_cast<batchedGemm::gemmGatedAct::ActType>(mOptions.actType) ==
+              batchedGemm::gemmGatedAct::ActType::SwiGlu &&
+          options.mClusterDimZ != 1) {
        continue;
      }

@@ -213,8 +215,8 @@ void TrtllmGenBatchedGemmRunner::run(
  gemmData.mInputBuffers.mPtrPerTokenSfB =
      mOptions.transposeMmaOutput ? perTokensSfA : perTokensSfB;
  gemmData.mInputBuffers.mPtrBias = ptrBias;
-  gemmData.mInputBuffers.mPtrSwiGluAlpha = ptrAlpha;
-  gemmData.mInputBuffers.mPtrSwiGluBeta = ptrBeta;
+  gemmData.mInputBuffers.mPtrGatedActAlpha = ptrAlpha;
+  gemmData.mInputBuffers.mPtrGatedActBeta = ptrBeta;
  gemmData.mInputBuffers.mPtrClampLimit = ptrClampLimit;

  gemmData.mInputBuffers.mPtrRouteMap = routeMap;

csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 13 additions & 9 deletions

@@ -39,6 +39,7 @@
 namespace flashinfer {

 namespace btg = batchedGemm::trtllm::gen;
+using tensorrt_llm::kernels::trtllmgen_moe::MoE::GatedActType;
 using tensorrt_llm::kernels::trtllmgen_moe::Routing::RoutingMethodType;

 at::Tensor trtllm_fp8_per_tensor_scale_moe_launcher(

@@ -732,10 +733,11 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe_launcher(
   } else if (static_cast<RoutingMethodType>(routing_method_type) ==
                  RoutingMethodType::Renormalize ||
              static_cast<RoutingMethodType>(routing_method_type) ==
-                 RoutingMethodType::RenormalizeNaive) {
+                 RoutingMethodType::RenormalizeNaive ||
+             static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::TopK) {
     TORCH_CHECK(
         top_k <= 8 && top_k > 0,
-        "Current routing kernel (no groups, renormalize) only supports top_k<=8 && top_k>0.");
+        "Current routing kernel (no groups, renormalize/topk) only supports top_k<=8 && top_k>0.");
   } else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Llama4) {
     TORCH_CHECK(top_k == 1, "Current routing kernel (no groups, Llama4) only supports top_k=1.");
   }

@@ -1058,8 +1060,8 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe(
     std::optional<int64_t> n_group, std::optional<int64_t> topk_group, int64_t intermediate_size,
     int64_t local_expert_offset, int64_t local_num_experts,
     std::optional<double> routed_scaling_factor, int64_t tile_tokens_dim,
-    int64_t routing_method_type, bool do_finalize, bool enable_pdl, at::Tensor& output,
-    int64_t config_index) {
+    int64_t routing_method_type, bool do_finalize, bool enable_pdl, int64_t gated_act_type,
+    at::Tensor& output, int64_t config_index) {
   using RunnerType = tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner;

   int const num_tokens = hidden_states.sizes()[0];

@@ -1110,7 +1112,7 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe(
   // Properly initialize the runner using make_unique like in the original code
   auto mRunner = std::make_unique<RunnerType>(
       mDtypeAct, mDtypeWeights, mUseDeepSeekFp8, (int32_t)tile_tokens_dim,
-      tensorrt_llm::kernels::ActType::SwiGlu, /*useShuffledMatrixA*/ true);
+      static_cast<GatedActType>(gated_act_type), /*useShuffledMatrixA*/ true);

   if (config_index == -1) {
     config_index = mRunner->getDefaultValidConfigIndex(top_k, hidden_size, intermediate_size,

@@ -1131,25 +1133,27 @@ int64_t trtllm_get_default_moe_configs(int64_t const tile_tokens_dim, int64_t co
                                        int64_t const dtype_weights_, bool const useDeepSeekFp8,
                                        int64_t const top_k, int64_t const hidden_size,
                                        int64_t const intermediate_size,
-                                       int64_t const num_local_experts, int64_t const num_tokens) {
+                                       int64_t const num_local_experts,
+                                       int64_t const gated_act_type, int64_t const num_tokens) {
   auto dtype_act = static_cast<btg::Dtype>(dtype_act_);
   auto dtype_weights = static_cast<btg::Dtype>(dtype_weights_);
   tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner moe_runner(
       dtype_act, dtype_weights, useDeepSeekFp8, (int32_t)tile_tokens_dim,
-      tensorrt_llm::kernels::ActType::SwiGlu, /*useShuffledMatrixA*/ true);
+      static_cast<GatedActType>(gated_act_type), /*useShuffledMatrixA*/ true);
   return moe_runner.getDefaultValidConfigIndex(top_k, hidden_size, intermediate_size,
                                                num_local_experts, num_tokens);
 }

 std::vector<int64_t> trtllm_get_valid_moe_configs(
     int64_t const tile_tokens_dim, int64_t const dtype_act_, int64_t const dtype_weights_,
     bool const useDeepSeekFp8, int64_t const top_k, int64_t const hidden_size,
-    int64_t const intermediate_size, int64_t const num_local_experts, int64_t const num_tokens) {
+    int64_t const intermediate_size, int64_t const num_local_experts, int64_t const gated_act_type,
+    int64_t const num_tokens) {
   auto dtype_act = static_cast<btg::Dtype>(dtype_act_);
   auto dtype_weights = static_cast<btg::Dtype>(dtype_weights_);
   tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner moe_runner(
       dtype_act, dtype_weights, useDeepSeekFp8, (int32_t)tile_tokens_dim,
-      tensorrt_llm::kernels::ActType::SwiGlu, /*useShuffledMatrixA*/ true);
+      static_cast<GatedActType>(gated_act_type), /*useShuffledMatrixA*/ true);
   return moe_runner.getValidConfigIndices(top_k, hidden_size, intermediate_size, num_local_experts,
                                           num_tokens);
 }
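The validation change above is easy to miss: the group-free top_k check now also covers the new TopK method. A rough Python restatement of the branches visible in this hunk (illustrative only; the real checks are the TORCH_CHECK calls in the C++ launcher, and the DeepSeekV3 branch is not shown here):

```python
def check_top_k(routing_method: str, top_k: int) -> None:
    # Mirrors the launcher's validation for the group-free routing kernels.
    if routing_method in ("renormalize", "renormalize_naive", "topk"):
        assert 0 < top_k <= 8, "renormalize/topk routing supports only 0 < top_k <= 8"
    elif routing_method == "llama4":
        assert top_k == 1, "Llama4 routing supports only top_k == 1"


check_top_k("topk", 8)    # passes
check_top_k("llama4", 1)  # passes
```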

csrc/trtllm_fused_moe_routing_renormalize.cu

Lines changed: 13 additions & 7 deletions

@@ -32,7 +32,8 @@ __forceinline__ __device__ void routingTopKExperts(
     cg::thread_block_tile<WarpSize> const& warp, DataType (&score)[VecSize],
     int32_t (&idx)[VecSize], DataType (&warpTopKScore)[MaxNumTopExperts],
     int32_t (&warpTopKExpertIdx)[MaxNumTopExperts], int32_t const laneIdx, int32_t const numExperts,
-    int32_t topK, InputType const* ptrScores, bool const normTopkProb) {
+    int32_t topK, InputType const* ptrScores, bool const normTopkProb,
+    bool const applySoftmaxAfterTopK) {
   DataType minScore = DataType{-INFINITY};

   for (int i = 0; i < VecSize; i++) {

@@ -59,11 +60,14 @@
       warpTopKScore[laneIdx] = warpTopKScore[laneIdx] / sum;
     }
   } else {
-    auto softmaxScore =
-        calcSoftmax(warp, laneIdx < topK ? warpTopKScore[laneIdx] : minScore, laneIdx, topK);
-    if (laneIdx < topK) {
-      warpTopKScore[laneIdx] = softmaxScore;
+    if (applySoftmaxAfterTopK) {
+      auto softmaxScore =
+          calcSoftmax(warp, laneIdx < topK ? warpTopKScore[laneIdx] : minScore, laneIdx, topK);
+      if (laneIdx < topK) {
+        warpTopKScore[laneIdx] = softmaxScore;
+      }
     }
+    // If applySoftmaxAfterTopK is false, we keep the raw TopK values without softmax
   }
 }

@@ -113,7 +117,8 @@ __global__ void __cluster_dims__(NumBlocksPerCluster, 1, 1) __launch_bounds__(Nu
     if (validToken) {
       routingTopKExperts<BaseType, InputT, VecSize, KernelParams::DoSoftmaxBeforeTopK>(
           warp, score, idx, warpTopKScore, warpTopKExpertIdx, laneIdx, params.mNumExperts,
-          params.mTopK, params.mPtrScores + scoreOffset, params.mNormTopkProb);
+          params.mTopK, params.mPtrScores + scoreOffset, params.mNormTopkProb,
+          params.mApplySoftmaxAfterTopK);

       if (laneIdx < params.mTopK) {
         smemPackedScoreIdx[warpIdx * params.mTopK + laneIdx] =

@@ -205,7 +210,8 @@ __global__ void __launch_bounds__(NumThreadsHist)

     routingTopKExperts<BaseType, InputT, VecSize, KernelParams::DoSoftmaxBeforeTopK>(
         warp, allScores, allExpertIdx, warpTopKScore, warpTopKExpertIdx, laneIdx,
-        params.mNumExperts, params.mTopK, params.mPtrScores + scoreOffset, params.mNormTopkProb);
+        params.mNumExperts, params.mTopK, params.mPtrScores + scoreOffset, params.mNormTopkProb,
+        params.mApplySoftmaxAfterTopK);

     if (laneIdx < params.mTopK) {
       PackedScoreIdx<OutputT> packedScore{static_cast<OutputT>(warpTopKScore[laneIdx]),
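Taken together with the runner change below, the kernel now distinguishes three behaviors after the warp-level top-k: RenormalizeNaive normalizes softmax probabilities over the selected experts, Renormalize applies a softmax over just the selected scores, and the new TopK mode keeps the raw scores. A host-side PyTorch reference of those semantics as they read from this diff (a sketch, not the warp-level kernel):

```python
import torch


def reference_routing(logits: torch.Tensor, top_k: int, method: str):
    """Unfused reference of the renormalize-family routing variants.

    logits: [num_tokens, num_experts] router scores.
    """
    if method == "renormalize_naive":  # DoSoftmaxBeforeTopK + NormTopkProb
        probs = torch.softmax(logits, dim=-1)
        vals, idx = torch.topk(probs, top_k, dim=-1)
        vals = vals / vals.sum(dim=-1, keepdim=True)  # renormalize selected probs
    elif method == "renormalize":      # ApplySoftmaxAfterTopK
        vals, idx = torch.topk(logits, top_k, dim=-1)
        vals = torch.softmax(vals, dim=-1)            # softmax over the k winners only
    elif method == "topk":             # new: raw top-k scores, no softmax
        vals, idx = torch.topk(logits, top_k, dim=-1)
    else:
        raise ValueError(f"unsupported method: {method}")
    return vals, idx


weights, experts = reference_routing(torch.randn(4, 128), top_k=8, method="topk")
```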

csrc/trtllm_fused_moe_runner.cu

Lines changed: 35 additions & 25 deletions

@@ -122,8 +122,9 @@ void Runner::run(void* routingLogits, void* routingBias, int32_t numTokens, int3
     routingData.mLocalExpertsStrideLog2 = 0;
     routingData.mNumLocalExperts = localNumExperts;
     moe::dev::routing::routingLlama4::run(routingData, stream);
-  } else if (routingMethodType == RoutingMethodType::Renormalize /* default */
-             || routingMethodType == RoutingMethodType::RenormalizeNaive /* Softmax -> TopK */) {
+  } else if (routingMethodType == RoutingMethodType::Renormalize /* default */
+             || routingMethodType == RoutingMethodType::RenormalizeNaive /* Softmax -> TopK */
+             || routingMethodType == RoutingMethodType::TopK /* TopK only (no softmax) */) {
     moe::dev::routing::routingRenormalize::Data routingData;

     //

@@ -135,6 +136,7 @@ void Runner::run(void* routingLogits, void* routingBias, int32_t numTokens, int3
     routingData.mUsePdl = true;
     routingData.mDoSoftmaxBeforeTopK = routingMethodType == RoutingMethodType::RenormalizeNaive;
     routingData.mNormTopkProb = routingMethodType == RoutingMethodType::RenormalizeNaive;
+    routingData.mApplySoftmaxAfterTopK = routingMethodType == RoutingMethodType::Renormalize;

     routingData.mPtrScores = routingLogits;

@@ -178,33 +180,41 @@ namespace PermuteGemm1 {

 tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions(
     btg::Dtype dtypeAct, btg::Dtype dtypeWeights, int32_t tileTokensDim, bool useDeepSeekFp8,
-    ActType actType, bool useShuffledMatrixA, batchedGemm::gemm::MatrixLayout weightLayout) {
-  tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions options = {
-      // Swap A and B dtypes because transposeMmaOutput is hardcoded to true
-      .dtypeA = dtypeWeights,
-      .dtypeB = dtypeAct,
-      .dtypeC = dtypeAct,
-      .actType = actType,
-      .deepSeekFp8 = useDeepSeekFp8,
-      .fusedAct = !useDeepSeekFp8,
-      .routeAct = true,
-      .staticBatch = false,
-      .transposeMmaOutput = true,
-      .tileSize = tileTokensDim,
-      .epilogueTileM = useDeepSeekFp8 ? 64 : 128,
-      .useShuffledMatrixA = useShuffledMatrixA,
-      .weightLayout = weightLayout};
-  return options;
+    MoE::GatedActType gatedActType, bool useShuffledMatrixA,
+    batchedGemm::gemm::MatrixLayout weightLayout) {
+  if (gatedActType == MoE::GatedActType::SwiGlu || gatedActType == MoE::GatedActType::GeGlu) {
+    ActType actType =
+        (gatedActType == MoE::GatedActType::SwiGlu) ? ActType::SwiGlu : ActType::GeGlu;
+    tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions options = {
+        // Swap A and B dtypes because transposeMmaOutput is hardcoded to true
+        .dtypeA = dtypeWeights,
+        .dtypeB = dtypeAct,
+        .dtypeC = dtypeAct,
+        .actType = actType,
+        .deepSeekFp8 = useDeepSeekFp8,
+        .fusedAct = !useDeepSeekFp8,
+        .routeAct = true,
+        .staticBatch = false,
+        .transposeMmaOutput = true,
+        .tileSize = tileTokensDim,
+        .epilogueTileM = useDeepSeekFp8 ? 64 : 128,
+        .useShuffledMatrixA = useShuffledMatrixA,
+        .weightLayout = weightLayout};
+    return options;
+  } else {
+    TORCH_CHECK(false, "Unimplemented gated act type %s of enum %d",
+                MoE::serializeGatedActType(gatedActType).c_str(), (int)gatedActType);
+  }
 }

 Runner::Runner(btg::Dtype dtypeAct, btg::Dtype dtypeWeights, bool useDeepSeekFp8, int tileTokensDim,
-               ActType actType, bool useShuffledMatrixA,
+               MoE::GatedActType gatedActType, bool useShuffledMatrixA,
                batchedGemm::gemm::MatrixLayout weightLayout)
     : mDtypeAct(dtypeAct),
       mDtypeWeights(dtypeWeights),
       mTileTokensDim(tileTokensDim),
       mRunner(tensorrt_llm::kernels::TrtllmGenBatchedGemmRunner(
-          getOptions(mDtypeAct, mDtypeWeights, mTileTokensDim, useDeepSeekFp8, actType,
+          getOptions(mDtypeAct, mDtypeWeights, mTileTokensDim, useDeepSeekFp8, gatedActType,
                      useShuffledMatrixA, weightLayout))) {}

 void Runner::run(void* hiddenState, void* hiddenStateScale, void* weights, void* weightsScale,

@@ -352,10 +362,10 @@ std::vector<int64_t> Runner::getPassingConfigIndices() const {

 namespace MoE {
 Runner::Runner(btg::Dtype dtypeAct, btg::Dtype dtypeWeights, bool useDeepSeekFp8,
-               int32_t tileTokensDim, ActType actType, bool useShuffledMatrixA,
+               int32_t tileTokensDim, GatedActType gatedActType, bool useShuffledMatrixA,
                batchedGemm::gemm::MatrixLayout weightLayout)
     : mPermuteGemm1(PermuteGemm1::Runner(dtypeAct, dtypeWeights, useDeepSeekFp8, tileTokensDim,
-                                         actType, useShuffledMatrixA, weightLayout)),
+                                         gatedActType, useShuffledMatrixA, weightLayout)),
       mGemm2(Gemm2::Runner(dtypeAct, dtypeWeights, btg::Dtype::Bfloat16, useDeepSeekFp8,
                            tileTokensDim, useShuffledMatrixA, weightLayout)) {
   auto const& gemm1PassingIndices = mPermuteGemm1.getPassingConfigIndices();

@@ -375,8 +385,8 @@ Runner::Runner(btg::Dtype dtypeAct, btg::Dtype dtypeWeights, bool useDeepSeekFp8

 Runner::Runner(btg::Dtype dtypeElt, bool useDeepSeekFp8, int32_t tileTokensDim,
                bool useShuffledMatrixA, batchedGemm::gemm::MatrixLayout weightLayout)
-    : Runner(dtypeElt, dtypeElt, useDeepSeekFp8, tileTokensDim, ActType::SwiGlu, useShuffledMatrixA,
-             weightLayout) {}
+    : Runner(dtypeElt, dtypeElt, useDeepSeekFp8, tileTokensDim, GatedActType::SwiGlu,
+             useShuffledMatrixA, weightLayout) {}

 void Runner::setOpsData(MoERunnerArgs const& args, MoEWorkspace const& workspace,
                         moe::dev::convertsf::Data& convertSfData,
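getOptions now translates MoE::GatedActType into the batched-GEMM ActType, so GeGlu flows through the same fused epilogue path as SwiGlu. For reference, the standard (unfused) definitions of the two gated activations are sketched below; which half of the GEMM output acts as the gate, and the alpha/beta/clamp parameters threaded through the gated-act buffers, are kernel details assumed away here:

```python
import torch
import torch.nn.functional as F


def gated_act(x2: torch.Tensor, act: str = "swiglu") -> torch.Tensor:
    """Reference gated activation: the first expert GEMM produces 2 * intermediate_size
    channels per token, split into a gate half and a linear half (ordering assumed)."""
    gate, linear = x2.chunk(2, dim=-1)
    if act == "swiglu":
        return F.silu(gate) * linear  # SwiGLU: SiLU(gate) * linear
    if act == "geglu":
        return F.gelu(gate) * linear  # GeGLU: GELU(gate) * linear
    raise ValueError(f"unsupported gated act: {act}")


y = gated_act(torch.randn(16, 2 * 1024), act="geglu")  # -> [16, 1024]
```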

flashinfer/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -64,6 +64,7 @@
 from .fp8_quantization import mxfp8_dequantize_host, mxfp8_quantize
 from .fused_moe import (
     RoutingMethodType,
+    GatedActType,
     cutlass_fused_moe,
     reorder_rows_for_gated_act_gemm,
     trtllm_fp4_block_scale_moe,
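With GatedActType re-exported at the package root, callers can pick the activation alongside the routing method. A minimal usage sketch, assuming the Python enum mirrors the C++ GatedActType members (SwiGlu, GeGlu) and eliding the tensor arguments of trtllm_fp4_block_scale_moe:

```python
from flashinfer import GatedActType, trtllm_fp4_block_scale_moe

moe_kwargs = dict(
    routing_method_type=5,                    # "topk" in the benchmark mapping above
    gated_act_type=GatedActType.GeGlu.value,  # 1 == geglu, 0 == swiglu (member name assumed)
    do_finalize=True,
)
# output = trtllm_fp4_block_scale_moe(<hidden_states, weights, scales, ...>, **moe_kwargs)
```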
