
Commit 808d832: Improve the accuracy of deepseekv3 (#744)
1 parent: 00e4de7

File tree: 11 files changed, +143 / -132 lines

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight.py

Lines changed: 3 additions & 1 deletion
@@ -167,7 +167,7 @@ def _load_hf_weights_etp(self, weights):
         expert_gate_up_proj_last = None
         expert_down_proj_last = None
         if self.e_score_correction_bias_name in weights:
-            self.e_score_correction_bias = self._cuda(self.e_score_correction_bias_name)
+            self.e_score_correction_bias = self._cuda(weights[self.e_score_correction_bias_name])

         for i_experts_ep in range(n_expert_ep):
             expert_up_proj = None
@@ -223,6 +223,8 @@ def load_hf_weights(self, weights):
         if os.environ.get("ETP_MODE_ENABLED") == "true":
             self._load_hf_weights_etp(weights)
         else:
+            if self.e_score_correction_bias_name in weights:
+                self.e_score_correction_bias = self._cuda(weights[self.e_score_correction_bias_name])
             for i_experts in range(self.n_routed_experts):
                 w1_weight = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.weight"
                 w2_weight = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.weight"

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight.py

Lines changed: 2 additions & 1 deletion
@@ -500,7 +500,8 @@ def dequant_weight(self, weight: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
         weight = weight.to(self.data_type_)
         block_size = weight.shape[-1] // scale.shape[-1]
         w_shape = weight.shape
-        scale = scale.unsqueeze(-1).repeat(1, 1, 1, block_size).reshape(w_shape[0], w_shape[1], -1)
+        s_shape = scale.shape
+        scale = scale.unsqueeze(-1).repeat(1, 1, 1, block_size).reshape(s_shape[0], s_shape[1], -1)
         scale = scale.unsqueeze(2).repeat(1, 1, block_size, 1).reshape(w_shape)
         return (weight * scale).to(self.data_type_)
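Note on the fix: the intermediate reshape must follow the scale's own leading dimensions (s_shape), not the weight's, otherwise the per-block scales land on the wrong rows. A minimal sketch of the intended expansion, assuming a 3-D weight of shape [E, N, K] and a block-wise scale of shape [E, N // block, K // block] (shapes here are illustrative, not taken from the repo):

    import torch

    E, N, K, block = 2, 8, 16, 4
    weight = torch.randn(E, N, K)                   # block-quantized weight, already cast to the compute dtype
    scale = torch.rand(E, N // block, K // block)   # one scale per (block x block) tile

    # Equivalent to the fixed repeat/reshape path: broadcast each tile scale
    # over its block x block region, then dequantize elementwise.
    full_scale = scale.repeat_interleave(block, dim=1).repeat_interleave(block, dim=2)
    dequant = weight * full_scale                   # shape [E, N, K]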

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 23 additions & 6 deletions
@@ -33,6 +33,7 @@
 from .moe_kernel_configs import MoeGroupedGemmKernelConfig
 from .moe_silu_and_mul import silu_and_mul_fwd
 from .moe_sum_reduce import moe_sum_reduce
+from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8

 FFN_MOE_CHUNK_SIZE = 8 * 1024

@@ -223,7 +224,7 @@ def grouped_matmul_kernel(
     n,  # int
     expert_num,  # int
     topk_num,  # int
-    token_scale_ptr,  # [1,]
+    token_scale_ptr,  # [1,] for per-tensor quant, or [token_num, hidden_dim // block_size] for per-token, per-group quant
     weight_scale_ptr,  # [expert_num,] or [expert_num, n // block_size_n, k // block_size_k]
     weight_scale_stride0,
     weight_scale_stride1,
@@ -306,7 +307,7 @@ def grouped_matmul_kernel(

     if use_fp8_w8a8:
         if block_size_k > 0 and block_size_n > 0:
-            a_scale = tl.load(token_scale_ptr, eviction_policy="evict_last")
+            a_scale_ptrs = token_scale_ptr + (a_m_index // topk_num) * (token_stride_0 // block_size_k)
             offs_bsn = offs_bn // block_size_n
             b_scale_ptrs = weight_scale_ptr + expert_id * weight_scale_stride0 + offs_bsn * weight_scale_stride1
         else:
@@ -342,8 +343,9 @@ def grouped_matmul_kernel(
         if use_fp8_w8a8:
             if block_size_k > 0 and block_size_n > 0:
                 offs_ks = step_k * BLOCK_SIZE_K // block_size_k
+                a_scale = tl.load(a_scale_ptrs + offs_ks, mask=offs_am < cur_m, other=0.0)
                 b_scale = tl.load(b_scale_ptrs + offs_ks * weight_scale_stride2)
-                accumulator += tl.dot(b, a) * a_scale * b_scale[:, None]
+                accumulator += tl.dot(b, a) * b_scale[:, None] * a_scale[None, :]
             else:
                 accumulator = tl.dot(b, a, acc=accumulator)
         else:
@@ -387,6 +389,7 @@ def grouped_matmul(
     expert_token_limit: int,
     mul_routed_weight: bool,
     use_fp8_w8a8: bool,
+    alloc_tensor_func=torch.empty,
     **run_config,
 ):
     """
@@ -417,7 +420,6 @@ def grouped_matmul(
     if expert_to_weights_scale.ndim == 3:
         block_size_n = expert_weights.shape[1] // expert_to_weights_scale.shape[1]
         block_size_k = expert_weights.shape[2] // expert_to_weights_scale.shape[2]
-
     if not run_config:
         run_config = MoeGroupedGemmKernelConfig.try_to_get_best_config(
             M=token_inputs.shape[0],
@@ -436,8 +438,22 @@ def grouped_matmul(
     num_warps = run_config["num_warps"]
     num_stages = run_config["num_stages"]

+    if block_size_k != 0:
+        # With block-wise quantization, the K tile size must not exceed the quantization block size.
+        BLOCK_SIZE_K = min(BLOCK_SIZE_K, block_size_k)
+        assert BLOCK_SIZE_K == triton.next_power_of_2(BLOCK_SIZE_K)
+
     if use_fp8_w8a8:
-        token_inputs, token_input_scale = ops.scaled_fp8_quant(token_inputs, token_input_scale)
+        # When the weights are block-wise quantized, the activations are also quantized per token with the same group size.
+        if block_size_k == 0:
+            token_inputs, token_input_scale = ops.scaled_fp8_quant(token_inputs, token_input_scale)
+        else:
+            _m, _k = token_inputs.shape
+            assert _k % block_size_k == 0
+            input_scale = alloc_tensor_func((_m, _k // block_size_k), dtype=torch.float32, device=token_inputs.device)
+            qinput_tensor = alloc_tensor_func((_m, _k), dtype=expert_weights.dtype, device=token_inputs.device)
+            per_token_group_quant_fp8(token_inputs, block_size_k, qinput_tensor, input_scale)
+            token_inputs, token_input_scale = qinput_tensor, input_scale

     kernel = grouped_matmul_kernel.warmup(
         expert_token_limit,
@@ -579,7 +595,6 @@ def fused_experts_impl(
     assert w1.is_contiguous(), "Expert weights1 must be contiguous"
     assert w2.is_contiguous(), "Expert weights2 must be contiguous"
     assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16]
-
     num_tokens, _ = hidden_states.shape
     E, N, _ = w1.shape
     CHUNK_SIZE = FFN_MOE_CHUNK_SIZE
@@ -632,6 +647,7 @@ def fused_experts_impl(
         expert_token_limit=2 ** 31 - 1,
         mul_routed_weight=False,
         use_fp8_w8a8=use_fp8_w8a8,
+        alloc_tensor_func=alloc_tensor_func,
         **run_config,
     )

@@ -650,6 +666,7 @@ def fused_experts_impl(
         expert_token_limit=2 ** 31 - 1,
         mul_routed_weight=True,
         use_fp8_w8a8=use_fp8_w8a8,
+        alloc_tensor_func=alloc_tensor_func,
         **run_config,
     )
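For context, the activation-side change switches from a single per-tensor FP8 scale to per-token, per-group scales whenever the weights are block-wise quantized. A rough pure-PyTorch sketch of the math that the per_token_group_quant_fp8 Triton kernel performs (the reference function below and its name are illustrative assumptions, not the repo's API):

    import torch

    def per_token_group_quant_fp8_ref(x: torch.Tensor, group_size: int, eps: float = 1e-10):
        # Quantize every contiguous group of `group_size` values in each row to FP8,
        # keeping one float32 scale per (token, group).
        m, k = x.shape
        assert k % group_size == 0
        fp8_max = torch.finfo(torch.float8_e4m3fn).max
        groups = x.view(m, k // group_size, group_size).float()
        scale = groups.abs().amax(dim=-1, keepdim=True).clamp(min=eps) / fp8_max
        q = (groups / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
        return q.view(m, k), scale.view(m, k // group_size)

Inside the kernel, a_scale is then loaded per token row and per K-group (a_scale_ptrs + offs_ks), which matches the [token_num, hidden_dim // block_size] layout noted in the updated comment.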

lightllm/common/fused_moe/grouped_topk.py

Lines changed: 38 additions & 13 deletions
@@ -6,17 +6,22 @@


 @triton.jit
-def _compare_and_swap(x, ids, flip, i: tl.core.constexpr, n_dims: tl.core.constexpr):
+def _compare_and_swap(x, x_1, ids, flip, i: tl.core.constexpr, n_dims: tl.core.constexpr):
     n_outer: tl.core.constexpr = x.numel >> n_dims
     shape: tl.core.constexpr = [n_outer * 2 ** i, 2, 2 ** (n_dims - i - 1)]
     y = tl.core.reshape(x, shape)
+    y_1 = tl.core.reshape(x_1, shape)
     # slice left/right with 'stride' 2**(n_dims - i - 1)
     mask = tl.core.arange(0, 2)[None, :, None]
     left = tl.core.broadcast_to(sum(y * (1 - mask), 1)[:, None, :], shape)
     right = tl.core.broadcast_to(sum(y * mask, 1)[:, None, :], shape)
     left = tl.core.reshape(left, x.shape)
     right = tl.core.reshape(right, x.shape)

+    left_1 = tl.core.broadcast_to(sum(y_1 * (1 - mask), 1)[:, None, :], shape)
+    right_1 = tl.core.broadcast_to(sum(y_1 * mask, 1)[:, None, :], shape)
+    left_1 = tl.core.reshape(left_1, x_1.shape)
+    right_1 = tl.core.reshape(right_1, x_1.shape)
     # idx
     y_idx = tl.core.reshape(ids, shape)
     left_idx = tl.core.broadcast_to(sum(y_idx * (1 - mask), 1)[:, None, :], shape)
@@ -36,11 +41,18 @@ def _compare_and_swap(x, ids, flip, i: tl.core.constexpr, n_dims: tl.core.constexpr):

     new_ids = ids ^ tl.core.where(cond, left_idx ^ right_idx, zeros_like(ids))

-    return ret.to(x.dtype, bitcast=True), new_ids
+    # swap x_1 using the same condition
+    idtype_1 = tl.core.get_int_dtype(bitwidth=x_1.dtype.primitive_bitwidth, signed=True)
+    ileft_1 = left_1.to(idtype_1, bitcast=True)
+    iright_1 = right_1.to(idtype_1, bitcast=True)
+    ix_1 = x_1.to(idtype, bitcast=True)
+    ret_1 = ix_1 ^ tl.core.where(cond, ileft_1 ^ iright_1, zeros_like(ix_1))
+
+    return ret.to(x.dtype, bitcast=True), ret_1.to(x_1.dtype, bitcast=True), new_ids


 @triton.jit
-def _bitonic_merge(x, ids, stage: tl.core.constexpr, order: tl.core.constexpr, n_dims: tl.core.constexpr):
+def _bitonic_merge(x, x_1, ids, stage: tl.core.constexpr, order: tl.core.constexpr, n_dims: tl.core.constexpr):
     """
     order_type 0 == ascending
     order_type 1 == descending
@@ -60,21 +72,21 @@ def _bitonic_merge(x, ids, stage: tl.core.constexpr, order: tl.core.constexpr, n_dims: tl.core.constexpr):
     flip = order
     # perform `stage` rounds of `compare-and-swap`
     for i in tl.core.static_range(stage):
-        x, ids = _compare_and_swap(x, ids, flip, i + (n_dims - stage), n_dims)
-    return x, ids
+        x, x_1, ids = _compare_and_swap(x, x_1, ids, flip, i + (n_dims - stage), n_dims)
+    return x, x_1, ids


 @triton.jit
-def argsort(x, ids, dim: tl.core.constexpr = None, descending: tl.core.constexpr = tl.core.CONSTEXPR_0):
+def argsort(x, x_1, ids, dim: tl.core.constexpr = None, descending: tl.core.constexpr = tl.core.CONSTEXPR_0):
     # handle default dimension or check that it is the most minor dim
     _dim: tl.core.constexpr = len(x.shape) - 1 if dim is None else dim
     tl.core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
     # iteratively run bitonic merge-sort steps
     n_dims: tl.core.constexpr = _log2(x.shape[_dim])

     for i in tl.core.static_range(1, n_dims + 1):
-        x, ids = _bitonic_merge(x, ids, i, 2 if i < n_dims else descending, n_dims)
-    return x, ids
+        x, x_1, ids = _bitonic_merge(x, x_1, ids, i, 2 if i < n_dims else descending, n_dims)
+    return x, x_1, ids


 @triton.jit
@@ -106,6 +118,7 @@ def grouped_topk_kernel(
     EXPERT_GROUP_NUM: tl.constexpr,  # tl.next_power_two_of(group_num)
     EXPERT_GROUP_SIZE: tl.constexpr,  # tl.next_power_two_of(group_expert_num)
     RENORMALIZE: tl.constexpr,
+    GROUP_SCORE_USED_TOPK_NUM: tl.constexpr,
 ):
     token_index = tl.program_id(axis=0)
     offs_n = tl.arange(0, EXPERT_BLOCK_SIZE)
@@ -115,12 +128,14 @@ def grouped_topk_kernel(
         other=-10000000.0,
     ).to(tl.float32)
     if IS_SIGMOID:
-        scores = tl.sigmoid(hidden_states)
+        old_scores = tl.sigmoid(hidden_states)
     else:
-        scores = tl.softmax(hidden_states)
+        old_scores = tl.softmax(hidden_states)

     if HAS_CORRECTION_BIAS:
-        scores += tl.load(correction_bias_ptr + offs_n, mask=offs_n < total_expert_num, other=-10000000.0)
+        scores = old_scores + tl.load(correction_bias_ptr + offs_n, mask=offs_n < total_expert_num, other=-10000000.0)
+    else:
+        scores = old_scores

     offs_group = tl.arange(0, EXPERT_GROUP_NUM)
     offs_group_v = tl.arange(0, EXPERT_GROUP_SIZE)
@@ -134,7 +149,15 @@ def grouped_topk_kernel(
         other=-10000000.0,
     )  # [group, group_size]

-    group_value = tl.max(group_scores, axis=1)  # [group,]
+    group_value = tl.sum(
+        tl.where(
+            (offs_group < group_num)[:, None] & (offs_group_v < GROUP_SCORE_USED_TOPK_NUM)[None, :],
+            tl.sort(group_scores, dim=1, descending=True),
+            0.0,
+        ),
+        axis=1,
+    )
+
     sorted_group_value = tl.sort(group_value, descending=True)
     group_topk_value = tl.sum(tl.where(offs_group == group_topk_num - 1, sorted_group_value, 0.0))
     mask_group_scores = tl.where(
@@ -155,7 +178,7 @@ def grouped_topk_kernel(
     mask_scores = tl.load(
         scores_buffer_ptr + scores_stride_m * token_index + offs_n, mask=offs_n < total_expert_num, other=-10000000.0
     )
-    sorted_scores, sorted_indexes = argsort(mask_scores, offs_n, descending=True)
+    _, sorted_scores, sorted_indexes = argsort(mask_scores, old_scores, offs_n, descending=True)

     if RENORMALIZE:
         sum_scores = tl.sum(tl.where(offs_n < topk_num, sorted_scores, 0.0))
@@ -184,6 +207,7 @@ def triton_grouped_topk(
     num_expert_group: int = 0,
     topk_group: int = 0,
     scoring_func: str = "softmax",
+    group_score_used_topk_num=2,
 ):

     if correction_bias is not None:
@@ -225,6 +249,7 @@ def triton_grouped_topk(
         EXPERT_GROUP_NUM=triton.next_power_of_2(num_expert_group),
         EXPERT_GROUP_SIZE=triton.next_power_of_2(total_expert_num // num_expert_group),
         RENORMALIZE=renormalize,
+        GROUP_SCORE_USED_TOPK_NUM=group_score_used_topk_num,
         num_warps=1,
         num_stages=1,
     )
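The group_value change replaces the per-group maximum with the sum of the top GROUP_SCORE_USED_TOPK_NUM expert scores in each group (2 for DeepSeek-V3), matching its node-limited routing. A small PyTorch illustration with assumed shapes (8 groups of 4 experts for one token; values are made up):

    import torch

    group_scores = torch.rand(8, 4)                                 # [n_group, experts_per_group]

    old_group_value = group_scores.max(dim=-1).values               # previous metric: best single expert per group
    new_group_value = group_scores.topk(2, dim=-1).values.sum(-1)   # new metric: sum of the top-2 experts per group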

lightllm/common/fused_moe/topk_select.py

Lines changed: 46 additions & 3 deletions
@@ -70,9 +70,9 @@ def grouped_topk(
         scores = torch.sigmoid(gating_output)
     else:
         scores = torch.softmax(gating_output, dim=-1)
-
+    old_scores = scores
     if correction_bias is not None:
-        scores.add_(correction_bias)
+        scores = scores + correction_bias

     num_token = scores.shape[0]
     group_scores = scores.view(num_token, num_expert_group, -1).max(dim=-1).values  # [n, n_group]
@@ -85,7 +85,43 @@ def grouped_topk(
         .reshape(num_token, -1)
     )  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
-    topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+    _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+    topk_weights = old_scores.gather(1, topk_ids)
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+
+    return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+
+
+# biased_grouped_topk adapted from sgl-project/sglang/python/sglang/srt/layers/moe/topk.py
+def biased_grouped_topk(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int = 0,
+    topk_group: int = 0,
+    scoring_func: str = "sigmoid",
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+    scores = gating_output.sigmoid()
+    num_token = scores.shape[0]
+    scores_for_choice = scores.view(num_token, -1) + correction_bias.unsqueeze(0)
+    group_scores = (
+        scores_for_choice.view(num_token, num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
+    )  # [n, n_group]
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1]  # [n, top_k_group]
+    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+    score_mask = (
+        group_mask.unsqueeze(-1)
+        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
+        .reshape(num_token, -1)
+    )  # [n, e]
+    tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
+    _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+    topk_weights = scores.gather(1, topk_ids)

     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
@@ -161,6 +197,11 @@ def select_experts(
                scoring_func=scoring_func,
            )
        else:
+            group_score_topk_num = 1
+            # for deepseek v3
+            if topk_group == 4 and num_expert_group == 8 and top_k == 8:
+                group_score_topk_num = 2
+
            topk_weights, topk_ids = triton_grouped_topk(
                hidden_states=hidden_states,
                gating_output=router_logits,
@@ -170,7 +211,9 @@ def select_experts(
                num_expert_group=num_expert_group,
                topk_group=topk_group,
                scoring_func=scoring_func,
+                group_score_used_topk_num=group_score_topk_num,
            )
+
    elif custom_routing_function is None:
        topk_weights, topk_ids = fused_topk(
            hidden_states=hidden_states, gating_output=router_logits, topk=top_k, renormalize=renormalize
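The accuracy-relevant detail in both grouped_topk and biased_grouped_topk is that the correction bias only influences which experts are selected; the combination weights are gathered from the unbiased sigmoid scores. A toy example with made-up numbers:

    import torch

    scores = torch.tensor([[0.70, 0.20, 0.10]])   # sigmoid router outputs (illustrative)
    bias = torch.tensor([0.00, 0.60, 0.00])       # e_score_correction_bias (illustrative)

    biased = scores + bias                         # used only for expert selection
    topk_ids = biased.topk(2, dim=-1).indices      # picks experts 1 and 0
    topk_weights = scores.gather(1, topk_ids)      # weights come from the unbiased scores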

lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py

Lines changed: 5 additions & 8 deletions
@@ -32,6 +32,7 @@ def __init__(self, layer_num, tp_rank, world_size, network_config, mode=[]):
         self.tp_v_head_num_ = 1
         self.qk_nope_head_dim = network_config["qk_nope_head_dim"]
         self.qk_rope_head_dim = network_config["qk_rope_head_dim"]
+        self.v_head_dim = network_config["v_head_dim"]
         self.q_lora_rank = network_config["q_lora_rank"]
         self.kv_lora_rank = network_config["kv_lora_rank"]

@@ -196,16 +197,12 @@ def _decompress_kv(

         # CC
         compressed_kv = compressed_kv.view(-1, layer_weight.kv_lora_rank).contiguous()
-        k_nope = self.alloc_tensor(
-            [compressed_kv.shape[0], self.tp_q_head_num_, self.qk_nope_head_dim],
+        kv_nope = self.alloc_tensor(
+            [compressed_kv.shape[0], self.tp_q_head_num_, (self.qk_nope_head_dim + self.v_head_dim)],
             dtype=compressed_kv.dtype,
         )
-        v = self.alloc_tensor(
-            k_nope.shape,
-            dtype=compressed_kv.dtype,
-        )
-        layer_weight.cc_k_b_proj_.mm(compressed_kv, out=k_nope.reshape(compressed_kv.shape[0], -1))
-        layer_weight.cc_v_b_proj_.mm(compressed_kv, out=v.reshape(compressed_kv.shape[0], -1))
+        layer_weight.cc_kv_b_proj_.mm(compressed_kv, out=kv_nope.reshape(compressed_kv.shape[0], -1))
+        k_nope, v = torch.split(kv_nope, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
         return k_nope, k_rope, v

     def _context_attention_kernel_with_CC(
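The _decompress_kv change folds the separate k_b and v_b projections into a single cc_kv_b_proj_ GEMM and splits the result along the last dimension, saving one allocation and one matmul launch per call. A shape-level sketch with assumed dimensions (plain tensors standing in for the repo's weight objects):

    import torch

    tokens, heads = 4, 16
    kv_lora_rank, qk_nope_head_dim, v_head_dim = 512, 128, 128

    compressed_kv = torch.randn(tokens, kv_lora_rank)
    fused_weight = torch.randn(kv_lora_rank, heads * (qk_nope_head_dim + v_head_dim))

    # One fused projection, then split into the k_nope and v halves per head.
    kv_nope = (compressed_kv @ fused_weight).view(tokens, heads, qk_nope_head_dim + v_head_dim)
    k_nope, v = torch.split(kv_nope, [qk_nope_head_dim, v_head_dim], dim=-1)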
