Commit 92c274a

Authored and committed by niushengxiao
fix: replace single float with two floats for per tensor quant
1 parent 1600ad0 commit 92c274a
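
In short: the per-tensor (flashinfer) path used to calibrate one abs-max, and hence one FP8 scale, shared by the whole KV buffer of a layer; after this commit it keeps two floats per layer, one for the K half and one for the V half of the buffer. A minimal standalone sketch of the idea (illustrative names only; it assumes the scale is the abs-max divided by the float8_e4m3 maximum, which is what the qmin/qmax fields in the calibration config suggest):

import torch

def per_tensor_kv_scales(kv_buffer: torch.Tensor, head_num: int) -> torch.Tensor:
    # kv_buffer: [token_num, 2 * head_num, head_dim], K heads first, then V heads.
    # Old behaviour: one abs-max (one scale) over the whole buffer.
    # New behaviour: one abs-max per half, i.e. two floats [k_scale, v_scale].
    k_max = kv_buffer[:, :head_num, :].abs().amax().to(torch.float32)
    v_max = kv_buffer[:, head_num:, :].abs().amax().to(torch.float32)
    qmax = torch.finfo(torch.float8_e4m3fn).max
    return torch.stack([k_max, v_max]) / qmax  # shape [2]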

File tree

5 files changed (+594, -498 lines)


lightllm/common/basemodel/triton_kernel/destindex_copy_kv_fp8.py

Lines changed: 19 additions & 78 deletions
@@ -42,43 +42,6 @@ def _fwd_kernel_destindex_copy_kv_per_head_fp8(
     return


-@triton.jit
-def _fwd_kernel_destindex_copy_kv_per_tensor_fp8(
-    K,
-    Dest_loc,
-    Out,
-    scalar_scale,
-    stride_k_bs,
-    stride_k_h,
-    stride_k_d,
-    stride_o_bs,
-    stride_o_h,
-    stride_o_d,
-    head_num,
-    BLOCK_DMODEL: tl.constexpr,
-    BLOCK_HEAD: tl.constexpr,
-    FP8_MIN: tl.constexpr,
-    FP8_MAX: tl.constexpr,
-):
-    cur_index = tl.program_id(0)
-    offs_h = tl.arange(0, BLOCK_HEAD)
-    offs_d = tl.arange(0, BLOCK_DMODEL)
-
-    dest_index = tl.load(Dest_loc + cur_index).to(tl.int64)
-
-    k_ptrs = K + cur_index * stride_k_bs + stride_k_h * offs_h[:, None] + stride_k_d * offs_d[None, :]
-    o_ptrs = Out + dest_index * stride_o_bs + stride_o_h * offs_h[:, None] + stride_o_d * offs_d[None, :]
-
-    scale = tl.load(scalar_scale)
-
-    k = tl.load(k_ptrs, mask=offs_h[:, None] < head_num, other=0.0)
-    k_scale = k / scale
-    k_fp8 = tl.clamp(k_scale, min=FP8_MIN, max=FP8_MAX).to(tl.float8e4nv)
-
-    tl.store(o_ptrs, k_fp8, mask=offs_h[:, None] < head_num)
-    return
-
-
 @torch.no_grad()
 def destindex_copy_kv_fp8(K, DestLoc, scales, Out):
     if scales is None:
@@ -93,47 +56,25 @@ def destindex_copy_kv_fp8(K, DestLoc, scales, Out):
     grid = (seq_len,)
     num_warps = 1

-    if scales.dim() == 0:
-        _fwd_kernel_destindex_copy_kv_per_tensor_fp8[grid](
-            K,
-            DestLoc,
-            Out,
-            scales,
-            K.stride(0),
-            K.stride(1),
-            K.stride(2),
-            Out.stride(0),
-            Out.stride(1),
-            Out.stride(2),
-            head_num,
-            BLOCK_DMODEL=head_dim,
-            BLOCK_HEAD=BLOCK_HEAD,
-            FP8_MIN=torch.finfo(torch.float8_e4m3fn).min,
-            FP8_MAX=torch.finfo(torch.float8_e4m3fn).max,
-            num_warps=num_warps,
-            num_stages=1,
-        )
-    else:
-        _fwd_kernel_destindex_copy_kv_per_head_fp8[grid](
-            K,
-            DestLoc,
-            Out,
-            scales,
-            K.stride(0),
-            K.stride(1),
-            K.stride(2),
-            Out.stride(0),
-            Out.stride(1),
-            Out.stride(2),
-            head_num,
-            BLOCK_DMODEL=head_dim,
-            BLOCK_HEAD=BLOCK_HEAD,
-            FP8_MIN=torch.finfo(torch.float8_e4m3fn).min,
-            FP8_MAX=torch.finfo(torch.float8_e4m3fn).max,
-            num_warps=num_warps,
-            num_stages=1,
-        )
-    return
+    _fwd_kernel_destindex_copy_kv_per_head_fp8[grid](
+        K,
+        DestLoc,
+        Out,
+        scales,
+        K.stride(0),
+        K.stride(1),
+        K.stride(2),
+        Out.stride(0),
+        Out.stride(1),
+        Out.stride(2),
+        head_num,
+        BLOCK_DMODEL=head_dim,
+        BLOCK_HEAD=BLOCK_HEAD,
+        FP8_MIN=torch.finfo(torch.float8_e4m3fn).min,
+        FP8_MAX=torch.finfo(torch.float8_e4m3fn).max,
+        num_warps=num_warps,
+        num_stages=1,
+    )


 if __name__ == "__main__":
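
With the per-tensor kernel gone, destindex_copy_kv_fp8 always launches the per-head kernel; per-tensor scales reach it already expanded to one value per head (see the repeat_interleave added in mem_manager.py below). The element-wise math is unchanged: divide by the scale, clamp to the FP8 range, cast. A torch-level sketch of that math (a reference illustration, not the Triton kernel itself):

import torch

def fp8_quantize_reference(k: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Mirrors the kernel body: k / scale, clamp to the representable
    # float8_e4m3fn range, then cast.
    finfo = torch.finfo(torch.float8_e4m3fn)
    k_scaled = (k / scale).clamp(min=finfo.min, max=finfo.max)
    return k_scaled.to(torch.float8_e4m3fn)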

lightllm/common/mem_manager.py

Lines changed: 12 additions & 4 deletions
@@ -27,8 +27,9 @@ def __init__(self, layer_num, head_num):
         self.qmax = torch.finfo(torch.float8_e4m3fn).max
         self.model_arch = get_model_architectures(get_env_start_args().model_dir)
         self.layer_num = layer_num
+        self.head_num = head_num
         self.total_head_num = head_num * dist.get_world_size() if dist.is_initialized() else head_num
-        self.scales_shape = [layer_num, 2 * head_num] if get_env_start_args().enable_fa3 else [layer_num]
+        self.scales_shape = [layer_num, 2 * head_num] if get_env_start_args().enable_fa3 else [layer_num, 2]
         self.scales = None
         self.scales_list = []
         self.abs_max = None
@@ -62,9 +63,11 @@ def __init__(self, layer_num, head_num):
                     f"not match current model head num {self.total_head_num}"
                 )
             if get_env_start_args().enable_fa3:
-                assert len(cfg["scales_shape"]) == 2, "this config is not for fa3 backend"
+                if cfg["quant_type"] != "per_head":
+                    raise ValueError(f"quant type {cfg['quant_type']} in config not match fa3 backend")
             else:
-                assert len(cfg["scales_shape"]) == 1, "this config is not for flashinfer backend"
+                if cfg["quant_type"] != "per_tensor":
+                    raise ValueError(f"quant type {cfg['quant_type']} in config not match flashinfer backend")

             self.qmin = cfg["qmin"]
             self.qmax = cfg["qmax"]
@@ -73,6 +76,8 @@ def __init__(self, layer_num, head_num):
             full_scales_list = cfg["scales"]
             self.scales_list = full_scales_list
             self.scales = torch.tensor(self.scales_list, dtype=torch.float32, device="cuda").view(self.scales_shape)
+            if not get_env_start_args().enable_fa3:
+                self.scales = torch.repeat_interleave(self.scales, self.head_num, dim=-1)
             if get_env_start_args().enable_fa3 and dist.is_initialized() and dist.get_world_size() > 1:
                 half_head = self.total_head_num // 2
                 start_head = dist.get_rank() * head_num
@@ -103,7 +108,9 @@ def update_calibration_data(self, kv_buffer: torch.Tensor, layer_index: int):
         if get_env_start_args().enable_fa3:
             kv_max = kv_buffer.abs().amax(dim=(0, 2)).to(torch.float32)
         else:
-            kv_max = kv_buffer.abs().amax(dim=()).to(torch.float32)
+            k_max = kv_buffer[:, : self.head_num, :].abs().amax(dim=()).to(torch.float32)
+            v_max = kv_buffer[:, self.head_num :, :].abs().amax(dim=()).to(torch.float32)
+            kv_max = torch.tensor([k_max, v_max], device="cuda", dtype=torch.float32)
         self.abs_max[layer_index] = torch.maximum(self.abs_max[layer_index], kv_max)
         if self.count == self.warmup_counts + self.inference_counts - 1 and layer_index == self.layer_num - 1:
             final_abs_max = self.abs_max
@@ -136,6 +143,7 @@ def _export_calibration_data(self):
         cfg = {
             "version": "1.0",
             "architectures": self.model_arch,
+            "quant_type": "per_head" if get_env_start_args().enable_fa3 else "per_tensor",
             "qmin": self.qmin,
             "qmax": self.qmax,
             "num_layers": self.layer_num,

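The per-tensor scales stay as two floats per layer in the exported config, but the non-fa3 branch above expands them to one value per head before they reach the copy kernel. A toy-shaped sketch of that expansion (illustrative numbers):

import torch

layer_num, head_num = 2, 4
# [layer_num, 2]: one K scale and one V scale per layer.
scales = torch.tensor([[0.04, 0.05],
                       [0.03, 0.06]], dtype=torch.float32)
# Repeat each scale head_num times along the last dim -> [layer_num, 2 * head_num],
# the per-head layout the copy kernel expects.
per_head_scales = torch.repeat_interleave(scales, head_num, dim=-1)
assert per_head_scales.shape == (layer_num, 2 * head_num)
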
lightllm/models/llama/layer_infer/transformer_layer_infer.py

Lines changed: 4 additions & 4 deletions
@@ -225,8 +225,8 @@ def _context_attention_flashinfer_kernel_fp8(
         k = kv[:, :, : self.tp_k_head_num_, :].view(torch.float8_e4m3fn)
         v = kv[:, :, self.tp_k_head_num_ :, :].view(torch.float8_e4m3fn)
         offline_scales = infer_state.mem_manager.offline_fp8_quant_manager.scales_list
-        k_descale = offline_scales[self.layer_num_] if offline_scales is not None else None
-        v_descale = offline_scales[self.layer_num_] if offline_scales is not None else None
+        k_descale = offline_scales[self.layer_num_][0] if offline_scales is not None else None
+        v_descale = offline_scales[self.layer_num_][1] if offline_scales is not None else None
         infer_state.prefill_wrapper.run(
             q.view(q.shape[0], -1, self.head_dim_),
             (k, v),
@@ -517,8 +517,8 @@ def _token_decode_attention_flashinfer_fp8(self, q, infer_state: LlamaFlashInfer
         k = kv[:, :, : self.tp_k_head_num_, :].view(torch.float8_e4m3fn)
         v = kv[:, :, self.tp_k_head_num_ :, :].view(torch.float8_e4m3fn)
         offline_scales = infer_state.mem_manager.offline_fp8_quant_manager.scales_list
-        k_descale = offline_scales[self.layer_num_] if offline_scales is not None else None
-        v_descale = offline_scales[self.layer_num_] if offline_scales is not None else None
+        k_descale = offline_scales[self.layer_num_][0] if offline_scales is not None else None
+        v_descale = offline_scales[self.layer_num_][1] if offline_scales is not None else None
         infer_state.decode_wrapper.run(
             q.view(calcu_shape1),
             (k, v),
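
Since scales_list now holds [k_scale, v_scale] per layer on the per-tensor path, the flashinfer wrappers receive separate K and V descales. A minimal sketch of the lookup (illustrative values, two layers):

# Per-tensor path: scales_list[layer] is a two-element list [k_scale, v_scale]
# instead of a single float, so K and V each get their own descale.
scales_list = [[0.04, 0.05], [0.03, 0.06]]
layer_num_ = 1
k_descale = scales_list[layer_num_][0] if scales_list is not None else None
v_descale = scales_list[layer_num_][1] if scales_list is not None else None
assert (k_descale, v_descale) == (0.03, 0.06)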
