Commit e540520

committed 0609

1 parent 6a4a100 commit e540520

12 files changed: +170 -82 lines changed

lightllm-kernel/lightllm_kernel/ops/fusion.py

Lines changed: 14 additions & 6 deletions
@@ -2,20 +2,28 @@
 from typing import Optional, Tuple
 from . import _C
 
+
 def pre_tp_norm_bf16(input: torch.Tensor) -> torch.Tensor:
-    """ Calculate powersum along embedding dimension of the input """
+    """Calculate powersum along embedding dimension of the input"""
     return _C.pre_tp_norm_bf16(input)
 
-def post_tp_norm_bf16(input: torch.tensor, weight: torch.Tensor, tp_variance: torch.Tensor, embed_dim: int, eps: float) -> torch.Tensor:
-    """ Apply rmsnorm on given input, with weight and pre calculated powersum """
+
+def post_tp_norm_bf16(
+    input: torch.tensor, weight: torch.Tensor, tp_variance: torch.Tensor, embed_dim: int, eps: float
+) -> torch.Tensor:
+    """Apply rmsnorm on given input, with weight and pre calculated powersum"""
     return _C.post_tp_norm_bf16(input, weight, tp_variance, embed_dim, eps)
 
-def add_norm_quant_bf16_fp8(input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float) -> Tuple[torch.Tensor, torch.Tensor]:
-    """ Apply add_norm_quant on given input, with residual and weight """
+
+def add_norm_quant_bf16_fp8(
+    input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Apply add_norm_quant on given input, with residual and weight"""
     return _C.add_norm_quant_bf16_fp8(input, residual, weight, eps)
 
+
 def gelu_per_token_quant_bf16_fp8(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-    """ Apply gelu on given input and quantize it from bf16 to fp8 using per token quant method """
+    """Apply gelu on given input and quantize it from bf16 to fp8 using per token quant method"""
     output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
     scales = torch.empty(size=(input.shape[0], 1), device=input.device, dtype=torch.float32)
     _C.gelu_per_token_quant_bf16_fp8(output, input, scales)
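Note: the pre/post split above suggests a tensor-parallel RMSNorm, with the per-token power sum reduced across ranks between the two kernels. A minimal composition sketch (the all_reduce step and shard layout are assumptions, not a documented contract of these ops):

    import torch
    import torch.distributed as dist
    from lightllm_kernel.ops import pre_tp_norm_bf16, post_tp_norm_bf16

    def tp_rmsnorm(x_shard: torch.Tensor, w_shard: torch.Tensor, embed_dim: int, eps: float = 1e-6) -> torch.Tensor:
        # Local sum of squares per token over this rank's shard of the embedding dim.
        power_sum = pre_tp_norm_bf16(x_shard)
        # Combine into the global power sum across tensor-parallel ranks (assumed step).
        dist.all_reduce(power_sum)
        # Normalize against the full embed_dim and apply this rank's weight shard.
        return post_tp_norm_bf16(x_shard, w_shard, power_sum, embed_dim, eps)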

lightllm-kernel/lightllm_kernel/ops/gemm.py

Lines changed: 11 additions & 3 deletions
@@ -2,7 +2,15 @@
 from typing import Optional
 from . import _C
 
-def cutlass_scaled_mm_bias_ls(c: torch.Tensor, a: torch.Tensor, b: torch.Tensor,
-        a_scales: torch.Tensor, b_scales: torch.Tensor, bias: Optional[torch.Tensor], ls: Optional[torch.Tensor]) -> None :
-    """ Apply scaled mm on the given input, with optional bias and ls weight """
+
+def cutlass_scaled_mm_bias_ls(
+    c: torch.Tensor,
+    a: torch.Tensor,
+    b: torch.Tensor,
+    a_scales: torch.Tensor,
+    b_scales: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    ls: Optional[torch.Tensor],
+) -> None:
+    """Apply scaled mm on the given input, with optional bias and ls weight"""
     return _C.cutlass_scaled_mm(c, a, b, a_scales, b_scales, bias, ls)
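Note: a call sketch mirroring test/gemm/cutlass_scaled_mm_test.py. The output c is written in place; a is (M, K) fp8 with per-token scales, b is fp8 and column-major (a transposed (N, K) tensor), and, per the torch reference in that test, ls multiplies the biased result. Shapes and the ones-scales are illustrative:

    import torch
    from lightllm_kernel.ops import cutlass_scaled_mm_bias_ls

    M, N, K = 16, 4096, 1024
    a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16).to(torch.float8_e4m3fn)
    b = torch.randn(N, K, device="cuda", dtype=torch.bfloat16).to(torch.float8_e4m3fn).t()
    a_scales = torch.ones(M, 1, device="cuda", dtype=torch.float32)  # per-token
    b_scales = torch.ones(N, 1, device="cuda", dtype=torch.float32)  # per-channel
    bias = torch.randn(N, device="cuda", dtype=torch.bfloat16)
    ls = torch.randn(N, device="cuda", dtype=torch.bfloat16)
    c = torch.empty(M, N, device="cuda", dtype=torch.bfloat16)
    cutlass_scaled_mm_bias_ls(c, a, b, a_scales, b_scales, bias, ls)
    # c ~ ((a @ b) rescaled by a_scales/b_scales + bias) * ls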

lightllm-kernel/lightllm_kernel/ops/norm.py

Lines changed: 2 additions & 2 deletions
@@ -2,6 +2,6 @@
 from typing import Optional
 from . import _C
 
-def rmsnorm_bf16(X: torch.Tensor, W: torch.Tensor, eps: float=1e-12) -> torch.Tensor:
-    """ Apply rmsnorm on given X, with weight W and eps """
+
+def rmsnorm_bf16(X: torch.Tensor, W: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    return _C.rmsnorm_align16_bf16(X, W, eps)
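Note: judging by the torch reference used in this commit's tests, the expected semantics match torch's built-in RMSNorm (an assumed equivalence, up to kernel rounding):

    import torch
    from lightllm_kernel.ops import rmsnorm_bf16

    X = torch.randn(8, 1024, device="cuda", dtype=torch.bfloat16)
    W = torch.randn(1024, device="cuda", dtype=torch.bfloat16)
    y_kernel = rmsnorm_bf16(X, W)  # eps defaults to 1e-12
    y_ref = torch.nn.functional.rms_norm(X, (X.shape[-1],), W, eps=1e-12)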

lightllm-kernel/lightllm_kernel/ops/quant.py

Lines changed: 2 additions & 1 deletion
@@ -2,8 +2,9 @@
 from typing import Optional, Tuple
 from . import _C
 
+
 def per_token_quant_bf16_fp8(input: torch.tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-    """ Quantize the given input using per token quant method """
+    """Quantize the given input using per token quant method"""
     output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
     scales = torch.empty(size=(input.shape[0], 1), device=input.device, dtype=torch.float32)
     _C.per_token_quant_bf16_fp8(output, input, scales)
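Note: a plain-torch sketch of what per-token quantization computes (an assumed reference for the kernel's semantics, not its exact rounding): each row gets one scale so that its largest magnitude maps to the fp8 e4m3 maximum (448):

    import torch

    def per_token_quant_ref(x: torch.Tensor):
        # One scale per token (row), derived from the row's absolute maximum.
        amax = x.abs().amax(dim=-1, keepdim=True).to(torch.float32)
        scales = (amax / torch.finfo(torch.float8_e4m3fn).max).clamp(min=1e-12)
        q = (x.to(torch.float32) / scales).to(torch.float8_e4m3fn)
        return q, scales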

lightllm-kernel/test/fusion/add_norm_quant_test.py

Lines changed: 54 additions & 28 deletions
@@ -10,12 +10,13 @@ def torch_add_norm_quant_bf16_fp8(X, R, W, eps=1e-6):
     # 1. Add residual
     X = X.add_(R)
     # 2. rmsnorm
-    normalized = torch.nn.functional.rms_norm(X, (N, ), W, eps=eps)
+    normalized = torch.nn.functional.rms_norm(X, (N,), W, eps=eps)
     # 3. per token quant
     quantized, scales = ops.scaled_fp8_quant(normalized, scale=None, use_per_token_if_dynamic=True)
 
     return quantized, scales
 
+
 class TestFusedAddNormQuantBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
@@ -31,40 +32,65 @@ def test_accuracy(self):
         for batch in self.batchs:
             for seqLen in self.seqLens:
                 for embed_dim in self.embed_dims:
-                    with self.subTest(shape=[batch, seqLen, embed_dim]):
-                        X1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                        X2 = X1.clone()
-                        R1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                        R2 = R1.clone()
-                        W = torch.rand(size=[embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                        output_real, scales_real = torch_add_norm_quant_bf16_fp8(X1.reshape(-1, X1.shape[2]), R1.reshape(-1, R1.shape[2]), W, self.eps)
-                        output_pred, scales_pred = add_norm_quant_bf16_fp8(X2.reshape(-1, X1.shape[2]), R2.reshape(-1, R2.shape[2]), W, self.eps)
+                    with self.subTest(shape=[batch, seqLen, embed_dim]):
+                        X1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        X2 = X1.clone()
+                        R1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        R2 = R1.clone()
+                        W = torch.rand(size=[embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        output_real, scales_real = torch_add_norm_quant_bf16_fp8(
+                            X1.reshape(-1, X1.shape[2]), R1.reshape(-1, R1.shape[2]), W, self.eps
+                        )
+                        output_pred, scales_pred = add_norm_quant_bf16_fp8(
+                            X2.reshape(-1, X1.shape[2]), R2.reshape(-1, R2.shape[2]), W, self.eps
+                        )
 
-                        self.assertTrue(
-                            error(output_real, output_pred) < 0.01,
-                            f"Accuracy test failed for size {batch}, {seqLen}, {embed_dim}. output_real={output_real}, output_pred={output_pred}"
-                        )
-                        self.assertTrue(
-                            error(scales_real, scales_pred) < 0.01,
-                            f"Accuracy test failed for size {batch}, {seqLen}, {embed_dim}. scales_real={scales_real}, scales_pred={scales_pred}"
-                        )
+                        self.assertTrue(
+                            error(output_real, output_pred) < 0.01,
+                            f"Accuracy test failed for size {batch}, {seqLen}, {embed_dim}. "
+                            f"output_real={output_real}, output_pred={output_pred}",
+                        )
+                        self.assertTrue(
+                            error(scales_real, scales_pred) < 0.01,
+                            f"Accuracy test failed for size {batch}, {seqLen}, {embed_dim}. "
+                            f"scales_real={scales_real}, scales_pred={scales_pred}",
+                        )
 
     def test_performance(self):
         """Test the performance of FusedAddNormQuant using benchmark."""
         for batch in self.batchs:
             for seqLen in self.seqLens:
                 for embed_dim in self.embed_dims:
-                    with self.subTest(shape=[batch, seqLen, embed_dim]):
-                        X1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                        X2 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                        R1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
-                        R2 = R1.clone()
-                        W = torch.rand(size=[embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                    with self.subTest(shape=[batch, seqLen, embed_dim]):
+                        X1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        X2 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        R1 = torch.rand(size=[batch, seqLen, embed_dim], device=self.device, dtype=self.dtype) - 0.5
+                        R2 = R1.clone()
+                        W = torch.rand(size=[embed_dim], device=self.device, dtype=self.dtype) - 0.5
+
+                        shape = [[batch, seqLen, embed_dim]]
+                        tflops = 0.0
+                        benchmark(
+                            torch_add_norm_quant_bf16_fp8,
+                            shape,
+                            tflops,
+                            100,
+                            X1.reshape(-1, X1.shape[2]),
+                            R1.reshape(-1, R1.shape[2]),
+                            W,
+                            self.eps,
+                        )
+                        benchmark(
+                            add_norm_quant_bf16_fp8,
+                            shape,
+                            tflops,
+                            100,
+                            X2.reshape(-1, X1.shape[2]),
+                            R2.reshape(-1, R2.shape[2]),
+                            W,
+                            self.eps,
+                        )
 
-                        shape = [[batch, seqLen, embed_dim]]
-                        tflops = 0.0
-                        benchmark(torch_add_norm_quant_bf16_fp8, shape, tflops, 100, X1.reshape(-1, X1.shape[2]), R1.reshape(-1, R1.shape[2]), W, self.eps)
-                        benchmark(add_norm_quant_bf16_fp8, shape, tflops, 100, X2.reshape(-1, X1.shape[2]), R2.reshape(-1, R2.shape[2]), W, self.eps)
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
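Note: the accuracy gates above use error(real, pred) < 0.01 from test.utils. A typical definition for such a metric is the relative L2 distance (an assumption; the repo's helper may differ):

    import torch

    def error(a: torch.Tensor, b: torch.Tensor) -> float:
        # Cast fp8/bf16 inputs to fp32 before taking norms.
        a, b = a.to(torch.float32), b.to(torch.float32)
        return (torch.norm(a - b) / torch.norm(a)).item()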

lightllm-kernel/test/fusion/gelu_per_token_quant_test.py

Lines changed: 12 additions & 6 deletions
@@ -4,10 +4,12 @@
 from lightllm_kernel.ops import per_token_quant_bf16_fp8, gelu_per_token_quant_bf16_fp8
 from test.utils import benchmark, error
 
+
 def gelu_quant(x):
     y = gelu_fwd(x)
     return per_token_quant_bf16_fp8(y)
 
+
 class TestGeluQuantBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
@@ -21,20 +23,23 @@ def test_accuracy(self):
         for token in self.tokens:
             for hiddenDim in self.hiddenDims:
                 with self.subTest(shape=[token, hiddenDim]):
-                    input = torch.normal(mean=0.0, std=10, size=[token, hiddenDim], device=self.device, dtype=self.dtype)
+                    input = torch.normal(
+                        mean=0.0, std=10, size=[token, hiddenDim], device=self.device, dtype=self.dtype
+                    )
 
                     y_real, scales_real = gelu_quant(input)
                     y_pred, scales_pred = gelu_per_token_quant_bf16_fp8(input)
-
+
                     self.assertTrue(
                         error(scales_real, scales_pred) < 0.01,
-                        f"Accuracy test failed for size {token}, {hiddenDim}. scales_real={scales_real}, scales_pred={scales_pred}"
+                        f"Accuracy test failed for size {token}, {hiddenDim}. "
+                        f"scales_real={scales_real}, scales_pred={scales_pred}",
                     )
                     self.assertTrue(
                         error(y_real, y_pred) < 0.01,
-                        f"Accuracy test failed for size {token}, {hiddenDim}. y_real={y_real}, y_pred={y_pred}"
+                        f"Accuracy test failed for size {token}, {hiddenDim}. " f"y_real={y_real}, y_pred={y_pred}",
                     )
-
+
     def test_performance(self):
         """Test the performance of gelu_per_token_quant using benchmark."""
         for token in self.tokens:
@@ -46,5 +51,6 @@ def test_performance(self):
                     benchmark(gelu_per_token_quant_bf16_fp8, shape, tflops, 100, input)
                     benchmark(gelu_quant, shape, tflops, 100, input)
 
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()

lightllm-kernel/test/fusion/post_tp_norm_test.py

Lines changed: 4 additions & 2 deletions
@@ -12,6 +12,7 @@ def post_tp_norm(input, weight, tp_variance, embed_dim, eps):
     out = weight * input.to(torch.bfloat16)
     return out
 
+
 class TestPostTpNormBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
@@ -34,7 +35,7 @@ def test_accuracy(self):
                     y_pred = post_tp_norm_bf16(X, W, V, self.embed_dim, self.eps)
                     self.assertTrue(
                         error(y_pred, y_real) < 0.01,
-                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}"
+                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}",
                     )
 
     def test_performance(self):
@@ -50,5 +51,6 @@ def test_performance(self):
                     benchmark(post_tp_norm_bf16, shape, tflops, 100, X, W, V, self.embed_dim, self.eps)
                     benchmark(post_tp_norm, shape, tflops, 100, X, W, V, self.embed_dim, self.eps)
 
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()

lightllm-kernel/test/fusion/pre_tp_norm_test.py

Lines changed: 5 additions & 3 deletions
@@ -9,6 +9,7 @@ def pre_tp_norm(input):
     tp_variance = input.pow(2).sum(-1, keepdim=False)
     return tp_variance
 
+
 class TestPreTpNormBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
@@ -27,20 +28,21 @@ def test_accuracy(self):
                     y_pred = pre_tp_norm_bf16(X)
                     self.assertTrue(
                         error(y_pred, y_real) < 0.01,
-                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}"
+                        f"Accuracy test failed for size {batch}, {size}. y_real={y_real}, y_pred={y_pred}",
                     )
 
     def test_performance(self):
         for batch in self.batchs:
             for size in self.sizes:
                 with self.subTest(shape=[batch, size]):
                     X = torch.rand(size=[batch, size], device=self.device, dtype=self.dtype) - 0.5
-                    W = torch.rand(size=[size], device=self.device, dtype=self.dtype) - 0.5
+                    # W = torch.rand(size=[size], device=self.device, dtype=self.dtype) - 0.5
 
                     shape = [[batch, size], [size], [batch, size]]
                     tflops = 0.0
                     benchmark(pre_tp_norm_bf16, shape, tflops, 100, X)
                     benchmark(pre_tp_norm, shape, tflops, 100, X)
 
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()

lightllm-kernel/test/gemm/cutlass_scaled_mm_test.py

Lines changed: 40 additions & 10 deletions
@@ -10,6 +10,7 @@ def torch_cutlass_scale_gemm_with_ls(x_q, w_q_t, x_scale, w_scale, out_dtype=tor
     y_pred = y_pred_tmp * ls
     return y_pred
 
+
 class TestQuantBF16(unittest.TestCase):
     def setUp(self):
         """Set up common test parameters."""
@@ -18,7 +19,6 @@ def setUp(self):
         self.device = "cuda"
         self.dtype = torch.bfloat16
 
-
     def test_accuracy(self):
         """Test the accuracy of cutlass_scaled_mm_bias_ls"""
         for token in self.tokens:
@@ -29,10 +29,11 @@ def test_accuracy(self):
                     input = torch.randn(size=[M, K], device=self.device, dtype=self.dtype)
                     x_q, x_scale = ops.scaled_fp8_quant(input, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
 
-
                     # Generate the weight tensor w_q (N x K); transposed it becomes K x N (column-major)
                     weight = torch.randn(size=[N, K], device=self.device, dtype=self.dtype)
-                    w_q, w_scale = ops.scaled_fp8_quant(weight, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+                    w_q, w_scale = ops.scaled_fp8_quant(
+                        weight, scale=None, scale_ub=None, use_per_token_if_dynamic=True
+                    )
 
                     # Transpose; w_q_t is column-major
                     w_q_t = w_q.t()
@@ -43,11 +44,13 @@ def test_accuracy(self):
                     ls = torch.randn(size=[N], device=self.device, dtype=torch.bfloat16)
 
                     cutlass_scaled_mm_bias_ls(y_pred, x_q, w_q_t, x_scale, w_scale, bias=bias, ls=ls)
-                    y_real = torch_cutlass_scale_gemm_with_ls(x_q, w_q_t, x_scale, w_scale, out_dtype=torch.bfloat16, bias=bias, ls=ls)
+                    y_real = torch_cutlass_scale_gemm_with_ls(
+                        x_q, w_q_t, x_scale, w_scale, out_dtype=torch.bfloat16, bias=bias, ls=ls
+                    )
 
                     self.assertTrue(
                         error(y_pred, y_real) < 0.01,
-                        f"Accuracy test failed for size {token}, {hiddenDim}. y_pred={y_pred}, y_real={y_real}"
+                        f"Accuracy test failed for size {token}, {hiddenDim}. y_pred={y_pred}, y_real={y_real}",
                     )
 
     def test_performance(self):
@@ -62,7 +65,9 @@ def test_performance(self):
 
                     # Generate the weight tensor w_q (N x K); transposed it becomes K x N (column-major)
                     weight = torch.randn(size=[N, K], device=self.device, dtype=self.dtype) - 0.5
-                    w_q, w_scale = ops.scaled_fp8_quant(weight, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+                    w_q, w_scale = ops.scaled_fp8_quant(
+                        weight, scale=None, scale_ub=None, use_per_token_if_dynamic=True
+                    )
 
                     bias = torch.randn(size=[N], device=self.device, dtype=torch.bfloat16)
                     ls = torch.randn(size=[N], device=self.device, dtype=torch.bfloat16)
@@ -72,9 +77,34 @@
 
                     y_pred = torch.empty((M, N), dtype=input.dtype, device=input.device)
                     shape = [[token, hiddenDim]]
-                    tflops = 2 * token * (3 * hiddenDim) * hiddenDim / 1024**4
-                    benchmark(cutlass_scaled_mm_bias_ls, shape, tflops, 100, y_pred, x_q, w_q_t, x_scale, w_scale, bias=bias, ls=ls)
-                    benchmark(torch_cutlass_scale_gemm_with_ls, shape, tflops, 100, x_q, w_q_t, x_scale, w_scale, out_dtype=torch.bfloat16, bias=bias, ls=ls)  # 495 GB/s without bias, 482 GB/s with bias
+                    tflops = 2 * token * (3 * hiddenDim) * hiddenDim / 1024 ** 4
+                    benchmark(
+                        cutlass_scaled_mm_bias_ls,
+                        shape,
+                        tflops,
+                        100,
+                        y_pred,
+                        x_q,
+                        w_q_t,
+                        x_scale,
+                        w_scale,
+                        bias=bias,
+                        ls=ls,
+                    )
+                    benchmark(
+                        torch_cutlass_scale_gemm_with_ls,
+                        shape,
+                        tflops,
+                        100,
+                        x_q,
+                        w_q_t,
+                        x_scale,
+                        w_scale,
+                        out_dtype=torch.bfloat16,
+                        bias=bias,
+                        ls=ls,
+                    )  # 495 GB/s without bias, 482 GB/s with bias
+
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
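Note: the benchmark's work estimate, tflops = 2 * token * (3 * hiddenDim) * hiddenDim / 1024 ** 4, reads as the 2*M*N*K FLOP count of a GEMM with M = token, N = 3 * hiddenDim, and K = hiddenDim (the factor of 3 suggesting a fused QKV-style projection), expressed in binary tera-FLOPs.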
