# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2023 DeepSeek. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple

import paddle
import triton
import triton.language as tl

# from triton import Config


@triton.jit
def act_quant_kernel(x_ptr, y_ptr, s_ptr, BLOCK_SIZE: tl.constexpr):
    """
    Quantizes the input tensor `x_ptr` and stores the result in `y_ptr` and the scaling factors in `s_ptr`.

    Args:
        x_ptr (triton.Pointer): Pointer to the input tensor.
        y_ptr (triton.Pointer): Pointer to the output tensor where quantized values will be stored.
        s_ptr (triton.Pointer): Pointer to the output tensor where scaling factors will be stored.
        BLOCK_SIZE (tl.constexpr): The size of the block to be processed by each program instance.

    Returns:
        None
    """
    pid = tl.program_id(axis=0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x = tl.load(x_ptr + offs).to(tl.float32)
    s = tl.max(tl.abs(x)) / 448.0
    y = x / s
    y = y.to(y_ptr.dtype.element_ty)
    tl.store(y_ptr + offs, y)
    tl.store(s_ptr + pid, s)

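# Note on act_quant_kernel (editor-added comment): 448.0 is the largest finite value
# representable in float8_e4m3fn, so each block is scaled such that its
# largest-magnitude element maps to +/-448 before the FP8 cast. For example, a block
# whose max |x| is 8.96 gets s = 8.96 / 448 = 0.02, y = x / 0.02 spans [-448, 448],
# and dequantization is simply y * s.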

def act_quant(x: paddle.Tensor, block_size: int = 128) -> Tuple[paddle.Tensor, paddle.Tensor]:
    """
    Quantizes the input tensor `x` using block-wise quantization.

    Args:
        x (paddle.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
        block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.

    Returns:
        Tuple[paddle.Tensor, paddle.Tensor]: A tuple containing:
            - The quantized tensor with dtype `paddle.float8_e4m3fn`.
            - A tensor of scaling factors with dtype `paddle.float32`.
    """
    assert x.is_contiguous(), "Input tensor must be contiguous"
    assert (
        x.shape[-1] % block_size == 0
    ), f"Last dimension size must be divisible by block_size (block_size={block_size})"
    y = paddle.empty_like(x, dtype=paddle.float8_e4m3fn)
    s = paddle.empty((*x.shape[:-1], x.shape[-1] // block_size), dtype=paddle.float32)
    grid = lambda meta: (triton.cdiv(x.numel().item(), meta["BLOCK_SIZE"]),)
    act_quant_kernel[grid](x, y, s, BLOCK_SIZE=block_size)
    return y, s


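# Example usage of act_quant (an editor-added sketch, not from the original file;
# it assumes a CUDA device with FP8 support, and the shapes are illustrative only):
#
#     x = paddle.randn([4, 1024], dtype="float32")
#     y, s = act_quant(x, block_size=128)
#     # y: float8_e4m3fn tensor of shape [4, 1024]
#     # s: float32 per-block scales of shape [4, 8]  (1024 // 128 == 8)
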
@triton.jit
def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
    """
    Dequantizes weights using the provided scaling factors and stores the result.

    Args:
        x_ptr (tl.pointer): Pointer to the quantized weights.
        s_ptr (tl.pointer): Pointer to the scaling factors.
        y_ptr (tl.pointer): Pointer to the output buffer for dequantized weights.
        M (int): Number of rows in the weight matrix.
        N (int): Number of columns in the weight matrix.
        BLOCK_SIZE (tl.constexpr): Size of the block for tiling.

    Returns:
        None
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    n = tl.cdiv(N, BLOCK_SIZE)
    offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs = offs_m[:, None] * N + offs_n[None, :]
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
    s = tl.load(s_ptr + pid_m * n + pid_n)
    y = x * s
    tl.store(y_ptr + offs, y, mask=mask)


def weight_dequant(x: paddle.Tensor, s: paddle.Tensor, block_size: int = 128) -> paddle.Tensor:
    """
    Dequantizes the given weight tensor using the provided scale tensor.

    Args:
        x (paddle.Tensor): The quantized weight tensor of shape (M, N).
        s (paddle.Tensor): The scale tensor of shape (ceil(M / block_size), ceil(N / block_size)),
            holding one scaling factor per (block_size x block_size) tile of `x`.
        block_size (int, optional): The block size to use for dequantization. Defaults to 128.

    Returns:
        paddle.Tensor: The dequantized weight tensor of the same shape as `x`.

    Raises:
        AssertionError: If `x` or `s` are not contiguous or if their dimensions are not 2.
    """
    assert x.is_contiguous() and s.is_contiguous(), "Input tensors must be contiguous"
    assert x.dim() == 2 and s.dim() == 2, "Input tensors must have 2 dimensions"
    M, N = x.shape
    y = paddle.empty_like(x, dtype=paddle.get_default_dtype())
    grid = lambda meta: (triton.cdiv(M, meta["BLOCK_SIZE"]), triton.cdiv(N, meta["BLOCK_SIZE"]))
    weight_dequant_kernel[grid](x, s, y, M, N, BLOCK_SIZE=block_size)
    return y


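# Example usage of weight_dequant (an editor-added sketch; shapes are illustrative):
#
#     w_fp8 = ...  # float8_e4m3fn weights of shape [1024, 2048]
#     w_s = ...    # float32 scales of shape [8, 16]  (one per 128 x 128 tile)
#     w = weight_dequant(w_fp8, w_s)  # [1024, 2048] in the default dtype
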
# fp8_gemm_configs = [
#     Config({"BLOCK_SIZE_M": block_m, "BLOCK_SIZE_N": block_n, "BLOCK_SIZE_K": 128}, num_stages=num_stages, num_warps=8)
#     for block_m in [16, 32, 64]
#     for block_n in [32, 64, 128]
#     for num_stages in [3, 4, 5, 6]
# ]
# FIXME @ZHUI: Paddle does not support Triton autotune yet.
# @triton.autotune(configs=fp8_gemm_configs, key=["N", "K"])
@triton.jit
def fp8_gemm_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    a_s_ptr,
    b_s_ptr,
    M,
    N: tl.constexpr,
    K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
):
    """
    Performs a matrix multiplication operation on FP8 matrices with scaling factors.

    Args:
        a_ptr (tl.tensor): Pointer to the first input matrix A.
        b_ptr (tl.tensor): Pointer to the second input matrix B.
        c_ptr (tl.tensor): Pointer to the output matrix C.
        a_s_ptr (tl.tensor): Pointer to the scaling factors for matrix A.
        b_s_ptr (tl.tensor): Pointer to the scaling factors for matrix B.
        M (int): Number of rows in matrix A and C.
        N (tl.constexpr): Number of columns in matrix B and C.
        K (tl.constexpr): Number of columns in matrix A and rows in matrix B.
        BLOCK_SIZE_M (tl.constexpr): Block size for the M dimension.
        BLOCK_SIZE_N (tl.constexpr): Block size for the N dimension.
        BLOCK_SIZE_K (tl.constexpr): Block size for the K dimension.

    Returns:
        None
    """
    pid_m = tl.program_id(axis=0)
    pid_n = tl.program_id(axis=1)
    k = tl.cdiv(K, BLOCK_SIZE_K)
    offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
    b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
    a_s_ptrs = a_s_ptr + offs_m * k
    b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for i in range(k):
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0)
        a_s = tl.load(a_s_ptrs)
        b_s = tl.load(b_s_ptrs)
        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K
        b_ptrs += BLOCK_SIZE_K
        a_s_ptrs += 1
        b_s_ptrs += 1
    c = accumulator.to(c_ptr.dtype.element_ty)
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptrs, c, mask=mask)

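# Note on the layouts fp8_gemm_kernel assumes (editor-added comment, derived from the
# pointer arithmetic above): `a` is row-major [M, K]; `b` is row-major [N, K], i.e. the
# weight already transposed; `a_s` is [M, ceil(K / BLOCK_SIZE_K)] with one scale per
# 1 x BLOCK_SIZE_K activation block; and `b_s` is
# [ceil(N / BLOCK_SIZE_K), ceil(K / BLOCK_SIZE_K)] with one scale per
# BLOCK_SIZE_K x BLOCK_SIZE_K weight tile.
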

def fp8_gemm(a: paddle.Tensor, a_s: paddle.Tensor, b: paddle.Tensor, b_s: paddle.Tensor):
    """
    FP8 GEMM with block-wise scaling factors.

    Modified so that the weight matrix `b` is passed with shape [K, N]
    (with `b_s` of shape [ceil(K / 128), ceil(N / 128)]); both are transposed
    below into the layout the kernel expects.
    """
    # FIXME @ZHUI: b arrives as [K, N]; transpose b and b_s to the [N, K] layout used by the kernel.
    b = b.T.contiguous()
    b_s = b_s.T.contiguous()
    assert a.is_contiguous() and b.is_contiguous(), "Input tensors must be contiguous"
    assert a_s.is_contiguous() and b_s.is_contiguous(), "Scaling factor tensors must be contiguous"

    K = a.shape[-1]
    M = a.numel().item() // K
    N = b.shape[0]  # after the transpose above, b has shape [N, K]

    c = paddle.empty((*a.shape[:-1], N), dtype=paddle.get_default_dtype())
    grid = lambda META: (triton.cdiv(M, META["BLOCK_SIZE_M"]), triton.cdiv(N, META["BLOCK_SIZE_N"]))
    fp8_gemm_kernel[grid](
        a,
        b,
        c,
        a_s,
        b_s,
        M,
        N,
        K,
        BLOCK_SIZE_M=32,
        BLOCK_SIZE_N=64,
        BLOCK_SIZE_K=128,
    )
    return c
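

# Editor-added smoke test: a minimal sketch, not part of the upstream DeepSeek/Paddle
# code. It assumes a CUDA device with FP8 support and a Paddle build in which
# `paddle.cast` to `float8_e4m3fn` is available; shapes are illustrative only.
if __name__ == "__main__":
    paddle.set_device("gpu")
    M, K, N = 256, 512, 1024

    # Quantize random activations block-wise along the last dimension.
    x = paddle.randn([M, K], dtype="float32")
    a_fp8, a_s = act_quant(x)  # a_fp8: [M, K] fp8, a_s: [M, K // 128] fp32

    # Random "pre-quantized" weight in [K, N] layout with per-128x128-tile scales.
    w_fp8 = paddle.cast(paddle.randn([K, N], dtype="float32"), paddle.float8_e4m3fn)
    w_s = paddle.rand([K // 128, N // 128], dtype="float32")

    # Reference path: dequantize the weight and multiply in full precision.
    w = weight_dequant(w_fp8, w_s)  # [K, N] in the default dtype
    ref = paddle.matmul(x, w)

    out = fp8_gemm(a_fp8, a_s, w_fp8, w_s)  # [M, N] in the default dtype
    print("fp8_gemm:", out.shape, out.dtype)
    print("max abs diff vs. fp32 reference:", float((out - ref).abs().max()))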