-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add fp8_gemm fallback for non-triton systems (#6916)
- Removed try/except from __init__ file in fp_quantizer and added a single entry point instead - Renamed file fp8_gemm to fp8_gemm_triton, and the function matmul_fp8 to matmul_fp8_triton - Added a new entry point fp8_gemm with matmul_fp8 inside, and if the system supports triton it calls the triton implementation and if not it calls the fallback Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
- Loading branch information
Showing
4 changed files
with
189 additions
and
168 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
# DeepSpeed Team | ||
|
||
######## Fused MoE kernel ######### | ||
# These kernels are implemented for | ||
# fusing GeMM with dequantization of | ||
# fp8 weight data when using bit-16 | ||
# activation. | ||
################################### | ||
|
||
import torch | ||
import triton | ||
import triton.language as tl | ||
|
||
|
||
@triton.jit | ||
def matmul_kernel_fp8_bf16(inp_ptr, weight_ptr, out_ptr, scale_ptr, M, N, K, stride_am, stride_ak, stride_bk, | ||
stride_bn, stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, | ||
BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, | ||
quantization_group_size: tl.constexpr): | ||
pid = tl.program_id(axis=0) | ||
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) | ||
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) | ||
num_pid_in_group = GROUP_SIZE_M * num_pid_n | ||
group_id = pid // num_pid_in_group | ||
first_pid_m = group_id * GROUP_SIZE_M | ||
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) | ||
pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) | ||
pid_n = (pid % num_pid_in_group) // group_size_m | ||
|
||
offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M | ||
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N | ||
offs_k = tl.arange(0, BLOCK_SIZE_K) | ||
|
||
inp_data = inp_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) | ||
weight_data = weight_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) | ||
weight_ptrs_offset = offs_k[:, None] * (stride_bk // quantization_group_size) + ( | ||
(pid_n * BLOCK_SIZE_N) // quantization_group_size) | ||
|
||
weight = tl.load(weight_data, mask=offs_k[:, None] < K, other=0.0) | ||
scale = tl.load(scale_ptr + weight_ptrs_offset) | ||
|
||
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) | ||
for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): | ||
inp = tl.load(inp_data, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) | ||
# Dequantize weight (fp8 -> bf16) | ||
w = (((weight & 0x80) << 8) | ((weight & 0x7f) << 4)).to(tl.uint16) | ||
w = (w + 0x3C00).to(tl.uint16) | ||
w = (w.to(tl.bfloat16, bitcast=True) * scale).to(tl.bfloat16) | ||
|
||
inp_data += BLOCK_SIZE_K * stride_ak | ||
weight_data += BLOCK_SIZE_K * stride_bk | ||
weight_mask = offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K | ||
weight = tl.load(weight_data, mask=weight_mask, other=0.0) | ||
scale = tl.load(scale_ptr + (weight_ptrs_offset + | ||
(((k + 1) * BLOCK_SIZE_K * stride_bk) // quantization_group_size)), | ||
mask=weight_mask, | ||
other=0.0) | ||
|
||
accumulator += tl.dot(inp, w) | ||
|
||
out = accumulator.to(tl.bfloat16) | ||
|
||
offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) | ||
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) | ||
out_data = out_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] | ||
tl.store(out_data, out, mask=(offs_cm[:, None] < M) & (offs_cn[None, :] < N)) | ||
|
||
|
||
@triton.jit | ||
def matmul_kernel_fp8_fp16(inp_ptr, weight_ptr, out_ptr, scale_ptr, M, N, K, stride_am, stride_ak, stride_bk, | ||
stride_bn, stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, | ||
BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, | ||
quantization_group_size: tl.constexpr): | ||
pid = tl.program_id(axis=0) | ||
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) | ||
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) | ||
num_pid_in_group = GROUP_SIZE_M * num_pid_n | ||
group_id = pid // num_pid_in_group | ||
first_pid_m = group_id * GROUP_SIZE_M | ||
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) | ||
pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) | ||
pid_n = (pid % num_pid_in_group) // group_size_m | ||
|
||
offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M | ||
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N | ||
offs_k = tl.arange(0, BLOCK_SIZE_K) | ||
|
||
inp_data = inp_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) | ||
weight_data = weight_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) | ||
weight_ptrs_offset = offs_k[:, None] * (stride_bk // quantization_group_size) + ( | ||
(pid_n * BLOCK_SIZE_N) // quantization_group_size) | ||
|
||
weight = tl.load(weight_data, mask=offs_k[:, None] < K, other=0.0) | ||
scale = tl.load(scale_ptr + weight_ptrs_offset) | ||
|
||
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) | ||
for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): | ||
inp = tl.load(inp_data, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) | ||
# Dequantize weight (fp8 -> fp16) | ||
w = (((weight & 0x80) << 8) | ((weight & 0x7f) << 7)).to(tl.uint16) | ||
w = (w + 0x2000).to(tl.uint16) | ||
w = (w.to(tl.float16, bitcast=True) * scale).to(tl.float16) | ||
|
||
inp_data += BLOCK_SIZE_K * stride_ak | ||
weight_data += BLOCK_SIZE_K * stride_bk | ||
|
||
weight = tl.load(weight_data, mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, other=0.0) | ||
scale = tl.load(scale_ptr + (weight_ptrs_offset + | ||
(((k + 1) * BLOCK_SIZE_K * stride_bk) // quantization_group_size))) | ||
|
||
accumulator += tl.dot(inp, w) | ||
|
||
out = accumulator.to(tl.float16) | ||
|
||
offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) | ||
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) | ||
out_data = out_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] | ||
tl.store(out_data, out, mask=(offs_cm[:, None] < M) & (offs_cn[None, :] < N)) | ||
|
||
|
||
def matmul_fp8_triton(inp, weight, scale, quantization_group_size): | ||
|
||
assert inp.shape[1] == weight.shape[0], \ | ||
f"Incompatible dimensions (input: {inp.shape}, weight: {weight.shape})" | ||
|
||
M, K = inp.shape | ||
K, N = weight.shape | ||
|
||
out = torch.empty((M, N), device=inp.device, dtype=inp.dtype) | ||
|
||
# GEMM tuning parameters! | ||
# TODO: Add a more configurable tuning for selecting the best GeMM | ||
BLOCK_SIZE_M = 16 if M <= 16 else 32 if M <= 32 else 64 if M <= 64 else 128 | ||
BLOCK_SIZE_N = 64 | ||
BLOCK_SIZE_K = max(64, quantization_group_size) | ||
GROUP_SIZE_M = 8 | ||
num_stages = 4 | ||
num_warps = 4 | ||
if M >= 256: | ||
BLOCK_SIZE_M = 256 | ||
BLOCK_SIZE_N = 128 | ||
BLOCK_SIZE_K = max(128, quantization_group_size) | ||
num_stages = 3 | ||
num_warps = 8 | ||
|
||
grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), ) | ||
kernel = matmul_kernel_fp8_bf16 if inp.dtype == torch.bfloat16 else matmul_kernel_fp8_fp16 | ||
kernel[grid](inp, | ||
weight, | ||
out, | ||
scale, | ||
M, | ||
N, | ||
K, | ||
inp.stride(0), | ||
inp.stride(1), | ||
weight.stride(0), | ||
weight.stride(1), | ||
out.stride(0), | ||
out.stride(1), | ||
quantization_group_size=quantization_group_size, | ||
BLOCK_SIZE_M=BLOCK_SIZE_M, | ||
BLOCK_SIZE_N=BLOCK_SIZE_N, | ||
BLOCK_SIZE_K=BLOCK_SIZE_K, | ||
GROUP_SIZE_M=GROUP_SIZE_M, | ||
num_stages=num_stages, | ||
num_warps=num_warps) | ||
return out |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters