[Kernel] Initial Activation Quantization Support #4525
Build configuration: the new INT8 quantization kernel source is added to the `VLLM_EXT_SRC` list.

```diff
@@ -167,6 +167,7 @@ set(VLLM_EXT_SRC
   "csrc/layernorm_kernels.cu"
   "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
   "csrc/quantization/gptq/q_gemm.cu"
+  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
   "csrc/quantization/fp8/common.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/moe_align_block_size_kernels.cu"
```
New kernel, `csrc/quantization/compressed_tensors/int8_quant_kernels.cu`: a per-tensor ("static") scaled INT8 activation quantization kernel. One thread block handles one token and strides over the hidden dimension; each value is divided by the static scale and converted with a round-to-nearest, saturating float-to-int8 conversion (PTX `cvt.rni.sat.s8.f32` on CUDA, `std::nearbyint` plus `std::clamp` on ROCm).

```cpp
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <cmath>

#include "../../dispatch_utils.h"

static inline __device__ int8_t float_to_int8_rn(float x) {
#ifdef USE_ROCM
  static const float i8_min =
      static_cast<float>(std::numeric_limits<int8_t>::min());
  static const float i8_max =
      static_cast<float>(std::numeric_limits<int8_t>::max());
  // round
  float dst = std::nearbyint(x);
  // saturate
  dst = std::clamp(dst, i8_min, i8_max);
  return static_cast<int8_t>(dst);
#else
  // CUDA path
  uint32_t dst;
  asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x));
  return reinterpret_cast<const int8_t&>(dst);
#endif
}

namespace vllm {

template <typename scalar_t, typename scale_type>
__global__ void static_scaled_int8_quant_kernel(
    const scalar_t* __restrict__ input, int8_t* __restrict__ out,
    scale_type scale, const int hidden_size) {
  const int tid = threadIdx.x;
  const int token_idx = blockIdx.x;

  for (int i = tid; i < hidden_size; i += blockDim.x) {
    out[token_idx * hidden_size + i] =
        float_to_int8_rn(((float)input[token_idx * hidden_size + i]) / scale);
  }
}
}  // namespace vllm

void static_scaled_int8_quant(torch::Tensor& out,    // [..., hidden_size]
                              torch::Tensor& input,  // [..., hidden_size]
                              float scale) {
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;
  dim3 grid(num_tokens);
  dim3 block(std::min(hidden_size, 1024));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
        vllm::static_scaled_int8_quant_kernel<scalar_t, float>
            <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
                                         out.data_ptr<int8_t>(), scale,
                                         hidden_size);
      });
}
```
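For intuition, here is a small PyTorch sketch (not part of the diff) of the semantics the kernel implements: divide by the static scale, round to nearest, and saturate to the int8 range. It mirrors the reference computation used in the unit test below; the helper name is illustrative only.

```python
import torch


def static_scaled_int8_quant_ref(x: torch.Tensor, scale: float) -> torch.Tensor:
    """Reference semantics: q = saturate(round(x / scale)), cast to int8."""
    q = (x / scale).round()
    q = q.clamp(torch.iinfo(torch.int8).min, torch.iinfo(torch.int8).max)
    return q.to(torch.int8)


# Values beyond the representable range saturate to -128 / 127 instead of wrapping.
x = torch.tensor([0.4, 1.6, 300.0, -500.0])
print(static_scaled_int8_quant_ref(x, scale=1.0))
# tensor([   0,    2,  127, -128], dtype=torch.int8)
```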
New unit test for the `static_scaled_int8_quant` op, comparing the CUDA kernel against a pure-PyTorch reference across dtypes, shapes, and scales:

```python
import pytest
import torch

from vllm._C import ops

DTYPES = [torch.half, torch.bfloat16, torch.float]
HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192]  # Arbitrary values for testing
NUM_TOKENS = [1, 7, 83, 4096]  # Arbitrary values for testing
SEEDS = [0]
SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("scale", SCALE)
@torch.inference_mode()
def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype,
               seed: int, scale: float) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000

    out1 = (x / scale).round().clamp(
        torch.iinfo(torch.int8).min,
        torch.iinfo(torch.int8).max).to(torch.int8)
    out2 = torch.empty_like(x, dtype=torch.int8)
    ops.static_scaled_int8_quant(out2, x, scale)
    assert torch.allclose(out1, out2,
                          atol=1)  # big atol to account for rounding errors
```
New test, `tests/quantization/test_compressed_tensors.py`, checking model set-up and weight loading for a sparseml-quantized (compressed-tensors) checkpoint:

```python
"""Test model set-up and weight loading for sparseml-quantized models.

Run `pytest tests/quantization/test_compressed_tensors.py`.
"""

import torch

from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
    CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor)


def test_compressed_tensors_w8a8_static_setup(vllm_runner):
    model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed"
    llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True)
    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
    layer = model.model.layers[0]

    qkv_proj = layer.self_attn.qkv_proj
    o_proj = layer.self_attn.o_proj
    gate_up_proj = layer.mlp.gate_up_proj
    down_proj = layer.mlp.down_proj

    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
    assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
    assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
    assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)

    assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)

    assert qkv_proj.weight.dtype is torch.int8
    assert o_proj.weight.dtype is torch.int8
    assert gate_up_proj.weight.dtype is torch.int8

    assert qkv_proj.weight_scale.shard_splitter is not None
    assert qkv_proj.weight_scale.logical_widths is not None
    assert qkv_proj.input_scale.dtype is torch.float32
```
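As a usage sketch (not part of this PR's diff): the test exercises the checkpoint through the `vllm_runner` fixture; loading the same sparseml-quantized checkpoint through the public `LLM` entrypoint should look roughly like the following, assuming the `quantization="sparseml"` argument is accepted there the same way.

```python
from vllm import LLM, SamplingParams

# Assumption: the "sparseml" quantization flag is accepted by the public LLM
# entrypoint just as vllm_runner passes it in the test above.
llm = LLM(model="nm-testing/tinyllama-one-shot-static-quant-test-compressed",
          quantization="sparseml",
          enforce_eager=True)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```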