forked from vllm-project/vllm
[Misc] Support FP8 kv cache scales from compressed-tensors (vllm-proj…
Showing 7 changed files with 186 additions and 75 deletions.
@@ -0,0 +1,78 @@
import torch

from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase)
from vllm.utils import print_warning_once


class BaseKVCacheMethod(QuantizeMethodBase):
    """
    Quant method that adds `_k_scale` and `_v_scale` attributes to the
    Attention layer to support loading those scaling factors from checkpoints.
    The k/v_scale will be used to:
    - quantize k/v_cache entries before saving them to the cache
    - dequantize k/v_cache entries before fetching them from the cache
    :param quant_config: the appropriate QuantizationConfig
    """

    def __init__(self, quant_config: QuantizationConfig):
        self.quant_config = quant_config

    def create_weights(self, layer: torch.nn.Module):
        """
        Create "weight" (aka k_scale and v_scale) for an attention layer.
        """
        # Initialize the KV cache scales to -1.0, which is an invalid value.
        # If the k/v_scale appears in the checkpoint, it will be
        # overwritten when loading weights.
        layer.k_scale = torch.nn.Parameter(torch.tensor(-1.0),
                                           requires_grad=False)
        layer.v_scale = torch.nn.Parameter(torch.tensor(-1.0),
                                           requires_grad=False)

    def apply(self, layer: torch.nn.Module) -> torch.Tensor:
        raise RuntimeError(
            f"{self.__class__.__name__}.apply should not be called.")

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0
        # regardless of whether the kv-scale is available in the checkpoint.
        if layer.kv_cache_dtype != "auto":
            if layer.k_scale > 0.0 and layer.v_scale > 0.0:
                # We prefer to use separate k_scale and v_scale if present
                k_scale = layer.k_scale.to("cpu").tolist()
                v_scale = layer.v_scale.to("cpu").tolist()
            elif layer.k_scale < 0.0 and layer.v_scale < 0.0:
                # If no scales were loaded (both scales are still the invalid
                # negative sentinel), fall back to the default value of 1.0
                # as plain floats so the per-tensor check below passes.
                k_scale = 1.0
                v_scale = 1.0
            else:
                # If we find a single kv_scale in the checkpoint, we remap
                # kv_scale to k_scale during weight loading, and duplicate
                # k_scale to v_scale here
                assert layer.k_scale > 0.0
                scale_to_duplicate = max(layer.k_scale, layer.v_scale)
                k_scale = scale_to_duplicate.to("cpu").tolist()
                v_scale = scale_to_duplicate.to("cpu").tolist()

            if not isinstance(k_scale, float) or not isinstance(
                    v_scale, float):
                raise ValueError("Only support per-tensor scaling factor "
                                 "for fp8 KV cache")

            # These are used in the final Attention.forward()
            layer._k_scale = k_scale
            layer._v_scale = v_scale
            if (layer._k_scale == 1.0 and layer._v_scale == 1.0
                    and "e5m2" not in layer.kv_cache_dtype):
                print_warning_once(
                    "Using KV cache scaling factor 1.0 for fp8_e4m3. This "
                    "may cause accuracy issues. Please make sure k/v_scale "
                    "scaling factors are available in the fp8 checkpoint.")

        del layer.k_scale
        del layer.v_scale
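
For readers skimming the diff, the sketch below shows what these per-tensor scales do at runtime. It is a minimal illustration assuming PyTorch 2.1+ (for torch.float8_e4m3fn); the names quantize_kv/dequantize_kv are hypothetical and this is not vLLM's fused CUDA cache-write path.

# A minimal sketch (assumption: PyTorch >= 2.1 for torch.float8_e4m3fn; the
# helper names are illustrative, not vLLM's actual cache kernels) of how a
# per-tensor k_scale is applied around an fp8 KV cache entry.
import torch

FP8_DTYPE = torch.float8_e4m3fn
FP8_MAX = torch.finfo(FP8_DTYPE).max


def quantize_kv(x: torch.Tensor, scale: float) -> torch.Tensor:
    # Divide by the scale and saturate to the fp8 range before casting,
    # which is what writing an entry into an fp8 KV cache amounts to.
    return (x / scale).clamp(min=-FP8_MAX, max=FP8_MAX).to(FP8_DTYPE)


def dequantize_kv(x_fp8: torch.Tensor, scale: float) -> torch.Tensor:
    # Multiply by the same scale when the entry is read back for attention.
    return x_fp8.to(torch.float32) * scale


k = torch.randn(16, 128)                    # e.g. one block of key vectors
k_scale = (k.abs().max() / FP8_MAX).item()  # per-tensor scaling factor
k_restored = dequantize_kv(quantize_kv(k, k_scale), k_scale)
print((k - k_restored).abs().max())         # small fp8 rounding error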
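
The branching in process_weights_after_loading boils down to a small decision over the sentinel-initialized scales: prefer separate k_scale/v_scale if both were loaded, fall back to 1.0 when neither was, and duplicate a single kv_scale that weight loading remapped to k_scale. Below is a hypothetical standalone restatement of that decision using plain floats; resolve_kv_scales is not a function in the diff.

# Hypothetical restatement of the scale-resolution logic above, with plain
# floats in place of torch.nn.Parameter; not part of the vLLM diff.
def resolve_kv_scales(k_scale: float, v_scale: float) -> tuple:
    if k_scale > 0.0 and v_scale > 0.0:
        # Separate k_scale and v_scale were loaded from the checkpoint.
        return k_scale, v_scale
    if k_scale < 0.0 and v_scale < 0.0:
        # Neither scale was loaded (both still the -1.0 sentinel).
        return 1.0, 1.0
    # A single kv_scale was remapped to k_scale at load time; duplicate it.
    shared = max(k_scale, v_scale)
    return shared, shared


assert resolve_kv_scales(0.02, 0.03) == (0.02, 0.03)  # separate k/v scales
assert resolve_kv_scales(-1.0, -1.0) == (1.0, 1.0)    # no scales in checkpoint
assert resolve_kv_scales(0.02, -1.0) == (0.02, 0.02)  # single kv_scale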