
Commit 664f2af

changwangss authored and chensuyue committed
[SW-207579] support load vLLM compatible FP8 model (#18)
Support loading vLLM-compatible FP8 models on both Gaudi2 (G2) and Gaudi3 (G3), in both single-card and multi-card setups. --------- Signed-off-by: changwang <changwang@habana.ai>
1 parent fd8aed0 commit 664f2af

File tree

3 files changed: +204 -11 lines changed

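For context, a minimal usage sketch of the loading path this commit enables, mirroring the test added below (the checkpoint name is taken from that test; a Gaudi host with the Habana PyTorch bridge, neural_compressor, and the model's tokenizer dependencies is assumed):

import transformers
from neural_compressor.torch.quantization import load

# Load a vLLM-compatible FP8 checkpoint published on the neuralmagic hub directly onto HPU.
model_name = "neuralmagic/Qwen2-0.5B-Instruct-FP8"
model = load(model_name, format="huggingface", device="hpu")

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
input_ids = tokenizer("There existed a little girl, who liked to have adventures.", return_tensors="pt").input_ids.to("hpu")
gen_ids = model.generate(input_ids, max_new_tokens=5, do_sample=False)
print(tokenizer.decode(gen_ids[0]))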

neural_compressor/torch/algorithms/fp8_quant/save_load.py

Lines changed: 177 additions & 10 deletions
@@ -20,8 +20,7 @@
 import torch
 
 from ._quant_common.quant_config import local_rank, world_size
-from neural_compressor.torch.utils import get_accelerator
-
+from neural_compressor.torch.utils import get_accelerator, is_optimum_habana_available
 
 MAX_FILE_SIZE = 5 # GB
 cur_accelerator = get_accelerator()
@@ -153,12 +152,36 @@ def load_empty_raw_model(model_name_or_path, **kwargs):
     """Initialize BF16 model with meta tensor."""
     import transformers
     from accelerate import init_empty_weights
+    config = transformers.AutoConfig.from_pretrained(model_name_or_path, **kwargs)
+    # fp8 model provided by neuralmagic.
+    if (
+        "quant_method" in config.quantization_config
+        and config.quantization_config["quant_method"] in ["fp8", "compressed-tensors"]
+    ):
+        from_neuralmagic = True
+        if (
+            "kv_cache_scheme" in config.quantization_config
+            and config.quantization_config["kv_cache_scheme"] is not None
+        ):
+            from_neuralmagic_with_kv = True
+        else:
+            from_neuralmagic_with_kv = False
+    else:
+        from_neuralmagic = False
+        from_neuralmagic_with_kv = False
+
+    if from_neuralmagic_with_kv:
+        config.flash_attention_fp8 = True
+        if is_optimum_habana_available:
+            from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+            adapt_transformers_to_gaudi()
+        else:
+            raise ValueError("Please install optimum-habana to load fp8 kv cache model.")
+
     from neural_compressor.torch.utils import get_non_persistent_buffers, load_non_persistent_buffers
 
     if world_size > 1:
         import deepspeed
-
-        config = transformers.AutoConfig.from_pretrained(model_name_or_path, **kwargs)
         with init_empty_weights(include_buffers=False):
             model = transformers.AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
         # TODO: [SW-199728] [DeepSpeed] Buffers initialized by model are not correct after tensor parallel
@@ -172,10 +195,9 @@ def load_empty_raw_model(model_name_or_path, **kwargs):
         model = model.module
         load_non_persistent_buffers(model, non_persistent_buffers)
     else:
-        config = transformers.AutoConfig.from_pretrained(model_name_or_path, **kwargs)
         with init_empty_weights(include_buffers=False):
             model = transformers.AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
-    return model
+    return model, from_neuralmagic, from_neuralmagic_with_kv
 
 
 def find_safetensors_files(model_name_or_path, **kwargs):
@@ -205,6 +227,9 @@ def find_safetensors_files(model_name_or_path, **kwargs):
         resolved_archive_file,
         **kwargs,
     )
+    # for models with only one model.safetensors file.
+    if isinstance(resolved_archive_file, str):
+        resolved_archive_file = [resolved_archive_file]
     return resolved_archive_file
 
 
@@ -219,6 +244,57 @@ def shard_state_dict(state_dict):
         rank_state_dict[k] = v.to("hpu")
     return rank_state_dict
 
+def split_rank_state_dict(model, gathered_state_dict):
+    """Split the state_dict for the current local_rank."""
+    rank_state_dict = {}
+    for name, param in model.named_parameters():
+        if name in gathered_state_dict:
+            full_weight = gathered_state_dict[name]
+            if len(param.shape) != 0 and full_weight.shape != param.shape:
+                if full_weight.shape[0] != param.shape[0]:
+                    split_weight = split_weights(full_weight, world_size, local_rank, split_axis=0)
+                elif full_weight.shape[1] != param.shape[1]:
+                    split_weight = split_weights(full_weight, world_size, local_rank, split_axis=1)
+                else:
+                    split_weight = split_weights(full_weight, world_size, local_rank, split_axis=0)
+            else:
+                split_weight = full_weight
+            rank_state_dict[name] = split_weight
+
+    return rank_state_dict
+
+
+def get_inc_fp8config(model, from_neuralmagic=False, from_neuralmagic_with_kv=False):
+    """Get INC FP8 Config.
+
+    Args:
+        model: empty model.
+        from_neuralmagic (bool, optional): whether the model is provided by the neuralmagic model hub.
+        from_neuralmagic_with_kv (bool, optional): whether the model is provided by the neuralmagic model hub with quantized kv_cache.
+
+    Returns:
+        INC FP8 Config.
+    """
+    from neural_compressor.torch.quantization import FP8Config
+    if from_neuralmagic:
+        if "ignore" in model.config.quantization_config.keys():
+            blocklist = {"types": [], "names": model.config.quantization_config["ignore"]}
+        elif "ignored_layers" in model.config.quantization_config.keys():
+            blocklist = {"types": [], "names": model.config.quantization_config["ignored_layers"]}
+        else:
+            blocklist = {"types": [], "names": ["lm_head"]}
+        if "target" in model.config.quantization_config.keys():
+            allowlist = {"types": model.config.quantization_config["target"], "names": []}
+        else:
+            if from_neuralmagic_with_kv:
+                allowlist = {"types": ["Linear", "LinearLayer", "LinearAllreduce", "KVCache"], "names": []}
+            else:
+                allowlist = {"types": ["Linear", "LinearLayer", "LinearAllreduce"], "names": []}
+        qconfig = FP8Config(mode="LOAD", allowlist=allowlist, blocklist=blocklist, scale_format="CONST")
+    else:
+        qconfig = FP8Config.from_dict(model.config.quantization_config)
+    return qconfig
+
 
 def load(model_name_or_path, format="huggingface", device="hpu", **kwargs):
     """Load FP8 model.
@@ -236,12 +312,12 @@ def load(model_name_or_path, format="huggingface", device="hpu", **kwargs):
     assert device == "hpu", "Currently, only hpu device is supported for FP8 model."
     from safetensors.torch import load_file as safe_load_file
 
-    model = load_empty_raw_model(model_name_or_path, **kwargs)
     from neural_compressor.torch.algorithms.fp8_quant import prep_model
-    from neural_compressor.torch.quantization import FP8Config
 
-    qconfig = FP8Config.from_dict(model.config.quantization_config)
+    model, from_neuralmagic, from_neuralmagic_with_kv = load_empty_raw_model(model_name_or_path, **kwargs)
+    qconfig = get_inc_fp8config(model, from_neuralmagic, from_neuralmagic_with_kv)
     qconfig.save_temp_json_file() # generate qconfig.json_file
+
     # replace modules to patched modules
     prep_model(model, qconfig.json_file)
     # get the safetensors file list from one folder
@@ -250,15 +326,106 @@ def load(model_name_or_path, format="huggingface", device="hpu", **kwargs):
     for file_name in files_list:
         cur_file = os.path.join(model_name_or_path, file_name)
         gathered_state_dict = safe_load_file(cur_file)
+        if from_neuralmagic or from_neuralmagic_with_kv:
+            import habana_frameworks.torch.utils.experimental as htexp
+            gathered_state_dict = convert_weight_to_inc(
+                state_dict=gathered_state_dict,
+                on_gaudi2=htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2
+            )
         if world_size > 0:
             # only return state_dict for the current local_rank
-            rank_state_dict = shard_state_dict(gathered_state_dict)
+            if from_neuralmagic or from_neuralmagic_with_kv:
+                rank_state_dict = split_rank_state_dict(model, gathered_state_dict)
+            else:
+                rank_state_dict = shard_state_dict(gathered_state_dict)
             model.load_state_dict(rank_state_dict, assign=True, strict=False)
         else:
             model.load_state_dict(gathered_state_dict, assign=True, strict=False)
+
+    if from_neuralmagic or from_neuralmagic_with_kv:
+        model.tie_weights()
     model = model.eval()
     model = model.to(cur_accelerator.name())
+
     cur_accelerator.synchronize()
     # make sure cpu and hpu memory are all released.
     gc.collect()
     return model
+
+
+def convert_weight_to_inc(state_dict, on_gaudi2=False):
+    """Convert vLLM-compatible FP8 model weights to INC format:
+    operator names differ between the two formats, and on G2 the weights must be
+    re-quantized because the hardware FP8 range is limited to [-240, 240].
+
+    Args:
+        state_dict (dict): state_dict from the model hub.
+        on_gaudi2 (bool, optional): whether running on Gaudi2. Defaults to False.
+
+    Returns:
+        state_dict with weights and scales adapted to INC format.
+    """
+    key_name = state_dict.keys()
+    for key in list(key_name):
+        if "weight_scale" in key:
+            scale_weight = key.replace("weight_scale", "scale_weight")
+            if on_gaudi2:
+                # dequant_weight
+                weight_key = key.replace("weight_scale", "weight")
+                qweight = state_dict[weight_key].t().to(torch.bfloat16).to("hpu")
+                scale = state_dict[key].to("hpu")
+                dequant_weight = qweight * scale
+                # recompute scale, qweight
+                recompute_scale = scale * (torch.finfo(torch.float8_e4m3fn).max /
+                                           torch.finfo(torch.float8_e4m3fnuz).max)
+                qweight = torch.ops.hpu.cast_to_fp8_v2(dequant_weight, 1.0 / recompute_scale, False, False, torch.float8_e4m3fn)[0]
+                state_dict[weight_key] = qweight
+                state_dict[scale_weight] = recompute_scale
+            else:
+                state_dict[scale_weight] = state_dict[key].to("hpu")
+            state_dict.pop(key)
+        elif "kv_scale" in key:
+            k_scale_inv = key.replace("kv_scale", "k_cache.quant_input.scale_inv")
+            v_scale_inv = key.replace("kv_scale", "v_cache.quant_input.scale_inv")
+            k_scale = key.replace("kv_scale", "k_cache.dequant_output.scale")
+            v_scale = key.replace("kv_scale", "v_cache.dequant_output.scale")
+            state_dict[k_scale_inv] = 1 / state_dict[key].to("hpu")
+            state_dict[v_scale_inv] = 1 / state_dict[key].to("hpu")
+            state_dict[k_scale] = state_dict[key].to("hpu")
+            state_dict[v_scale] = state_dict[key].to("hpu")
+            state_dict.pop(key)
+        elif "input_scale" in key:
+            scale_input_inv = key.replace("input_scale", "quant_input.scale_inv")
+            scale_input = key.replace("input_scale", "scale_input")
+            state_dict[scale_input_inv] = 1 / state_dict[key].to("hpu")
+            state_dict[scale_input] = state_dict[key].to("hpu")
+            state_dict.pop(key)
+        elif "proj.weight" in key and not on_gaudi2:
+            state_dict[key] = state_dict[key].detach().t().to("hpu")
+        else:
+            pass
+    return state_dict
+
+
+def split_weights(weight, tp_size, tp_rank, split_axis=0):
+    """Return the weight slice owned by the given tensor-parallel rank.
+    Args:
+        weight (torch.Tensor): weight tensor.
+        tp_size (int): tensor parallel size.
+        tp_rank (int): tensor parallel rank.
+        split_axis (int): axis to split along, 0 (row) or 1 (column).
+    Returns:
+        torch.Tensor: split weight tensor.
+    """
+    split_size = weight.shape[split_axis] // tp_size
+    start_idx = tp_rank * split_size
+    end_idx = (tp_rank + 1) * split_size
+
+    if len(weight.shape) == 1:
+        return weight[start_idx:end_idx]
+    elif split_axis == 0:
+        return weight[start_idx:end_idx, :]
+    elif split_axis == 1:
+        return weight[:, start_idx:end_idx]
+    else:
+        raise ValueError("split_axis must be 0 (row) or 1 (column).")

neural_compressor/torch/quantization/save_load_entry.py

Lines changed: 8 additions & 1 deletion
@@ -126,7 +126,14 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu", **kwargs):
         import transformers
         config = transformers.AutoConfig.from_pretrained(model_name_or_path, **kwargs)
         # use config to check which algorithm is used.
-        if "fp8_config" in config.quantization_config:
+        if (
+            "fp8_config" in config.quantization_config or
+            # for FP8 LLMs for vLLM (https://huggingface.co/neuralmagic).
+            (
+                "quant_method" in config.quantization_config and
+                config.quantization_config["quant_method"] in ["fp8", "compressed-tensors"]
+            )
+        ):
             from neural_compressor.torch.algorithms import fp8_quant
             return fp8_quant.load(model_name_or_path, format=format, device=device, **kwargs)
         else:
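The dispatch above keys off config.quantization_config. A hypothetical quantization_config for such a checkpoint (only the fields inspected by this commit are shown; the values are illustrative, not taken from a real model card) and the same membership test, runnable standalone:

# Hypothetical quantization_config carried by a vLLM-compatible FP8 checkpoint's config.json.
quantization_config = {
    "quant_method": "compressed-tensors",  # or "fp8"
    "ignore": ["lm_head"],                 # becomes the INC blocklist in get_inc_fp8config
    "kv_cache_scheme": None,               # a non-None scheme would enable the FP8 KV-cache path
}

routes_to_fp8_quant = "fp8_config" in quantization_config or (
    "quant_method" in quantization_config
    and quantization_config["quant_method"] in ["fp8", "compressed-tensors"]
)
print(routes_to_fp8_quant)  # True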

test/3x/torch/quantization/fp8_quant/test_multi_device.py

Lines changed: 19 additions & 0 deletions
@@ -7,6 +7,7 @@
 
 from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import local_rank, world_size
 from neural_compressor.torch.quantization import FP8Config, convert, load, prepare, save
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.helper_modules import PatchedLinear
 
 
 def get_hpu_used_mem():
@@ -24,6 +25,23 @@ def calib_func(model):
     model(example_inputs)
 
 
+def test_load_model_provided_by_neuralmagic():
+    model_name_or_path = "neuralmagic/Qwen2-0.5B-Instruct-FP8"
+    model = load(model_name_or_path, format="huggingface", device="hpu")
+    assert isinstance(model, torch.nn.Module)
+    assert isinstance(model.model.layers[0].self_attn.q_proj, PatchedLinear)
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
+    prompt = "There existed a little girl, who liked to have adventures."
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("hpu")
+    generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=1)
+    gen_ids = model.generate(
+        input_ids,
+        max_new_tokens=5,
+        **generate_kwargs,
+    )
+    assert isinstance(gen_ids, torch.Tensor)
+
+
 def test_multi_cards_save_load():
     name = "facebook/opt-350m"
     if world_size > 0:
@@ -58,3 +76,4 @@ def test_multi_cards_save_load():
 
 if __name__ == "__main__":
     test_multi_cards_save_load()
+    test_load_model_provided_by_neuralmagic()
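For multi-card runs with these checkpoints, save_load.py shards each full weight across ranks via split_rank_state_dict/split_weights instead of shard_state_dict. A minimal CPU sketch of that row/column slicing for a hypothetical 2-rank setup, mirroring the split_weights helper added above:

import torch

def split_weights(weight, tp_size, tp_rank, split_axis=0):
    # Mirror of the helper in save_load.py: keep this rank's contiguous slice along split_axis.
    split_size = weight.shape[split_axis] // tp_size
    start_idx, end_idx = tp_rank * split_size, (tp_rank + 1) * split_size
    if len(weight.shape) == 1:
        return weight[start_idx:end_idx]
    return weight[start_idx:end_idx, :] if split_axis == 0 else weight[:, start_idx:end_idx]

full_weight = torch.arange(12.0).reshape(4, 3)  # hypothetical full weight of shape [out=4, in=3]
print(split_weights(full_weight, tp_size=2, tp_rank=0, split_axis=0).shape)  # torch.Size([2, 3]), row-parallel slice
print(split_weights(full_weight, tp_size=2, tp_rank=1, split_axis=1).shape)  # torch.Size([4, 1]), column-parallel slice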
