Making vLLM compatible with Mistral fp8 weights. #10229

Status: Closed. akllm wants to merge 20 commits into vllm-project:main from akllm:vllmfp8mistral (changes shown from all commits).

Commits (20)
- c8865cf: Making vLLM compatible with Mistral fp8 weights. (akllm, Nov 11, 2024)
- 6937cc7: remove extra whitespace (akllm, Nov 11, 2024)
- 3518639: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 11, 2024)
- cd1ca2c: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 12, 2024)
- f61bc32: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 12, 2024)
- 5a027f1: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 13, 2024)
- 869923f: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 14, 2024)
- 6cb6bad: Commiting the value of the parameter (akllm, Nov 14, 2024)
- 40e37e2: Rest of the changes (akllm, Nov 14, 2024)
- cf88baf: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 14, 2024)
- 1723734: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 15, 2024)
- 133881a: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 15, 2024)
- 27595b7: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 17, 2024)
- 3ad46bc: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 18, 2024)
- c98fa33: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 18, 2024)
- dcbd25f: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 19, 2024)
- ed1cb8c: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 19, 2024)
- 226be71: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 19, 2024)
- cdbaf7e: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 20, 2024)
- 4e756a7: Merge branch 'vllm-project:main' into vllmfp8mistral (akllm, Nov 21, 2024)
vllm/model_executor/models/llama.py (14 additions, 4 deletions)
@@ -480,6 +480,9 @@
     mistral_mapping = {
         "layers": "model.layers",
         "attention": "self_attn",
+        "qscale_act": "input_scale",
+        "qscale_weight": "weight_scale",
+        "kv_fake_quantizer.qscale_act": "kv_scale",
         "wq": "q_proj",
         "wk": "k_proj",
         "wv": "v_proj",
@@ -614,15 +617,22 @@
         modules = name.split(".")

         # rotary embeds should be sliced
-        if "wk" in modules:
+        if "wk" in modules and modules[-1] == "weight":
             loaded_weight = permute(loaded_weight,
                                     self.config.num_key_value_heads)
-        elif "wq" in modules:
+        elif "wq" in modules and modules[-1] == "weight":
             loaded_weight = permute(loaded_weight,
                                     self.config.num_attention_heads)

-        for item in modules:
-            if item in mapping and mapping[item] not in name:
+        num_modules = len(modules)
+        for i in range(num_modules):
+            item = modules[i]
+            next_item = modules[i + 1] if i < num_modules - 1 else None
+
+            combined_item = f"{item}.{next_item}" if next_item is not None else None
+            if combined_item in mapping:
+                name = name.replace(combined_item, mapping[combined_item])
+            elif item in mapping and mapping[item] not in name:
                 name = name.replace(item, mapping[item])

         return name, loaded_weight

GitHub Actions check failure, ruff (3.12): vllm/model_executor/models/llama.py:632:81: E501 Line too long (84 > 80).
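Two things are going on above: the new `modules[-1] == "weight"` guard keeps the rotary permutation off the scalar qscale tensors (only actual `weight` tensors get permuted), and because one of the new mapping keys is a dotted two-token entry ("kv_fake_quantizer.qscale_act"), the rename loop now looks ahead one token. A self-contained sketch of that walk, runnable outside vLLM; the mapping is abbreviated and the sample names are the hypothetical ones from above:

```python
# Standalone sketch of the combined-token rename walk shown in the diff above.
# Abbreviated mapping; the full table lives in LlamaForCausalLM.
mapping = {
    "layers": "model.layers",
    "attention": "self_attn",
    "qscale_act": "input_scale",
    "qscale_weight": "weight_scale",
    "kv_fake_quantizer.qscale_act": "kv_scale",
    "wq": "q_proj",
}


def remap(name: str) -> str:
    modules = name.split(".")
    num_modules = len(modules)
    for i in range(num_modules):
        item = modules[i]
        next_item = modules[i + 1] if i < num_modules - 1 else None
        combined_item = f"{item}.{next_item}" if next_item is not None else None
        # A dotted two-token key such as "kv_fake_quantizer.qscale_act" wins
        # over its single-token pieces, while a bare "qscale_act" elsewhere
        # still maps to "input_scale".
        if combined_item in mapping:
            name = name.replace(combined_item, mapping[combined_item])
        elif item in mapping and mapping[item] not in name:
            name = name.replace(item, mapping[item])
    return name


assert (remap("layers.0.attention.kv_fake_quantizer.qscale_act")
        == "model.layers.0.self_attn.kv_scale")
assert (remap("layers.0.attention.wq.qscale_act")
        == "model.layers.0.self_attn.q_proj.input_scale")
```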
vllm/transformers_utils/config.py (8 additions, 4 deletions)
@@ -490,12 +490,12 @@ def load_params_config(model: Union[str, Path],
         "hidden_dim": "intermediate_size",
     }

-    def recurse_elems(elem: Any):
-        if isinstance(elem, dict):
+    def recurse_elems(elem: Any, wrap_to_hf_config: bool=True):
+        if isinstance(elem, dict) and wrap_to_hf_config:
             config_dict = {}
             for key, value in elem.items():
                 key = config_mapping.get(key, key)
-                config_dict[key] = recurse_elems(value)
+                config_dict[key] = recurse_elems(value, wrap_to_hf_config=False)
             return PretrainedConfig(**config_dict)
         else:
             return elem
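A quick illustration of what the new wrap_to_hf_config flag changes (a sketch mirroring the helper, not PR code): only the top-level dict is wrapped into a PretrainedConfig, and nested sections such as "quantization" now pass through as plain dicts instead of being wrapped and key-remapped. The params.json contents below are hypothetical:

```python
from typing import Any

from transformers import PretrainedConfig

config_mapping = {"hidden_dim": "intermediate_size"}  # abbreviated


def recurse_elems(elem: Any, wrap_to_hf_config: bool = True):
    # Same shape as the PR's helper: wrap into PretrainedConfig only when asked.
    if isinstance(elem, dict) and wrap_to_hf_config:
        config_dict = {}
        for key, value in elem.items():
            key = config_mapping.get(key, key)
            config_dict[key] = recurse_elems(value, wrap_to_hf_config=False)
        return PretrainedConfig(**config_dict)
    return elem


# Hypothetical params.json contents; "qformat_weight" is an assumed field name.
params = {"hidden_dim": 14336, "quantization": {"qformat_weight": "fp8_e4m3"}}
config = recurse_elems(params)
assert config.intermediate_size == 14336      # top-level keys are remapped
assert isinstance(config.quantization, dict)  # nested section stays a plain dict
```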
@@ -507,7 +507,11 @@ def recurse_elems(elem: Any):
     config_dict["max_seq_len"] = config_dict.get("max_seq_len", 128_000)
     config_dict["max_position_embeddings"] = config_dict.get(
         "max_position_embeddings", 128_000)
-
+    if config_dict.get("quantization") is not None:
+        config_dict["quantization_config"] = {
+            "quant_method": "fp8",
+            "activation_scheme": "static"
+        }
     if config_dict.get("moe") is not None:
         config_dict["architectures"] = ["MixtralForCausalLM"]
     else:
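So any "quantization" section in a Mistral params.json is surfaced to vLLM as an HF-style fp8 quantization_config with static activation scales. With that and the weight-name plumbing in place, loading a Mistral-format fp8 checkpoint should go through the usual Mistral entry points; a hedged usage sketch, where the model id is a placeholder rather than a tested checkpoint:

```python
from vllm import LLM

# Placeholder model id: any repo shipping Mistral-format (consolidated) fp8
# weights with a "quantization" section in params.json.
llm = LLM(
    model="org/mistral-model-fp8",
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
)
outputs = llm.generate("Hello, ")
```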