Commit 553a867

Fix bug for p-tuning

Author: xusenlin
1 parent b5d1b02 commit 553a867

File tree

    api/adapter/model.py
    api/config.py
    api/models.py

3 files changed: +10 -11 lines changed

api/adapter/model.py

Lines changed: 5 additions & 10 deletions

@@ -1,4 +1,3 @@
-import json
 import os
 import sys
 from typing import List, Optional, Any, Dict, Tuple
@@ -135,12 +134,8 @@ def load_model(
         setattr(config, "bf16", dtype == "bfloat16")
         config_kwargs.pop("torch_dtype", None)

-        use_ptuning_v2 = kwargs.get("use_ptuning_v2", False)
-        if use_ptuning_v2 and adapter_model:
-            with open(f"{adapter_model}/config.json", "r") as prefix_encoder_file:
-                prefix_encoder_config = json.loads(prefix_encoder_file.read())
-            config.pre_seq_len = prefix_encoder_config["pre_seq_len"]
-            config.prefix_projection = prefix_encoder_config["prefix_projection"]
+        if kwargs.get("using_ptuning_v2", False) and adapter_model:
+            config.pre_seq_len = kwargs.get("pre_seq_len", 128)

         # Load and prepare pretrained models (without valuehead).
         model = self.model_class.from_pretrained(
@@ -205,7 +200,7 @@ def load_adapter_model(
         model_kwargs: Dict,
         **kwargs: Any,
     ) -> PreTrainedModel:
-        use_ptuning_v2 = kwargs.get("use_ptuning_v2", False)
+        using_ptuning_v2 = kwargs.get("using_ptuning_v2", False)
         resize_embeddings = kwargs.get("resize_embeddings", False)
         if adapter_model and resize_embeddings and not is_chatglm:
             model_vocab_size = model.get_input_embeddings().weight.size(0)
@@ -218,10 +213,10 @@ def load_adapter_model(
             logger.info("Resize model embeddings to fit tokenizer")
             model.resize_token_embeddings(tokenzier_vocab_size)

-        if use_ptuning_v2:
+        if using_ptuning_v2:
            prefix_state_dict = torch.load(os.path.join(adapter_model, "pytorch_model.bin"))
            new_prefix_state_dict = {
-                k[len("transformer.prefix_encoder.") :]: v
+                k[len("transformer.prefix_encoder."):]: v
                for k, v in prefix_state_dict.items()
                if k.startswith("transformer.prefix_encoder.")
            }
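
For orientation, here is a minimal standalone sketch of what the p-tuning v2 path does after this commit, written against the public transformers API rather than the repo's actual loader classes. The function name load_ptuning_v2_sketch and the final load_state_dict call onto model.transformer.prefix_encoder are assumptions (the usual ChatGLM recipe); the kwargs handling and the prefix-encoder filtering mirror the hunks above.

import os

import torch
from transformers import AutoConfig, AutoModel


def load_ptuning_v2_sketch(model_name_or_path: str, adapter_model: str, **kwargs):
    # Hypothetical helper, not the repo's code.
    config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)

    # After this commit, pre_seq_len comes from kwargs (default 128) instead of
    # being read from the adapter's config.json.
    if kwargs.get("using_ptuning_v2", False) and adapter_model:
        config.pre_seq_len = kwargs.get("pre_seq_len", 128)

    model = AutoModel.from_pretrained(
        model_name_or_path, config=config, trust_remote_code=True
    )

    if kwargs.get("using_ptuning_v2", False) and adapter_model:
        # Keep only the prefix-encoder tensors and strip their module prefix,
        # exactly as in the load_adapter_model hunk above.
        prefix_state_dict = torch.load(os.path.join(adapter_model, "pytorch_model.bin"))
        new_prefix_state_dict = {
            k[len("transformer.prefix_encoder."):]: v
            for k, v in prefix_state_dict.items()
            if k.startswith("transformer.prefix_encoder.")
        }
        # Loading into model.transformer.prefix_encoder is the usual ChatGLM
        # p-tuning v2 recipe; the exact attribute path depends on the model.
        model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)

    return model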

api/config.py

Lines changed: 4 additions & 0 deletions

@@ -116,6 +116,10 @@ class Settings(BaseModel):
         default=get_bool_env("USING_PTUNING_V2"),
         description="Whether to load the model using ptuning_v2."
     )
+    pre_seq_len: Optional[bool] = Field(
+        default=get_bool_env("PRE_SEQ_LEN"),
+        description="PRE_SEQ_LEN for ptuning_v2."
+    )

     # context related
     context_length: Optional[int] = Field(
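
The new pre_seq_len setting is driven by the PRE_SEQ_LEN environment variable (the diff declares it as Optional[bool] via get_bool_env), and its value ends up in config.pre_seq_len as shown in api/adapter/model.py above. Below is a self-contained sketch of the same env-driven pydantic pattern, using a hypothetical get_env helper and an int-typed field purely for illustration.

import os
from typing import Optional

from pydantic import BaseModel, Field


def get_env(name: str, default=None):
    # Hypothetical stand-in for the repo's environment helpers.
    return os.environ.get(name, default)


class Settings(BaseModel):
    pre_seq_len: Optional[int] = Field(
        default=get_env("PRE_SEQ_LEN"),
        description="PRE_SEQ_LEN for ptuning_v2.",
        # validate_default coerces the string read from the environment
        # (e.g. "128") into an int; None stays None when the variable is unset.
        validate_default=True,
    )


# e.g. export PRE_SEQ_LEN=128 before starting the API process.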

api/models.py

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ def create_generate_model():
     apply_ntk_scaling_patch(SETTINGS.alpha)

     include = {
-        "model_name", "quantize", "device", "device_map", "num_gpus",
+        "model_name", "quantize", "device", "device_map", "num_gpus", "pre_seq_len",
         "load_in_8bit", "load_in_4bit", "using_ptuning_v2", "dtype", "resize_embeddings"
     }
     kwargs = SETTINGS.model_dump(include=include)
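
model_dump(include=...) (pydantic v2) is what turns the settings into the keyword arguments forwarded to the loader, so without "pre_seq_len" in the include set the value added in api/config.py would never reach load_model. A small self-contained illustration follows; the field values and the trimmed include set are stand-ins, only the names mirror the diff.

from typing import Optional

from pydantic import BaseModel, ConfigDict


class Settings(BaseModel):
    # Silence pydantic v2's warning about fields starting with "model_".
    model_config = ConfigDict(protected_namespaces=())

    model_name: str = "chatglm2-6b"
    using_ptuning_v2: bool = False
    pre_seq_len: Optional[int] = 128
    alpha: Optional[float] = None  # not in `include`, so never forwarded


SETTINGS = Settings()
include = {"model_name", "using_ptuning_v2", "pre_seq_len"}
kwargs = SETTINGS.model_dump(include=include)
# kwargs == {"model_name": "chatglm2-6b", "using_ptuning_v2": False, "pre_seq_len": 128}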
