
Commit 086c8f8

Author: xusenlin
Commit message: Add apply lora
1 parent: 8481667

File tree

6 files changed: +73, -14 lines

api/utils/protocol.py

Lines changed: 15 additions & 0 deletions

@@ -78,6 +78,14 @@ class ChatCompletionCreateParams(BaseModel):
     or exclusive selection of the relevant token.
     """
 
+    logprobs: Optional[bool] = False
+    """Whether to return log probabilities of the output tokens or not.
+
+    If true, returns the log probabilities of each output token returned in the
+    `content` of `message`. This option is currently not available on the
+    `gpt-4-vision-preview` model.
+    """
+
     max_tokens: Optional[int] = None
     """The maximum number of [tokens](/tokenizer) to generate in the chat completion.
 
@@ -146,6 +154,13 @@ class ChatCompletionCreateParams(BaseModel):
     functions the model may generate JSON inputs for.
     """
 
+    top_logprobs: Optional[int] = None
+    """
+    An integer between 0 and 5 specifying the number of most likely tokens to return
+    at each token position, each with an associated log probability. `logprobs` must
+    be set to `true` if this parameter is used.
+    """
+
     top_p: Optional[float] = 1.0
     """
     An alternative to sampling with temperature, called nucleus sampling, where the
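The two new fields mirror the OpenAI chat-completions parameters of the same names. A minimal client-side sketch, assuming this API server is running with an OpenAI-compatible endpoint; the base URL, model name, and API key below are placeholders:

```python
from openai import OpenAI

# Placeholder endpoint and model name; point these at your own deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")

resp = client.chat.completions.create(
    model="qwen-7b-chat",
    messages=[{"role": "user", "content": "Hello!"}],
    logprobs=True,    # ask for per-token log probabilities
    top_logprobs=3,   # top-3 alternatives per position; requires logprobs=True
)
print(resp.choices[0].logprobs)
```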

examples/qwen-7b-chat/get_weather.py

Lines changed: 1 addition & 1 deletion

@@ -134,4 +134,4 @@ def run_conversation(query: str, stream=False, functions=None, max_retry=5):
         logger.info("\n=========== next conversation ===========")
 
 query = "波士顿天气如何?"
-run_conversation(query, functions=functions, stream=True)
+run_conversation(query, functions=functions, stream=False)

libs/langchain_llm/langchain_llm/__init__.py

Lines changed: 3 additions & 1 deletion

@@ -2,13 +2,15 @@
     HuggingFaceLLM,
     ChatHuggingFace,
 )
-from ._vllm import XVLLM as VLLM
 from ._vllm import ChatVLLM
+from ._vllm import XVLLM as VLLM
+from .utils import apply_lora
 
 
 __all__ = [
     "HuggingFaceLLM",
     "ChatHuggingFace",
     "VLLM",
     "ChatVLLM",
+    "apply_lora"
 ]
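With the re-export in place, the new merge helper is importable from the package root alongside the LLM wrappers; for example:

```python
# apply_lora now sits next to the LLM wrappers at the package root.
from langchain_llm import HuggingFaceLLM, ChatHuggingFace, VLLM, ChatVLLM, apply_lora
```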

libs/langchain_llm/langchain_llm/_huggingface.py

Lines changed: 0 additions & 12 deletions

@@ -170,18 +170,6 @@ def _validate_environment(values: Dict) -> Dict:
         values["context_length"] = get_context_length(values["model"].config)
         logger.info(f"Context length is set to : {values['context_length']}")
 
-        # fix the tokenizer by adding the end-of-sequence (eos) token and the padding (pad) token if they are missing.
-        if values["tokenizer"].eos_token_id is None:
-            values["tokenizer"].eos_token = "<|endoftext|>"
-            logger.info(f"Add eos token: {values['tokenizer'].eos_token}")
-
-        if values["tokenizer"].pad_token_id is None:
-            if values["tokenizer"].unk_token_id is not None:
-                values["tokenizer"].pad_token = values["tokenizer"].unk_token
-            else:
-                values["tokenizer"].pad_token = values["tokenizer"].eos_token
-            logger.info(f"Add pad token: {values['tokenizer'].pad_token}")
-
         return values
 
     @property

libs/langchain_llm/langchain_llm/adapters/patcher.py

Lines changed: 11 additions & 0 deletions

@@ -112,6 +112,17 @@ def patch_tokenizer(tokenizer: "PreTrainedTokenizer") -> None:
     if "PreTrainedTokenizerBase" not in str(tokenizer._pad.__func__):
         tokenizer._pad = MethodType(PreTrainedTokenizerBase._pad, tokenizer)
 
+    if tokenizer.eos_token_id is None:
+        tokenizer.eos_token = "<|endoftext|>"
+        logger.info(f"Add eos token: {tokenizer.eos_token}")
+
+    if tokenizer.pad_token_id is None:
+        if tokenizer.unk_token_id is not None:
+            tokenizer.pad_token = tokenizer.unk_token
+        else:
+            tokenizer.pad_token = tokenizer.eos_token
+        logger.info(f"Add pad token: {tokenizer.pad_token}")
+
 
 def patch_config(
     config: "PretrainedConfig",
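The eos/pad fallback deleted from `_huggingface.py` above now lives in `patch_tokenizer`, so every code path that patches a tokenizer inherits it. A standalone sketch of the same fallback logic; the GPT-2 checkpoint is just an arbitrary public tokenizer that ships without a pad token:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
assert tok.pad_token_id is None  # gpt2 has an eos token but no pad token

# Same fallback order as the new patch_tokenizer logic: prefer unk, else eos.
if tok.pad_token_id is None:
    tok.pad_token = tok.unk_token if tok.unk_token_id is not None else tok.eos_token

print(tok.pad_token)  # "<|endoftext|>" (gpt2's unk/eos token)
```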
libs/langchain_llm/langchain_llm/utils.py (new file)

Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+from typing import Optional
+
+import torch
+from loguru import logger
+from peft import PeftModel
+from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedModel
+
+
+def apply_lora(
+    base_model_path: str,
+    lora_path: str,
+    target_model_path: str,
+    max_shard_size: Optional[str] = "2GB",
+    safe_serialization: Optional[bool] = True,
+) -> PreTrainedModel:
+
+    logger.info(f"Loading the base model from {base_model_path}")
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model_path,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+        trust_remote_code=True,
+    )
+    base_tokenizer = AutoTokenizer.from_pretrained(
+        base_model_path,
+        use_fast=False,
+        trust_remote_code=True,
+    )
+
+    logger.info(f"Loading the LoRA adapter from {lora_path}")
+
+    lora_model = PeftModel.from_pretrained(base, lora_path)
+
+    logger.info("Applying the LoRA")
+    model = lora_model.merge_and_unload()
+
+    logger.info(f"Saving the target model to {target_model_path}")
+    model.save_pretrained(
+        target_model_path,
+        max_shard_size=max_shard_size,
+        safe_serialization=safe_serialization,
+    )
+    base_tokenizer.save_pretrained(target_model_path)
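A usage sketch for the new helper; all paths below are placeholders. One caveat worth noting: the function is annotated as returning a `PreTrainedModel`, but the diff contains no `return` statement, so callers should reload the merged checkpoint from `target_model_path` rather than relying on the call's return value:

```python
from langchain_llm import apply_lora
from transformers import AutoModelForCausalLM

apply_lora(
    base_model_path="/path/to/base-model",      # placeholder
    lora_path="/path/to/lora-adapter",          # placeholder
    target_model_path="/path/to/merged-model",  # merged weights land here
    max_shard_size="2GB",
    safe_serialization=True,
)

# As written, apply_lora returns None despite its annotation, so reload
# the merged model from disk for inference.
merged = AutoModelForCausalLM.from_pretrained(
    "/path/to/merged-model", torch_dtype="auto", trust_remote_code=True
)
```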
