Merge pull request #23 from vtuber-plan/function
add mistral
jstzwj authored Oct 23, 2023
2 parents 5354db8 + a59e2a2 commit ee32f73
Showing 17 changed files with 203 additions and 50 deletions.
5 changes: 2 additions & 3 deletions langport/data/conversation/__init__.py
@@ -198,11 +198,10 @@ def get_prompt(self) -> str:
ret = system_prompt + self.settings.sep

for i, (role, message) in enumerate(self.messages):
ret += im_start
if message:
ret += role + "\n" + message + im_end + self.settings.sep
ret += im_start + role + "\n" + message + im_end + self.settings.sep
else:
ret += role + "\n"
ret += im_start + role + "\n"
return ret
else:
raise ValueError(f"Invalid style: {self.settings.sep_style}")
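The refactor above folds `im_start` into each branch, so the assembled string is unchanged; a turn with no message still opens with the role token and is left for the model to complete. Below is a minimal standalone sketch of the resulting ChatML-style assembly, with the token strings and system template assumed for illustration.

```python
# Minimal sketch of the ChatML-style assembly above; the <|im_start|>/<|im_end|>
# tokens and the system template are assumptions for illustration only.
im_start, im_end, sep = "<|im_start|>", "<|im_end|>", "\n"
system_prompt = im_start + "system\nSYSTEM_MESSAGE" + im_end

messages = [("user", "Hello"), ("assistant", None)]  # None = turn left open for generation

ret = system_prompt + sep
for role, message in messages:
    if message:
        ret += im_start + role + "\n" + message + im_end + sep
    else:
        ret += im_start + role + "\n"

print(ret)
# <|im_start|>system
# SYSTEM_MESSAGE<|im_end|>
# <|im_start|>user
# Hello<|im_end|>
# <|im_start|>assistant
```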
15 changes: 15 additions & 0 deletions langport/data/conversation/settings/mistral.py
@@ -0,0 +1,15 @@
from langport.data.conversation import (
ConversationSettings,
SeparatorStyle,
)


# Mistral default template
mistral = ConversationSettings(
name="mistral",
system_template="[INST]{system_message}\n",
roles=("[INST]", "[/INST]"),
sep_style=SeparatorStyle.LLAMA,
sep=" ",
sep2="</s>",
)
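Assuming the new settings module is picked up by the conversation registry under the name "mistral" (as the adapter added below expects), a hedged usage sketch follows; the exact string produced depends on how `SeparatorStyle.LLAMA` is rendered.

```python
# Sketch: building a prompt from the new Mistral settings. The printed format is
# illustrative only; the exact output depends on the SeparatorStyle.LLAMA renderer.
from langport.data.conversation import ConversationHistory
from langport.data.conversation.conversation_settings import get_conv_settings

settings = get_conv_settings("mistral")
history = ConversationHistory(
    system="",  # Mistral instruct models are commonly used without a system prompt
    messages=[
        (settings.roles[0], "Hello"),     # "[INST]"
        (settings.roles[1], "Hi there"),  # "[/INST]"
    ],
    offset=0,
    settings=settings,
)
print(history.get_prompt())  # e.g. "[INST] Hello [/INST] Hi there</s>"
```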
2 changes: 1 addition & 1 deletion langport/data/conversation/settings/mpt.py
@@ -9,6 +9,6 @@
name="mpt",
roles=("<|im_start|>user", "<|im_start|>assistant"),
sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
sep="<|im_end|>",
sep="<|im_end|>\n",
stop_token_ids=[50278, 0],
)
16 changes: 16 additions & 0 deletions langport/data/conversation/settings/starchat.py
@@ -0,0 +1,16 @@
from langport.data.conversation import (
ConversationSettings,
SeparatorStyle,
)


# StarChat default template
starchat = ConversationSettings(
name="starchat",
system_template="<system>\n{system_message}",
roles=("<|user|>", "<|assistant|>"),
sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
sep="<|end|>\n",
stop_token_ids=[0, 49155],
stop_str="<|end|>",
)
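A similar hedged sketch for the StarChat settings; generation is expected to stop on the `<|end|>` stop string or the listed stop token ids.

```python
# Sketch: using the new StarChat settings. The comment on the last line is a rough
# expectation only; SeparatorStyle.ADD_NEW_LINE_SINGLE determines the exact layout.
from langport.data.conversation import ConversationHistory
from langport.data.conversation.conversation_settings import get_conv_settings

settings = get_conv_settings("starchat")
history = ConversationHistory(
    system="SYSTEM_MESSAGE",
    messages=[(settings.roles[0], "aaa"), (settings.roles[1], "bbb")],
    offset=0,
    settings=settings,
)
prompt = history.get_prompt()  # each completed turn should end with "<|end|>\n"
```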
19 changes: 19 additions & 0 deletions langport/model/adapters/mistral.py
@@ -0,0 +1,19 @@
from langport.data.conversation import ConversationHistory, SeparatorStyle
from langport.data.conversation.conversation_settings import get_conv_settings
from langport.model.model_adapter import BaseAdapter


class MistralAdapter(BaseAdapter):
"""The model adapter for Mistral"""

def match(self, model_path: str):
return model_path.lower().startswith("mistral")

def get_default_conv_template(self, model_path: str) -> ConversationHistory:
settings = get_conv_settings("mistral")
return ConversationHistory(
system="",
messages=[],
offset=0,
settings=settings,
)
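Registration of the adapter alongside the existing ones is assumed. Note that `match()` checks the prefix of the full path string, so a model referenced by directory path will not match unless the path itself starts with "mistral".

```python
# Usage sketch for the adapter above (registration in the adapter list is assumed).
from langport.model.adapters.mistral import MistralAdapter

adapter = MistralAdapter()
print(adapter.match("mistral-7b-instruct-v0.1"))  # True
print(adapter.match("/models/Mistral-7B"))        # False: the full path must start with "mistral"

history = adapter.get_default_conv_template("mistral-7b-instruct-v0.1")
print(history.settings.name)  # "mistral"
```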
4 changes: 2 additions & 2 deletions langport/model/compression.py
@@ -351,11 +351,11 @@ def load_compress_model(model_path, device, compression_config: CompressionConfi
model,
max_memory=max_memory,
no_split_module_classes=no_split_module_classes,
dtype=torch_dtype,
dtype=torch.int8,
low_zero=(device_map == "balanced_low_0"),
)
device_map = infer_auto_device_map(
model, max_memory=max_memory, no_split_module_classes=no_split_module_classes, dtype=torch_dtype
model, max_memory=max_memory, no_split_module_classes=no_split_module_classes, dtype=torch.int8
)
else:
device_map = defaultdict(lambda:device)
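With this change, the balanced device map for the 8-bit compression path is planned assuming roughly one byte per weight rather than the checkpoint dtype. A minimal sketch of the same accelerate calls, using a placeholder model and no-split class name:

```python
# Sketch of int8-aware device-map planning with accelerate; "gpt2" and the
# no-split class name are placeholders for illustration.
import torch
from accelerate import init_empty_weights
from accelerate.utils import get_balanced_memory, infer_auto_device_map
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("gpt2")
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)  # no weights are allocated

max_memory = get_balanced_memory(
    model,
    no_split_module_classes=["GPT2Block"],
    dtype=torch.int8,  # budget ~1 byte per parameter
    low_zero=False,
)
device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=["GPT2Block"],
    dtype=torch.int8,
)
print(device_map)
```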
3 changes: 2 additions & 1 deletion langport/model/executor/generation/huggingface.py
@@ -588,6 +588,7 @@ def __init__(
cpu_offloading: bool,
deepspeed: bool = False,
gptq: bool = False,
group_size: Optional[int] = None,
trust_remote_code: bool = False,
offload_folder: Optional[str] = None,
) -> None:
@@ -604,7 +605,7 @@ def __init__(
self.model = None
self.tokenizer = None
self.adapter, self.model, self.tokenizer = self.load_model(
model_path, device, num_gpus, max_gpu_memory, quantization, cpu_offloading, deepspeed, gptq, trust_remote_code, offload_folder
model_path, device, num_gpus, max_gpu_memory, quantization, cpu_offloading, deepspeed, gptq, group_size, trust_remote_code, offload_folder
)
self.model.eval()

98 changes: 75 additions & 23 deletions langport/model/executor/huggingface.py
@@ -1,4 +1,5 @@
from functools import partial
import os
from typing import Optional

from langport.model.adapters.dolly_v2 import DollyV2Adapter
@@ -14,6 +15,7 @@
import torch

from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoModel,
AutoTokenizer,
@@ -23,6 +25,9 @@
BertModel,
)

from transformers.utils.quantization_config import QuantizationMethod
from accelerate.utils import get_balanced_memory, infer_auto_device_map
from accelerate import init_empty_weights

import math
from typing import Optional
@@ -105,10 +110,32 @@ def _load_hf_model(self, adapter, model_path: str, from_pretrained_kwargs: dict)
model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
) # , offload_folder="offload"
else:
# Workaround for loading GPTQ-quantized checkpoints
config = AutoConfig.from_pretrained(model_path)
if hasattr(config, "quantization_config"):
quantization_method_from_config = config.quantization_config.get(
"quant_method", QuantizationMethod.BITS_AND_BYTES
)
if quantization_method_from_config == QuantizationMethod.GPTQ:
no_split_module_classes = ["LlamaDecoderLayer", "GPTJBlock", "GPT2Block", "GPTBigCodeBlock", "GPTNeoBlock"]
device_map = from_pretrained_kwargs.get("device_map", "auto")
if config.quantization_config.get("bits", None) == 4:
torch_dtype = torch.quint4x2
elif config.quantization_config.get("bits", None) == 8:
torch_dtype = torch.int8
else:
torch_dtype = torch.int8
with init_empty_weights():
model = AutoModelForCausalLM.from_config(config)
device_map = infer_auto_device_map(
model, max_memory=None, no_split_module_classes=no_split_module_classes, dtype=torch_dtype
)
from_pretrained_kwargs["device_map"] = device_map

trust_remote_code = from_pretrained_kwargs.get("trust_remote_code", False)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=trust_remote_code)
model = AutoModelForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
model_path, low_cpu_mem_usage=True,**from_pretrained_kwargs
)

return model, tokenizer
@@ -123,6 +150,7 @@ def load_model(
cpu_offloading: bool = False,
deepspeed: bool = False,
gptq: bool = False,
group_size: Optional[int] = None,
trust_remote_code: bool = False,
offload_folder: Optional[str] = None,
debug: bool = False,
@@ -159,7 +187,8 @@
raise ValueError(f"Invalid device: {device}")

kwargs["trust_remote_code"] = trust_remote_code
kwargs["offload_folder"] = offload_folder
if offload_folder is not None:
kwargs["offload_folder"] = offload_folder

if cpu_offloading:
# raises an error on incompatible platforms
@@ -175,30 +204,53 @@
kwargs["load_in_8bit"] = quantization!=None
# Load model
model, tokenizer = self._load_hf_model(adapter, model_path, kwargs)
elif quantization is not None:
elif quantization is not None or gptq:
if group_size is None:
group_size = 128
if gptq:
if quantization is None:
quantization = "8bit"
import datasets
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, **kwargs)
if "4" in quantization:
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting this to False significantly speeds up inference, at a small cost in perplexity
)
elif "8" in quantization:
quantize_config = BaseQuantizeConfig(
bits=8, # quantize model to 8-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting this to False significantly speeds up inference, at a small cost in perplexity
)
if "gptq" in model_path.lower():
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, **kwargs)
model = AutoGPTQForCausalLM.from_quantized(model_path, **kwargs)
else:
quantize_config = BaseQuantizeConfig(
bits=8, # quantize model to 8-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting this to False significantly speeds up inference, at a small cost in perplexity
)
# load un-quantized model, by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(model_path, quantize_config, **kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, **kwargs)
if tokenizer._pad_token is None:
tokenizer.pad_token = tokenizer.eos_token

if "4" in quantization:
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=group_size, # it is recommended to set the value to 128
desc_act=False, # setting this to False significantly speeds up inference, at a small cost in perplexity
)
elif "8" in quantization:
quantize_config = BaseQuantizeConfig(
bits=8, # quantize model to 8-bit
group_size=group_size, # it is recommended to set the value to 128
desc_act=False, # setting this to False significantly speeds up inference, at a small cost in perplexity
)
else:
quantize_config = BaseQuantizeConfig(
bits=8, # quantize model to 8-bit
group_size=group_size, # it is recommended to set the value to 128
desc_act=False, # setting this to False significantly speeds up inference, at a small cost in perplexity
)
# load un-quantized model, by default, the model will always be loaded into CPU memory
# temp_kwargs = {k: v for k,v in kwargs.items() if k not in ["max_memory", "device_map"]}
temp_kwargs = {k: v for k,v in kwargs.items()}
model = AutoGPTQForCausalLM.from_pretrained(model_path, quantize_config, low_cpu_mem_usage=True, **temp_kwargs)
quant_dataset = datasets.load_dataset("Vtuber-plan/quantdata-10k")
train_quant_dataset = quant_dataset["train"]
examples = []
for data in train_quant_dataset:
examples.append(tokenizer(data["text"], max_length=512, padding="longest", truncation=True, return_tensors='pt'))
model.quantize(examples, batch_size=1, cache_examples_on_gpu=False)
temp_model_path = os.path.join(offload_folder, os.path.basename(model_path))
model.save_quantized(temp_model_path, use_safetensors=True)
model = AutoGPTQForCausalLM.from_quantized(temp_model_path, low_cpu_mem_usage=True, **kwargs)
else:
if num_gpus != 1:
warnings.warn(
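The new GPTQ branch either loads an already-quantized checkpoint with `AutoGPTQForCausalLM.from_quantized`, or quantizes on the fly against a calibration dataset and re-loads the result from disk (the executor uses the `Vtuber-plan/quantdata-10k` dataset and saves under `offload_folder`). A hedged sketch of the on-the-fly flow, with placeholder model, dataset, and save path:

```python
# Sketch of the on-the-fly GPTQ flow above. The checkpoint, calibration dataset,
# and save path are placeholders; auto-gptq and a CUDA device are assumed.
import datasets
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

model_path = "facebook/opt-125m"   # placeholder checkpoint
save_dir = "/tmp/opt-125m-gptq"    # placeholder save path

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

quantize_config = BaseQuantizeConfig(bits=8, group_size=128, desc_act=False)
model = AutoGPTQForCausalLM.from_pretrained(model_path, quantize_config)

# A small calibration set is enough for a sketch; the executor uses its own dataset.
calib = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:64]")
examples = [
    tokenizer(row["text"], max_length=512, truncation=True, return_tensors="pt")
    for row in calib
    if row["text"].strip()
]

model.quantize(examples, batch_size=1, cache_examples_on_gpu=False)  # needs a GPU
model.save_quantized(save_dir, use_safetensors=True)

# Reload the freshly quantized weights, mirroring the executor's final step.
model = AutoGPTQForCausalLM.from_quantized(save_dir)
```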
5 changes: 5 additions & 0 deletions langport/model/model_args.py
@@ -44,6 +44,11 @@ def add_model_args(parser):
type=str, default=None,
help="If the device_map contains any value \"disk\", the folder where we will offload weights",
)
parser.add_argument(
"--group-size",
type=int, default=None,
help="The group size used for quantization",
)
parser.add_argument(
"--deepspeed", action="store_true", help="Use deepspeed"
)
4 changes: 2 additions & 2 deletions langport/protocol/openai_api_protocol.py
@@ -73,8 +73,8 @@ class ChatCompletionRequest(BaseModel):
stream: Optional[bool] = False
presence_penalty: Optional[float] = 0.0
frequency_penalty: Optional[float] = 0.0
functions: Optional[List[FunctionDefinition]]
function_call: Optional[Union[Literal["none", "auto"], FunctionEntry]]
functions: Optional[List[FunctionDefinition]] = None
function_call: Optional[Union[Literal["none", "auto"], FunctionEntry]] = None
user: Optional[str] = None
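Under pydantic v2, an `Optional[...]` annotation no longer implies a default, so without the explicit `= None` the `functions` and `function_call` fields would become required. A minimal illustration of the difference (the class and field names here are made up):

```python
# Illustration of the pydantic v2 behavior motivating the change above.
from typing import Optional
from pydantic import BaseModel, ValidationError

class WithoutDefault(BaseModel):
    functions: Optional[list]          # required in pydantic v2, despite Optional

class WithDefault(BaseModel):
    functions: Optional[list] = None   # truly optional

try:
    WithoutDefault()                   # raises: field required
except ValidationError as e:
    print(e.errors()[0]["type"])       # "missing"

print(WithDefault().functions)         # None
```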


24 changes: 18 additions & 6 deletions langport/routers/gateway/common.py
@@ -3,21 +3,33 @@
from typing import Optional, Union
import httpx
import numpy as np
from pydantic import BaseSettings

BASE_SETTINGS = False
if not BASE_SETTINGS:
try:
from pydantic import BaseSettings
BASE_SETTINGS = True
except ImportError:
BASE_SETTINGS = False

if not BASE_SETTINGS:
try:
from pydantic_settings import BaseSettings
BASE_SETTINGS = True
except ImportError:
BASE_SETTINGS = False

if not BASE_SETTINGS:
raise Exception("Cannot import BaseSettings from pydantic or pydantic-settings")

from langport.core.dispatch import DispatchMethod
from langport.protocol.openai_api_protocol import ErrorResponse
from langport.protocol.worker_protocol import WorkerAddressRequest, WorkerAddressResponse

import json

from typing import Generator, Optional, Union, Dict, List, Any

from fastapi.responses import StreamingResponse, JSONResponse
from starlette.middleware.base import BaseHTTPMiddleware, DispatchFunction
import httpx
import numpy as np
from pydantic import BaseSettings

from langport.constants import WORKER_API_TIMEOUT, ErrorCode
from langport.model.model_adapter import get_conversation_template
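The fallback chain is needed because pydantic v2 moved `BaseSettings` into the separate `pydantic-settings` package. Once either import succeeds, settings classes are declared as usual; the class below is illustrative, not taken from the source.

```python
# Illustrative settings class built on whichever BaseSettings import succeeded
# above; the field names here are examples, not the gateway's actual settings.
class AppSettings(BaseSettings):
    controller_address: str = "http://localhost:21001"
    api_keys: Optional[List[str]] = None

settings = AppSettings()  # values may also be overridden via environment variables
```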
1 change: 1 addition & 0 deletions langport/service/server/generation_worker.py
@@ -63,6 +63,7 @@
cpu_offloading=args.cpu_offloading,
deepspeed=args.deepspeed,
gptq=args.gptq,
group_size=args.group_size,
trust_remote_code=args.trust_remote_code,
offload_folder=args.offload_folder,
)
38 changes: 35 additions & 3 deletions langport/tests/test_conversation.py
@@ -3,12 +3,14 @@

from langport.data.conversation.conversation_settings import ConversationHistory
from langport.data.conversation.settings.baichuan import baichuan
from langport.data.conversation.settings.chatglm import chatglm
from langport.data.conversation.settings.chatgpt import chatgpt
from langport.data.conversation.settings.openbuddy import openbuddy
from langport.data.conversation.settings.qwen import qwen

class TestBaiChuanMethods(unittest.TestCase):

def test_upper(self):
def test_conv(self):
history = ConversationHistory(
"SYSTEM_MESSAGE",
messages=[
@@ -20,9 +22,39 @@ def test_upper(self):
)
self.assertEqual(history.get_prompt(), "SYSTEM_MESSAGE <reserved_102> aaa <reserved_103> bbb</s>")


class TestChatGLMMethods(unittest.TestCase):

def test_conv(self):
history = ConversationHistory(
"SYSTEM_MESSAGE",
messages=[
(chatglm.roles[0], "aaa"),
(chatglm.roles[1], "bbb"),
],
offset=0,
settings=chatglm
)
self.assertEqual(history.get_prompt(), "SYSTEM_MESSAGE\n\n[Round 1]\n\n问:aaa\n\n答:bbb\n\n")

class TestChatGPTMethods(unittest.TestCase):

def test_conv(self):
history = ConversationHistory(
"SYSTEM_MESSAGE",
messages=[
(chatgpt.roles[0], "aaa"),
(chatgpt.roles[1], "bbb"),
],
offset=0,
settings=chatgpt
)
self.assertEqual(history.get_prompt(), "SYSTEM_MESSAGE\n### user: aaa\n### assistant: bbb\n### ")


class TestOpenbuddyMethods(unittest.TestCase):

def test_upper(self):
def test_conv(self):
history = ConversationHistory(
"SYSTEM_MESSAGE",
messages=[
@@ -40,7 +72,7 @@ def test_upper(self):

class TestQwenMethods(unittest.TestCase):

def test_upper(self):
def test_conv(self):
history = ConversationHistory(
"SYSTEM_MESSAGE",
messages=[
2 changes: 1 addition & 1 deletion langport/version.py
@@ -1 +1 @@
LANGPORT_VERSION = "0.3.3"
LANGPORT_VERSION = "0.3.7"