diff --git a/README.md b/README.md
index 71efb1d34..6331799da 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,16 @@ LLama 2, Vicuna, Alpaca, Baize, ChatGLM, Dolly, Falcon, FastChat-T5, GPT4ALL, Gu
 
 See a complete list of supported models and instructions to add a new model [here](docs/model_support.md).
 
+#### Use models from ModelScope
+You can use models from www.modelscope.cn by setting the environment variable FASTCHAT_USE_MODELSCOPE.
+```
+export FASTCHAT_USE_MODELSCOPE=True
+```
+Example:
+```
+FASTCHAT_USE_MODELSCOPE=True python3 -m fastchat.serve.cli --model-path qwen/Qwen-7B-Chat --revision v1.1.9
+```
+
 #### Single GPU
 The command below requires around 14GB of GPU memory for Vicuna-7B and 28GB of GPU memory for Vicuna-13B.
 See the ["Not Enough Memory" section](#not-enough-memory) below if you do not have enough memory.
diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py
index c8740543a..a4afabb14 100644
--- a/fastchat/model/model_adapter.py
+++ b/fastchat/model/model_adapter.py
@@ -319,6 +319,19 @@ def load_model(
     if dtype is not None:  # Overwrite dtype if it is provided in the arguments.
         kwargs["torch_dtype"] = dtype
 
+    if os.environ.get("FASTCHAT_USE_MODELSCOPE", "False").lower() == "true":
+        # Download the model from the ModelScope hub.
+        # Lazy import so that modelscope is not required for normal use.
+        try:
+            from modelscope.hub.snapshot_download import snapshot_download
+
+            model_path = snapshot_download(model_id=model_path, revision=revision)
+        except ImportError as e:
+            warnings.warn(
+                "Using models from www.modelscope.cn requires `pip install modelscope`."
+            )
+            raise e
+
     # Load model
     model, tokenizer = adapter.load_model(model_path, kwargs)
 
diff --git a/fastchat/serve/model_worker.py b/fastchat/serve/model_worker.py
index 5e84a4262..fa43acf9a 100644
--- a/fastchat/serve/model_worker.py
+++ b/fastchat/serve/model_worker.py
@@ -49,6 +49,7 @@ def __init__(
         device: str,
         num_gpus: int,
         max_gpu_memory: str,
+        revision: str = None,
         dtype: Optional[torch.dtype] = None,
         load_8bit: bool = False,
         cpu_offloading: bool = False,
@@ -76,6 +77,7 @@ def __init__(
         logger.info(f"Loading the model {self.model_names} on worker {worker_id} ...")
         self.model, self.tokenizer = load_model(
             model_path,
+            revision=revision,
             device=device,
             num_gpus=num_gpus,
             max_gpu_memory=max_gpu_memory,
@@ -345,6 +347,7 @@ def create_model_worker():
         args.model_path,
         args.model_names,
         args.limit_worker_concurrency,
+        revision=args.revision,
         no_register=args.no_register,
         device=args.device,
         num_gpus=args.num_gpus,
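
For reference, below is a minimal standalone sketch of the resolution logic that the `load_model` hunk above adds. The helper name `resolve_model_path` is hypothetical and not part of FastChat; the only ModelScope API it relies on is the `snapshot_download` call already shown in the diff.

```python
import os
import warnings
from typing import Optional


def resolve_model_path(model_path: str, revision: Optional[str] = None) -> str:
    """Hypothetical helper mirroring the load_model change above: return a local
    path for model_path, downloading it from ModelScope when requested."""
    if os.environ.get("FASTCHAT_USE_MODELSCOPE", "False").lower() == "true":
        try:
            # Lazy import so that modelscope stays an optional dependency.
            from modelscope.hub.snapshot_download import snapshot_download

            # Download (or reuse the cached copy of) the requested revision and
            # return the local snapshot directory.
            return snapshot_download(model_id=model_path, revision=revision)
        except ImportError as e:
            warnings.warn(
                "Using models from www.modelscope.cn requires `pip install modelscope`."
            )
            raise e
    # Without FASTCHAT_USE_MODELSCOPE, the path is passed through unchanged
    # (a local directory or a Hugging Face repo id).
    return model_path


if __name__ == "__main__":
    # Matches the README example above: qwen/Qwen-7B-Chat at revision v1.1.9.
    print(resolve_model_path("qwen/Qwen-7B-Chat", revision="v1.1.9"))
```

The lazy import keeps `modelscope` an optional dependency: users who never set `FASTCHAT_USE_MODELSCOPE` do not need it installed.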