From e7afbe4c36c272d230cef5f8665ec02a1d1f381b Mon Sep 17 00:00:00 2001
From: KexinFeng
Date: Tue, 3 Oct 2023 09:12:13 -0700
Subject: [PATCH] [fix] Gptq dependency (#1137)

Co-authored-by: KexinFeng
---
 .../rolling_batch/vllm_rolling_batch.py          | 17 ++++++++---------
 .../setup/djl_python/tests/test_scheduler_bloom.py | 10 ++++++----
 .../setup/djl_python/transformers_neuronx.py       |  1 -
 serving/docker/deepspeed.Dockerfile                |  5 ++++-
 serving/docker/fastertransformer.Dockerfile        |  4 +++-
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/engines/python/setup/djl_python/rolling_batch/vllm_rolling_batch.py b/engines/python/setup/djl_python/rolling_batch/vllm_rolling_batch.py
index 9c6b6fd6b..1903e2246 100644
--- a/engines/python/setup/djl_python/rolling_batch/vllm_rolling_batch.py
+++ b/engines/python/setup/djl_python/rolling_batch/vllm_rolling_batch.py
@@ -44,15 +44,14 @@ def __init__(self, model_id_or_path, device, properties, **kwargs):
             properties.get("max_rolling_batch_prefill_tokens"))
         tensor_parallel_degree = int(
             properties.get("tensor_parallel_degree", None))
-        args = EngineArgs(
-            model=model_id_or_path,
-            tensor_parallel_size=tensor_parallel_degree,
-            dtype=DTYPE_MAPPER[self.dtype],
-            seed=0,
-            max_num_batched_tokens=max_batched_prefill_tokens,
-            trust_remote_code=kwargs.get("trust_remote_code", False),
-            quantization=properties.get("quantize", None)
-        )
+        args = EngineArgs(model=model_id_or_path,
+                          tensor_parallel_size=tensor_parallel_degree,
+                          dtype=DTYPE_MAPPER[self.dtype],
+                          seed=0,
+                          max_num_batched_tokens=max_batched_prefill_tokens,
+                          trust_remote_code=kwargs.get("trust_remote_code",
+                                                       False),
+                          quantization=properties.get("quantize", None))
         self.engine = LLMEngine.from_engine_args(args)
         self.request_cache = OrderedDict()
 
diff --git a/engines/python/setup/djl_python/tests/test_scheduler_bloom.py b/engines/python/setup/djl_python/tests/test_scheduler_bloom.py
index 941a1187c..a48e90de6 100644
--- a/engines/python/setup/djl_python/tests/test_scheduler_bloom.py
+++ b/engines/python/setup/djl_python/tests/test_scheduler_bloom.py
@@ -15,7 +15,8 @@ class TestSchedulerBloom(unittest.TestCase):
     def test_lm_block(self):
         model_id = "bigscience/bloom-560m"
         model = AutoModelForCausalLM.from_pretrained(
-            model_id, device_map="auto" if global_device == "cuda" else "cpu")
+            model_id,
+            device_map="auto" if global_device.type == "cuda" else "cpu")
         device = model.device
         tokenizer = AutoTokenizer.from_pretrained(model_id)
 
@@ -57,7 +58,8 @@ def test_lm_block(self):
     def test_contrastive_scheduler(self):
         model_id = "bigscience/bloom-560m"
         model = BloomForCausalLM.from_pretrained(
-            model_id, device_map="auto" if global_device == "cuda" else "cpu")
+            model_id,
+            device_map="auto" if global_device.type == "cuda" else "cpu")
         device = model.device
         tokenizer = AutoTokenizer.from_pretrained(model_id,
                                                   padding_side='left')
@@ -135,7 +137,7 @@ def test_greedy_scheduler_llama(self):
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             trust_remote_code=True,
-            device_map="auto" if global_device == "cuda" else "cpu")
+            device_map="auto" if global_device.type == "cuda" else "cpu")
         device = model.device
 
         lm_block = HuggingfaceBlock(model)
@@ -200,7 +202,7 @@ def test_greedy_scheduler_llama2_gptq(self):
             model_name,
             trust_remote_code=False,
             revision="main",
-            device_map="auto" if global_device == "cuda" else "cpu")
+            device_map="auto" if global_device.type == "cuda" else "cpu")
         device = model.device
 
         lm_block = HuggingfaceBlock(model)
diff --git a/engines/python/setup/djl_python/transformers_neuronx.py b/engines/python/setup/djl_python/transformers_neuronx.py
index 12caba30e..b26d39ae8 100644
--- a/engines/python/setup/djl_python/transformers_neuronx.py
+++ b/engines/python/setup/djl_python/transformers_neuronx.py
@@ -70,7 +70,6 @@ def __init__(self) -> None:
         self.rolling_batch = None
         self.load_in_8bit = False
 
-
     def init_load_path(self, model_type):
         path = os.environ.get("SERVING_DOWNLOAD_DIR")
         folder = f"inf2_{model_type}_{self.amp}"
diff --git a/serving/docker/deepspeed.Dockerfile b/serving/docker/deepspeed.Dockerfile
index f02cc9744..08e030f83 100644
--- a/serving/docker/deepspeed.Dockerfile
+++ b/serving/docker/deepspeed.Dockerfile
@@ -29,6 +29,8 @@ ARG transformers_version=4.33.2
 ARG accelerate_version=0.23.0
 ARG diffusers_version=0.16.0
 ARG bitsandbytes_version=0.41.1
+ARG optimum_version=1.13.2
+ARG auto_gptq_version=0.4.2
 
 EXPOSE 8080
 
@@ -69,7 +71,8 @@ RUN apt-get update && \
     ${deepspeed_wheel} ${flash_attn_wheel} ${dropout_layer_norm_wheel} ${rotary_emb_wheel} ${flash_attn_2_wheel} \
     ${vllm_wheel} ${lmi_dist_wheel} ${seq_scheduler_wheel} ${peft_wheel} protobuf==${protobuf_version} \
     transformers==${transformers_version} \
-    mpi4py sentencepiece einops accelerate==${accelerate_version} bitsandbytes==${bitsandbytes_version}\
+    mpi4py sentencepiece einops accelerate==${accelerate_version} bitsandbytes==${bitsandbytes_version} \
+    optimum==${optimum_version} auto-gptq==${auto_gptq_version} \
     diffusers[torch]==${diffusers_version} opencv-contrib-python-headless safetensors scipy && \
     scripts/install_aitemplate.sh && \
     scripts/patch_oss_dlc.sh python && \
diff --git a/serving/docker/fastertransformer.Dockerfile b/serving/docker/fastertransformer.Dockerfile
index 8c6e04f97..cb5364a51 100644
--- a/serving/docker/fastertransformer.Dockerfile
+++ b/serving/docker/fastertransformer.Dockerfile
@@ -25,6 +25,8 @@ ARG protobuf_version=3.20.3
 ARG transformers_version=4.33.2
 ARG accelerate_version=0.23.0
 ARG bitsandbytes_version=0.41.1
+ARG optimum_version=1.13.2
+ARG auto_gptq_version=0.4.2
 
 EXPOSE 8080
 
@@ -65,7 +67,7 @@ RUN apt-get update && apt-get install -y wget git libnuma-dev zlib1g-dev rapidjs
     cd ../../ && rm -rf ompi && \
     scripts/install_python.sh ${python_version} && \
     pip3 install ${torch_wheel} ${ft_wheel} ${tb_wheel} ${peft_wheel} ${seq_scheduler_wheel} safetensors protobuf==${protobuf_version} && \
-    pip3 install transformers==${transformers_version} accelerate==${accelerate_version} bitsandbytes==${bitsandbytes_version} \
+    pip3 install transformers==${transformers_version} accelerate==${accelerate_version} bitsandbytes==${bitsandbytes_version} optimum==${optimum_version} auto-gptq==${auto_gptq_version} \
     scipy einops && \
     pip3 install cmake sentencepiece bfloat16 tiktoken && \
     pip3 cache purge && \
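
Note on usage (illustrative, not part of the patch): the "quantize" option that
the GPTQ dependencies above support is forwarded to vLLM unchanged, as in
vllm_rolling_batch.py. Below is a minimal sketch of that wiring, assuming
vLLM's top-level EngineArgs/LLMEngine API; the model id and property values
are hypothetical, and whether quantization="gptq" is accepted depends on the
installed vLLM build:

    from vllm import EngineArgs, LLMEngine

    # Hypothetical serving properties; the keys mirror those read in
    # vllm_rolling_batch.py ("tensor_parallel_degree", "quantize", ...).
    properties = {
        "tensor_parallel_degree": "1",
        "max_rolling_batch_prefill_tokens": "2048",
        "quantize": "gptq",  # assumption: supported by the installed vLLM
    }

    args = EngineArgs(
        model="TheBloke/Llama-2-7B-Chat-GPTQ",  # hypothetical GPTQ model id
        tensor_parallel_size=int(properties["tensor_parallel_degree"]),
        dtype="float16",
        seed=0,
        max_num_batched_tokens=int(
            properties["max_rolling_batch_prefill_tokens"]),
        trust_remote_code=False,
        # The handler passes the raw "quantize" property straight through.
        quantization=properties.get("quantize"),
    )
    engine = LLMEngine.from_engine_args(args)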