fix swift deploy log error (repeat log) #2808

Merged: 6 commits, Dec 30, 2024
Changes from all commits
examples/deploy/client/llm/chat/swift_client.py (4 changes: 2 additions & 2 deletions)

@@ -38,8 +38,8 @@ def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
def run_client(host: str = '127.0.0.1', port: int = 8000):
engine = InferClient(host=host, port=port)
print(f'models: {engine.models}')
-
-dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], strict=False, seed=42)[0]
+# Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
+dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], seed=42)[0]
print(f'dataset: {dataset}')
infer_requests = [InferRequest(**data) for data in dataset]
infer_batch(engine, infer_requests)
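The comment added above is the point of this group of example changes: `load_dataset` is only a convenient way to produce sample inputs, while `infer_batch` (the helper defined earlier in the same example file) just consumes a list of `InferRequest` objects. A minimal sketch of calling the deployed server without building any dataset; the prompts below are made-up placeholders:

from swift.llm import InferClient, InferRequest

engine = InferClient(host='127.0.0.1', port=8000)
print(f'models: {engine.models}')

# Hand-built requests; no call to load_dataset is needed.
infer_requests = [
    InferRequest(messages=[{'role': 'user', 'content': 'Who are you?'}]),
    InferRequest(messages=[{'role': 'user', 'content': 'What is 1 + 1?'}]),
]
infer_batch(engine, infer_requests)  # same helper the example uses above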
examples/deploy/client/mllm/swift_client.py (3 changes: 2 additions & 1 deletion)

@@ -103,7 +103,8 @@ def get_data(mm_type: Literal['text', 'image', 'video', 'audio']):
def run_client(host: str = '127.0.0.1', port: int = 8000):
engine = InferClient(host=host, port=port)
print(f'models: {engine.models}')
-dataset = load_dataset(['AI-ModelScope/LaTeX_OCR:small#1000'], strict=False, seed=42)[0]
+# Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
+dataset = load_dataset(['AI-ModelScope/LaTeX_OCR:small#1000'], seed=42)[0]
print(f'dataset: {dataset}')
infer_requests = [InferRequest(**data) for data in dataset]
infer_batch(engine, infer_requests)
examples/infer/demo.py (3 changes: 2 additions & 1 deletion)

@@ -43,7 +43,8 @@ def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
from swift.llm import LmdeployEngine
engine = LmdeployEngine(model)

-dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], strict=False, seed=42)[0]
+# Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
+dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], seed=42)[0]
print(f'dataset: {dataset}')
infer_requests = [InferRequest(**data) for data in dataset]
infer_batch(engine, infer_requests)
examples/infer/demo_mllm.py (3 changes: 2 additions & 1 deletion)

@@ -119,7 +119,8 @@ def get_data(mm_type: Literal['text', 'image', 'video', 'audio']):
dataset = 'AI-ModelScope/LaTeX_OCR:small#1000'
engine = LmdeployEngine(model, vision_batch_size=8)

-dataset = load_dataset([dataset], strict=False, seed=42)[0]
+# Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
+dataset = load_dataset([dataset], seed=42)[0]
print(f'dataset: {dataset}')
infer_requests = [InferRequest(**data) for data in dataset]
infer_batch(engine, infer_requests)
requirements/install_all.sh (4 changes: 2 additions & 2 deletions)

@@ -1,7 +1,7 @@
# please use python=3.10, cuda12.*
# sh requirements/install_all.sh
-pip install vllm -U
-pip install lmdeploy -U --no-deps
+pip install "vllm>=0.5.1" -U
+pip install "lmdeploy>=0.5" -U --no-deps
pip install autoawq!=0.2.7.post3 -U --no-deps
pip install auto_gptq optimum bitsandbytes -U
pip install git+https://github.com/modelscope/ms-swift.git#egg=ms-swift[all]
swift/llm/argument/infer_args.py (1 change: 0 additions & 1 deletion)

@@ -112,7 +112,6 @@ class InferArguments(MergeArguments, VllmArguments, LmdeployArguments, BaseArgum
infer_backend: Literal['vllm', 'pt', 'lmdeploy'] = 'pt'

result_path: Optional[str] = None
-writer_buffer_size: int = 65536
# for pt engine
max_batch_size: int = 1
ddp_backend: Optional[str] = None
swift/llm/infer/deploy.py (7 changes: 5 additions & 2 deletions)

@@ -5,6 +5,7 @@
import time
from contextlib import contextmanager
from dataclasses import asdict
+from functools import partial
from http import HTTPStatus
from threading import Thread
from typing import List, Optional, Union

@@ -153,10 +154,12 @@ def pre_infer_hook(kwargs):
logger.info(request_info)
return kwargs

-self.infer_engine.pre_infer_hooks = [pre_infer_hook]
+infer_kwargs['pre_infer_hook'] = pre_infer_hook
try:
res_or_gen = await self.infer_async(infer_request, request_config, template=self.template, **infer_kwargs)
-except ValueError as e:
+except Exception as e:
+import traceback
+print(traceback.format_exc())
return self.create_error_response(HTTPStatus.BAD_REQUEST, str(e))
if request_config.stream:
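This deploy.py hunk is the core of the fix: the request-logging hook used to be written into engine-level state (`self.infer_engine.pre_infer_hooks`), so hooks lingered across requests and the same log line could be emitted repeatedly; now the hook is passed through `infer_kwargs` and lives only for the duration of one `infer_async` call. A toy sketch of the before/after behaviour, using a stand-in engine class rather than the real ms-swift engines:

def log_hook(kwargs):
    print(f'pre-infer kwargs: {kwargs}')  # should appear exactly once per request
    return kwargs


class ToyEngine:
    def infer(self, prompt, pre_infer_hook=None):
        kwargs = {'prompt': prompt}
        if pre_infer_hook:  # per-call hook, mirroring the new `if pre_infer_hook:` branches in the engines below
            kwargs = pre_infer_hook(kwargs)
        return f"echo: {kwargs['prompt']}"


engine = ToyEngine()
engine.infer('hello', pre_infer_hook=log_hook)  # logs once
engine.infer('world')  # no hook passed, so nothing is logged and no state leaks between calls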
swift/llm/infer/infer.py (3 changes: 1 addition & 2 deletions)

@@ -75,8 +75,7 @@ def get_infer_engine(args: InferArguments, **kwargs):

def main(self):
args = self.args
-context = open_jsonl_writer(
-args.result_path, buffer_size=args.writer_buffer_size) if args.result_path else nullcontext()
+context = open_jsonl_writer(args.result_path) if args.result_path else nullcontext()
with context as json_writer:
self.jsonl_writer = json_writer
return super().main()
swift/llm/infer/infer_engine/infer_engine.py (9 changes: 6 additions & 3 deletions)

@@ -30,7 +30,6 @@ def _post_init(self):
self.model_name = self.model_info.model_name
self.max_model_len = self.model_info.max_model_len
self.config = self.model_info.config
-self.pre_infer_hooks = []
if getattr(self, 'default_template', None) is None:
self.default_template = get_template(self.model_meta.template, self.processor)
self._adapters_pool = {}

@@ -60,7 +59,9 @@ async def _run_infer(i, task, queue, stream: bool = False):
queue.put((i, stream_response))
else:
queue.put((i, await task))
-finally:
+except Exception as e:
+queue.put((i, e))
+else:
queue.put((i, None))

async def _batch_run(tasks):

@@ -78,7 +79,9 @@ async def _batch_run(tasks):

while n_finished < len(new_tasks):
i, output = queue.get()
-if output is None:  # is_finished
+if isinstance(output, Exception):
+raise output
+elif output is None:  # is_finished
n_finished += 1
prog_bar.update()
else:
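The infer_engine.py hunks change how worker failures surface: instead of `finally: queue.put((i, None))`, which counted a failed task as finished so its error never reached the caller, the worker now puts the exception itself on the queue and the batch loop re-raises it. A standalone sketch of that pattern (simplified from the diff; the real `_batch_run` schedules tasks concurrently and drives a progress bar):

import asyncio
import queue


async def _run_infer(i, task, q):
    try:
        q.put((i, await task))
    except Exception as e:
        q.put((i, e))  # hand the failure to the consumer instead of swallowing it
    else:
        q.put((i, None))  # sentinel: task i finished cleanly


async def _batch_run(tasks):
    q = queue.Queue()
    outputs = [None] * len(tasks)
    await asyncio.gather(*[_run_infer(i, t, q) for i, t in enumerate(tasks)])
    n_finished = 0
    while n_finished < len(tasks):
        i, output = q.get()
        if isinstance(output, Exception):
            raise output  # surface the worker's error to the caller
        elif output is None:  # is_finished
            n_finished += 1
        else:
            outputs[i] = output
    return outputs


async def ok(x):
    return x * 2


async def boom():
    raise ValueError('bad request')


print(asyncio.run(_batch_run([ok(1), ok(2)])))  # [2, 4]
# asyncio.run(_batch_run([ok(1), boom()]))      # raises ValueError('bad request')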
swift/llm/infer/infer_engine/lmdeploy_engine.py (3 changes: 2 additions & 1 deletion)

@@ -255,6 +255,7 @@ async def infer_async(self,
request_config: Optional[RequestConfig] = None,
*,
template: Optional[Template] = None,
+pre_infer_hook=None,
**kwargs) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionStreamResponse]]:
request_config = deepcopy(request_config or RequestConfig())
if template is None:

@@ -275,7 +276,7 @@
generation_config = self._prepare_generation_config(request_config)
self._add_stop_words(generation_config, request_config, template.template_meta)
kwargs.update({'template': template, 'inputs': inputs, 'generation_config': generation_config})
-for pre_infer_hook in self.pre_infer_hooks:
+if pre_infer_hook:
kwargs = pre_infer_hook(kwargs)
if request_config.stream:
return self._infer_stream_async(**kwargs)
swift/llm/infer/infer_engine/pt_engine.py (14 changes: 8 additions & 6 deletions)

@@ -348,15 +348,16 @@ async def infer_async(
*,
template: Optional[Template] = None,
adapter_request: Optional[AdapterRequest] = None,
+pre_infer_hook=None,
) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionStreamResponse]]:
# TODO:auto batch
if request_config is None:
request_config = RequestConfig()
-res_or_gen = self.infer([infer_request],
-request_config,
-template=template,
-use_tqdm=False,
-adapter_request=adapter_request)
+res_or_gen = self._infer([infer_request],
+request_config,
+template=template,
+adapter_request=adapter_request,
+pre_infer_hook=pre_infer_hook)
if request_config.stream:

async def _gen_wrapper():

@@ -376,6 +377,7 @@ def _infer(
*,
template: Optional[Template] = None,
adapter_request: Optional[AdapterRequest] = None,
+pre_infer_hook=None,
) -> Union[List[ChatCompletionResponse], Iterator[List[Optional[ChatCompletionStreamResponse]]]]:
self.model.eval()
request_config = deepcopy(request_config)

@@ -414,7 +416,7 @@ def _infer(
'adapter_request': adapter_request,
'template_inputs': template_inputs
}
-for pre_infer_hook in self.pre_infer_hooks:
+if pre_infer_hook:
kwargs = pre_infer_hook(kwargs)
if request_config.stream:
swift/llm/infer/infer_engine/vllm_engine.py (26 changes: 10 additions & 16 deletions)

@@ -7,10 +7,8 @@
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Union

import torch
-import vllm
from packaging import version
from transformers import GenerationConfig, PreTrainedTokenizerBase
-from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams

from swift.llm import InferRequest, Template, TemplateMeta, get_model_tokenizer
from swift.plugin import Metric

@@ -21,6 +19,14 @@
from .patch import patch_auto_config, patch_auto_tokenizer
from .utils import AdapterRequest, InferStreamer

+try:
+# After setting the environment variables, import vllm. This way of writing allows lint to pass.
+os.environ['VLLM_ENGINE_ITERATION_TIMEOUT_S'] = '3600'
+import vllm
+from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
+except Exception:
+raise
+
logger = get_logger()
dtype_mapping = {torch.float16: 'float16', torch.bfloat16: 'bfloat16', torch.float32: 'float32'}

@@ -50,7 +56,6 @@ def __init__(
max_loras: int = 1,
max_lora_rank: int = 16,
engine_kwargs: Optional[Dict[str, Any]] = None) -> None:
-self._init_env()
self.processor = get_model_tokenizer(
model_id_or_path,
torch_dtype,

@@ -137,18 +142,6 @@ def _prepare_engine_kwargs(self,
if max_model_len is not None:
model_info.max_model_len = max_model_len

-@staticmethod
-def _init_env() -> None:
-try:
-from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel
-destroy_model_parallel()
-except ImportError:
-pass
-# fix HTTPError bug (use model_dir)
-os.environ.pop('VLLM_USE_MODELSCOPE', None)
-if version.parse(vllm.__version__) >= version.parse('0.5.1'):
-os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
-
def _fix_vllm_bug(self) -> None:
# fix vllm==0.4 bug (very slow)
tokenizer = self.tokenizer

@@ -364,6 +357,7 @@ async def infer_async(
*,
template: Optional[Template] = None,
adapter_request: Optional[AdapterRequest] = None,
+pre_infer_hook=None,
) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionStreamResponse]]:
request_config = deepcopy(request_config or RequestConfig())
if template is None:

@@ -381,7 +375,7 @@
'generation_config': generation_config,
'adapter_request': adapter_request
}
-for pre_infer_hook in self.pre_infer_hooks:
+if pre_infer_hook:
kwargs = pre_infer_hook(kwargs)
if request_config.stream:
return self._infer_stream_async(**kwargs)