"""

from ray import serve
-from ray.serve.llm import LLMConfig, LLMRouter, LLMServer
+from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
-    model_loading_config=dict(
-        model_id="deepseek",
-        # Change to model download path
-        model_source="/path/to/the/model",
-    ),
-    deployment_config=dict(autoscaling_config=dict(
-        min_replicas=1,
-        max_replicas=1,
-    )),
+    model_loading_config={
+        "model_id": "deepseek",
+        # Since the DeepSeek model is huge, it's recommended to pre-download
+        # it to local disk, say /path/to/the/model (see the sketch after
+        # this diff), and then specify:
+        # model_source="/path/to/the/model"
+        "model_source": "deepseek-ai/DeepSeek-R1",
+    },
+    deployment_config={
+        "autoscaling_config": {
+            "min_replicas": 1,
+            "max_replicas": 1,
+        }
+    },
    # Change to the accelerator type of the node
    accelerator_type="H100",
-    runtime_env=dict(env_vars=dict(VLLM_USE_V1="1")),
+    runtime_env={"env_vars": {
+        "VLLM_USE_V1": "1"
+    }},
    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
-    engine_kwargs=dict(
-        tensor_parallel_size=8,
-        pipeline_parallel_size=2,
-        gpu_memory_utilization=0.92,
-        dtype="auto",
-        max_num_seqs=40,
-        max_model_len=16384,
-        enable_chunked_prefill=True,
-        enable_prefix_caching=True,
-        trust_remote_code=True,
-    ),
+    engine_kwargs={
+        "tensor_parallel_size": 8,
+        "pipeline_parallel_size": 2,
+        "gpu_memory_utilization": 0.92,
+        "dtype": "auto",
+        "max_num_seqs": 40,
+        "max_model_len": 16384,
+        "enable_chunked_prefill": True,
+        "enable_prefix_caching": True,
+        "trust_remote_code": True,
+    },
)

# Deploy the application
-deployment = LLMServer.as_deployment(
-    llm_config.get_serve_options(name_prefix="vLLM:")).bind(llm_config)
-llm_app = LLMRouter.as_deployment().bind([deployment])
+llm_app = build_openai_app({"llm_configs": [llm_config]})
serve.run(llm_app)
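
To serve from local disk as the comment above suggests, the weights can be fetched ahead of time. A minimal pre-download sketch, assuming the huggingface_hub package is installed; the target directory is the same placeholder path used above:

from huggingface_hub import snapshot_download

# One-time download of the DeepSeek-R1 weights to local disk;
# afterwards, point model_source at this directory (a placeholder path).
snapshot_download(
    repo_id="deepseek-ai/DeepSeek-R1",
    local_dir="/path/to/the/model",
)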
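
Once serve.run(llm_app) is up, build_openai_app exposes an OpenAI-compatible HTTP API. A minimal query sketch, assuming Serve's default address of localhost:8000 with the /v1 route prefix and no API-key validation:

from openai import OpenAI

# Point the client at the local Serve endpoint; the key is a dummy value
# since the example app doesn't validate it.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

response = client.chat.completions.create(
    model="deepseek",  # matches model_id in model_loading_config
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)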