@@ -2,8 +2,8 @@
 import dataclasses
 import json
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
-                    Type, Union)
+from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional,
+                    Tuple, Type, Union)

 import torch

@@ -177,6 +177,7 @@ class EngineArgs:
     disable_async_output_proc: bool = False
     override_neuron_config: Optional[Dict[str, Any]] = None
     mm_processor_kwargs: Optional[Dict[str, Any]] = None
+    scheduling_policy: Literal["fcfs", "priority"] = "fcfs"

     def __post_init__(self):
         if self.tokenizer is None:
@@ -797,6 +798,16 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=None,
             help="override or set neuron device configuration.")

+        parser.add_argument(
+            '--scheduling-policy',
+            choices=['fcfs', 'priority'],
+            default="fcfs",
+            help='The scheduling policy to use. "fcfs" (first come first '
+            'served, i.e. requests are handled in order of arrival; '
+            'default) or "priority" (requests are handled based on a '
+            'given priority, with lower values handled earlier and '
+            'ties broken by arrival time).')
+
         return parser

     @classmethod
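The help text above fixes the ordering contract for the new policy: requests sort first by a user-assigned priority (lower handled earlier), then by arrival time to break ties. A minimal sketch of that ordering, using an illustrative `Request` type rather than vLLM's actual scheduler structures:

```python
import heapq
from dataclasses import dataclass, field

@dataclass(order=True)
class Request:
    """Illustrative stand-in for a queued request (not vLLM's real type)."""
    priority: int        # lower value => handled earlier
    arrival_time: float  # breaks ties between equal priorities
    prompt: str = field(compare=False)

waiting = [
    Request(5, 0.5, "earliest, but low priority"),
    Request(1, 2.0, "late, but urgent"),
    Request(1, 1.0, "early and urgent"),
]

heapq.heapify(waiting)  # min-heap ordered by (priority, arrival_time)
while waiting:
    print(heapq.heappop(waiting).prompt)
# -> early and urgent / late, but urgent / earliest, but low priority
```

Under "fcfs", by contrast, `arrival_time` alone would decide the order.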
@@ -1011,6 +1022,7 @@ def create_engine_config(self) -> EngineConfig:
             multi_step_stream_outputs=self.multi_step_stream_outputs,
             send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                              and parallel_config.use_ray),
+            policy=self.scheduling_policy,
         )
         lora_config = LoRAConfig(
             max_lora_rank=self.max_lora_rank,