
Commit

add max_serving_time
mchen644 committed Oct 11, 2024
1 parent d1eb746 commit 7997a74
Showing 3 changed files with 56 additions and 90 deletions.
20 changes: 10 additions & 10 deletions benchmarks/1_serving_benchmark.sh
@@ -25,8 +25,8 @@ result_dir="/root/v1/vllm/benchmarks/result"
# scheduler_policy=(infer)
# swap_policies=(partial)
declare -a scheduler_swap_policies
scheduler_swap_policies[0]="tfittradeoff partial"
# scheduler_swap_policies[1]="fcfs full"
# scheduler_swap_policies[0]="tfittradeoff partial"
scheduler_swap_policies[1]="fcfs full"
# scheduler_swap_policies[2]="las full"
# scheduler_swap_policies[1]="tfittradeoff full"
# scheduler_swap_policies[2]="sjf full"
@@ -37,19 +37,19 @@ scheduler_swap_policies[0]="tfittradeoff partial"

preemption_mode="swap"
gpu_memory_utilization=0.7 # 0.5, 0.7, 0.9
max_num_seqs=384
max_num_seqs=384
swap_space=64
max_tokens=2048
iter_theshold=15

max_serving_time=500
# request_rates[0]=0.5
# request_rates[1]=1.0
request_rates[2]=2.0
# request_rates[2]=2.0
# request_rates[3]=5.0
# request_rates[4]=10.0
# request_rates[5]=20.0
request_rates[5]=20.0
# request_rates[5]=30.0
request_rates[5]=50.0
# request_rates[5]=50.0
# request_rates[5]=100.0

# request_rates=(2.0)
@@ -66,16 +66,16 @@ for i in {0..0}; do
swap_policy=${element[1]}
# tmux new-session -s "api_server" -d bash start_server.sh $gpu_devices $model_name $swap_space $preemption_mode $policy $max_tokens $iter_theshold $max_num_seqs $swap_policy $swap_out_partial_rate $gpu_memory_utilization $waiting_iter

CUDA_VISIBLE_DEVICES=$gpu_devices taskset -c 20-21 python3 -m vllm.entrypoints.openai.api_server \
CUDA_VISIBLE_DEVICES=$gpu_devices taskset -c 23-24 python3 -m vllm.entrypoints.openai.api_server \
--model $model_name --swap-space $swap_space --preemption-mode $preemption_mode --scheduler-policy $policy \
--enable-chunked-prefill --max-num-batched-tokens $max_tokens --iter-threshold $iter_theshold --max-num-seqs $max_num_seqs --swap-out-tokens-policy $swap_policy --swap-out-partial-rate $swap_out_partial_rate --execution-budget $iter_theshold \
--tensor-parallel-size 1 --gpu-memory-utilization $gpu_memory_utilization --disable-sliding-window --waiting-iter-base $waiting_iter --disable-log-requests >api_server_${policy}_${swap_policy}.log 2>&1 &
--tensor-parallel-size 1 --gpu-memory-utilization $gpu_memory_utilization --disable-sliding-window --waiting-iter-base $waiting_iter --disable-log-requests --max-serving-time $max_serving_time >api_server_${policy}_${swap_policy}.log 2>&1 &
pid=$!

# run benchmark and save the output to benchmark.log
python3 benchmark_serving.py --execution-counter $COUNTER --dataset-path $dataset_path \
--dataset-name $dataset_name --request-rate $request_rate \
--num-prompts 500 --request-duration 500 --sharegpt-output-len 2000 --model $model_name --scheduler-policy $policy \
--num-prompts 500 --request-duration $max_serving_time --sharegpt-output-len 2000 --model $model_name --scheduler-policy $policy \
--save-result --result-dir $result_dir \
--metadata swap_space=$swap_space preemption_mode=$preemption_mode \
scheduler_policy=$policy gpu_memory_utilization=$gpu_memory_utilization\
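
With this change, the benchmark client reuses the same $max_serving_time value for --request-duration, so the load generator stops on the same wall-clock budget that is passed to the server via --max-serving-time. As a rough illustration only (not the actual benchmark_serving.py code; send_request is a hypothetical stub standing in for the HTTP call to the OpenAI-compatible server), a duration-bounded request loop can look like this:

# Minimal sketch of a client loop honoring a request-duration budget such as
# max_serving_time=500. Assumptions: fixed-interval pacing and a dummy request.
import asyncio
import time


async def send_request(prompt: str) -> str:
    # Placeholder for an actual HTTP call to the serving endpoint.
    await asyncio.sleep(0.1)
    return f"response for {prompt!r}"


async def run_benchmark(prompts, request_rate: float, request_duration: float):
    start = time.monotonic()
    tasks = []
    for prompt in prompts:
        # Stop issuing new requests once the duration budget is exhausted.
        if time.monotonic() - start >= request_duration:
            break
        tasks.append(asyncio.create_task(send_request(prompt)))
        # Fixed-interval pacing keeps the sketch simple; real benchmarks often
        # draw inter-arrival times from a Poisson process.
        await asyncio.sleep(1.0 / request_rate)
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    results = asyncio.run(
        run_benchmark([f"p{i}" for i in range(10)],
                      request_rate=2.0, request_duration=5.0))
    print(len(results), "requests completed")

Tying --request-duration to $max_serving_time keeps the client and server budgets in sync, so a single variable controls how long each benchmark run is allowed to last.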
107 changes: 35 additions & 72 deletions benchmarks/result/analysis/result_analysis_1.ipynb

Large diffs are not rendered by default.

19 changes: 11 additions & 8 deletions vllm/engine/arg_utils.py
@@ -557,14 +557,16 @@ def add_cli_args(
'before the engine preempts it. The engine will preempt the sequence if the execution budget is exceeded.'
'Also the execution budget is used to determine the maximum number of the waiting iterations before promoting into the running queue.'
)
# parser.add_argument(
# "--max-serving-time",
# type=int,
# default=EngineArgs.max_serving_time,
# help='The maximum serving time of a sequence in seconds. '
# 'If the sequence exceeds this time, the engine will '
# 'terminate.'
# )

parser.add_argument(
"--max-serving-time",
type=int,
default=EngineArgs.max_serving_time,
help='The maximum serving time of a sequence in seconds. '
'If the sequence exceeds this time, the engine will '
'terminate.'
)

parser.add_argument(
"--iter-threshold",
type=int,
@@ -864,6 +866,7 @@ def add_cli_args(parser: argparse.ArgumentParser,
help='Max number of prompt characters or prompt '
'ID numbers being printed in log.'
'\n\nDefault: Unlimited')
# parser.add_argument("--max-serving-time", type=int, default=600, help="Max serving time in seconds")
return parser


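
The un-commented block above registers --max-serving-time on the engine's CLI parser with default EngineArgs.max_serving_time. The sketch below illustrates the general argparse-to-dataclass flow and one plausible per-sequence elapsed-time check; EngineArgs here is a simplified stand-in, the 600-second default and the expired() helper are assumptions, and none of this is vLLM's actual enforcement logic:

# Illustrative sketch only -- not vLLM's implementation. Shows how a
# --max-serving-time flag can flow from argparse into an engine-style
# dataclass and how an elapsed-time budget might be checked per sequence.
import argparse
import time
from dataclasses import dataclass, fields


@dataclass
class EngineArgs:
    max_serving_time: int = 600  # seconds; default value is an assumption

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser.add_argument(
            "--max-serving-time",
            type=int,
            default=EngineArgs.max_serving_time,
            help="Maximum serving time of a sequence in seconds.",
        )
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
        # Copy only the attributes that correspond to dataclass fields.
        attrs = {f.name: getattr(args, f.name) for f in fields(cls)}
        return cls(**attrs)


def expired(arrival_time: float, max_serving_time: int) -> bool:
    """True if a sequence has been in the system longer than its budget."""
    return time.monotonic() - arrival_time > max_serving_time


if __name__ == "__main__":
    parser = EngineArgs.add_cli_args(argparse.ArgumentParser())
    engine_args = EngineArgs.from_cli_args(
        parser.parse_args(["--max-serving-time", "500"]))
    print(engine_args.max_serving_time)                                  # 500
    print(expired(time.monotonic() - 501, engine_args.max_serving_time))  # True

Presumably the real engine performs such a check inside its scheduling loop and aborts the offending sequence, which is what the flag's help text describes ("the engine will terminate").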
