
Commit

add max_serving_time
mchen644 committed Oct 11, 2024
1 parent d1eb746 commit 7997a74
Showing 3 changed files with 56 additions and 90 deletions.
20 changes: 10 additions & 10 deletions benchmarks/1_serving_benchmark.sh
@@ -25,8 +25,8 @@ result_dir="/root/v1/vllm/benchmarks/result"
# scheduler_policy=(infer)
# swap_policies=(partial)
declare -a scheduler_swap_policies
scheduler_swap_policies[0]="tfittradeoff partial"
# scheduler_swap_policies[1]="fcfs full"
# scheduler_swap_policies[0]="tfittradeoff partial"
scheduler_swap_policies[1]="fcfs full"
# scheduler_swap_policies[2]="las full"
# scheduler_swap_policies[1]="tfittradeoff full"
# scheduler_swap_policies[2]="sjf full"
@@ -37,19 +37,19 @@ scheduler_swap_policies[0]="tfittradeoff partial"

preemption_mode="swap"
gpu_memory_utilization=0.7 # 0.5, 0.7, 0.9
max_num_seqs=384
max_num_seqs=384
swap_space=64
max_tokens=2048
iter_theshold=15

max_serving_time=500
# request_rates[0]=0.5
# request_rates[1]=1.0
request_rates[2]=2.0
# request_rates[2]=2.0
# request_rates[3]=5.0
# request_rates[4]=10.0
# request_rates[5]=20.0
request_rates[5]=20.0
# request_rates[5]=30.0
request_rates[5]=50.0
# request_rates[5]=50.0
# request_rates[5]=100.0

# request_rates=(2.0)
@@ -66,16 +66,16 @@ for i in {0..0}; do
swap_policy=${element[1]}
# tmux new-session -s "api_server" -d bash start_server.sh $gpu_devices $model_name $swap_space $preemption_mode $policy $max_tokens $iter_theshold $max_num_seqs $swap_policy $swap_out_partial_rate $gpu_memory_utilization $waiting_iter

CUDA_VISIBLE_DEVICES=$gpu_devices taskset -c 20-21 python3 -m vllm.entrypoints.openai.api_server \
CUDA_VISIBLE_DEVICES=$gpu_devices taskset -c 23-24 python3 -m vllm.entrypoints.openai.api_server \
--model $model_name --swap-space $swap_space --preemption-mode $preemption_mode --scheduler-policy $policy \
--enable-chunked-prefill --max-num-batched-tokens $max_tokens --iter-threshold $iter_theshold --max-num-seqs $max_num_seqs --swap-out-tokens-policy $swap_policy --swap-out-partial-rate $swap_out_partial_rate --execution-budget $iter_theshold \
--tensor-parallel-size 1 --gpu-memory-utilization $gpu_memory_utilization --disable-sliding-window --waiting-iter-base $waiting_iter --disable-log-requests >api_server_${policy}_${swap_policy}.log 2>&1 &
--tensor-parallel-size 1 --gpu-memory-utilization $gpu_memory_utilization --disable-sliding-window --waiting-iter-base $waiting_iter --disable-log-requests --max-serving-time $max_serving_time >api_server_${policy}_${swap_policy}.log 2>&1 &
pid=$!

# run benchmark and save the output to benchmark.log
python3 benchmark_serving.py --execution-counter $COUNTER --dataset-path $dataset_path \
--dataset-name $dataset_name --request-rate $request_rate \
--num-prompts 500 --request-duration 500 --sharegpt-output-len 2000 --model $model_name --scheduler-policy $policy \
--num-prompts 500 --request-duration $max_serving_time --sharegpt-output-len 2000 --model $model_name --scheduler-policy $policy \
--save-result --result-dir $result_dir \
--metadata swap_space=$swap_space preemption_mode=$preemption_mode \
scheduler_policy=$policy gpu_memory_utilization=$gpu_memory_utilization\
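
With this change, the benchmark client reuses the same $max_serving_time value for --request-duration, so the load generator stops on the same wall-clock budget that is passed to the server via --max-serving-time. As a rough illustration only (not the actual benchmark_serving.py code; send_request is a hypothetical stub standing in for the HTTP call to the OpenAI-compatible server), a duration-bounded request loop can look like this:

# Minimal sketch of a client loop honoring a request-duration budget such as
# max_serving_time=500. Assumptions: fixed-interval pacing and a dummy request.
import asyncio
import time


async def send_request(prompt: str) -> str:
    # Placeholder for an actual HTTP call to the serving endpoint.
    await asyncio.sleep(0.1)
    return f"response for {prompt!r}"


async def run_benchmark(prompts, request_rate: float, request_duration: float):
    start = time.monotonic()
    tasks = []
    for prompt in prompts:
        # Stop issuing new requests once the duration budget is exhausted.
        if time.monotonic() - start >= request_duration:
            break
        tasks.append(asyncio.create_task(send_request(prompt)))
        # Fixed-interval pacing keeps the sketch simple; real benchmarks often
        # draw inter-arrival times from a Poisson process.
        await asyncio.sleep(1.0 / request_rate)
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    results = asyncio.run(
        run_benchmark([f"p{i}" for i in range(10)],
                      request_rate=2.0, request_duration=5.0))
    print(len(results), "requests completed")

Tying --request-duration to $max_serving_time keeps the client and server budgets in sync, so a single variable controls how long each benchmark run is allowed to last.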
107 changes: 35 additions & 72 deletions benchmarks/result/analysis/result_analysis_1.ipynb

Large diffs are not rendered by default.

19 changes: 11 additions & 8 deletions vllm/engine/arg_utils.py
@@ -557,14 +557,16 @@ def add_cli_args(
'before the engine preempts it. The engine will preempt the sequence if the execution budget is exceeded.'
'Also the execution budget is used to determine the maximum number of the waiting iterations before promoting into the running queue.'
)
# parser.add_argument(
# "--max-serving-time",
# type=int,
# default=EngineArgs.max_serving_time,
# help='The maximum serving time of a sequence in seconds. '
# 'If the sequence exceeds this time, the engine will '
# 'terminate.'
# )

parser.add_argument(
"--max-serving-time",
type=int,
default=EngineArgs.max_serving_time,
help='The maximum serving time of a sequence in seconds. '
'If the sequence exceeds this time, the engine will '
'terminate.'
)

parser.add_argument(
"--iter-threshold",
type=int,
@@ -864,6 +866,7 @@ def add_cli_args(parser: argparse.ArgumentParser,
help='Max number of prompt characters or prompt '
'ID numbers being printed in log.'
'\n\nDefault: Unlimited')
# parser.add_argument("--max-serving-time", type=int, default=600, help="Max serving time in seconds")
return parser


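
The un-commented block above registers --max-serving-time on the engine's CLI parser with default EngineArgs.max_serving_time. The sketch below illustrates the general argparse-to-dataclass flow and one plausible per-sequence elapsed-time check; EngineArgs here is a simplified stand-in, the 600-second default and the expired() helper are assumptions, and none of this is vLLM's actual enforcement logic:

# Illustrative sketch only -- not vLLM's implementation. Shows how a
# --max-serving-time flag can flow from argparse into an engine-style
# dataclass and how an elapsed-time budget might be checked per sequence.
import argparse
import time
from dataclasses import dataclass, fields


@dataclass
class EngineArgs:
    max_serving_time: int = 600  # seconds; default value is an assumption

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser.add_argument(
            "--max-serving-time",
            type=int,
            default=EngineArgs.max_serving_time,
            help="Maximum serving time of a sequence in seconds.",
        )
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
        # Copy only the attributes that correspond to dataclass fields.
        attrs = {f.name: getattr(args, f.name) for f in fields(cls)}
        return cls(**attrs)


def expired(arrival_time: float, max_serving_time: int) -> bool:
    """True if a sequence has been in the system longer than its budget."""
    return time.monotonic() - arrival_time > max_serving_time


if __name__ == "__main__":
    parser = EngineArgs.add_cli_args(argparse.ArgumentParser())
    engine_args = EngineArgs.from_cli_args(
        parser.parse_args(["--max-serving-time", "500"]))
    print(engine_args.max_serving_time)                                  # 500
    print(expired(time.monotonic() - 501, engine_args.max_serving_time))  # True

Presumably the real engine performs such a check inside its scheduling loop and aborts the offending sequence, which is what the flag's help text describes ("the engine will terminate").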
