Skip to content

Commit

Permalink
[CI/Build] Add shell script linting using shellcheck (vllm-project#7925)
Browse files (browse the repository at this point in the history)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
  • Loading branch information
russellb authored and Isotr0py committed Nov 8, 2024
1 parent 98ac0ed commit 72fe6ba
Show file tree
Hide file tree
Showing 28 changed files with 204 additions and 129 deletions.
6 changes: 3 additions & 3 deletions .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do
done

lm_eval --model hf \
--model_args pretrained=$MODEL,parallelize=True \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE
--model_args "pretrained=$MODEL,parallelize=True" \
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size "$BATCH_SIZE"
6 changes: 3 additions & 3 deletions .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done

lm_eval --model vllm \
--model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size "$BATCH_SIZE"
2 changes: 1 addition & 1 deletion .buildkite/lm-eval-harness/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do
done

# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
Expand Down
63 changes: 25 additions & 38 deletions .buildkite/nightly-benchmarks/scripts/launch-server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,58 +50,54 @@ launch_trt_server() {
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
git checkout $trt_llm_version
tensorrtllm_backend_dir=$(pwd)
git checkout "$trt_llm_version"
git submodule update --init --recursive

# build trtllm engine
cd /tensorrtllm_backend
cd ./tensorrt_llm/examples/${model_type}
cd "./tensorrt_llm/examples/${model_type}"
python3 convert_checkpoint.py \
--model_dir ${model_path} \
--dtype ${model_dtype} \
--tp_size ${model_tp_size} \
--output_dir ${trt_model_path}
--model_dir "${model_path}" \
--dtype "${model_dtype}" \
--tp_size "${model_tp_size}" \
--output_dir "${trt_model_path}"
trtllm-build \
--checkpoint_dir ${trt_model_path} \
--checkpoint_dir "${trt_model_path}" \
--use_fused_mlp \
--reduce_fusion disable \
--workers 8 \
--gpt_attention_plugin ${model_dtype} \
--gemm_plugin ${model_dtype} \
--tp_size ${model_tp_size} \
--max_batch_size ${max_batch_size} \
--max_input_len ${max_input_len} \
--max_seq_len ${max_seq_len} \
--max_num_tokens ${max_num_tokens} \
--output_dir ${trt_engine_path}
--gpt_attention_plugin "${model_dtype}" \
--gemm_plugin "${model_dtype}" \
--tp_size "${model_tp_size}" \
--max_batch_size "${max_batch_size}" \
--max_input_len "${max_input_len}" \
--max_seq_len "${max_seq_len}" \
--max_num_tokens "${max_num_tokens}" \
--output_dir "${trt_engine_path}"

# handle triton protobuf files and launch triton server
cd /tensorrtllm_backend
mkdir triton_model_repo
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
cd triton_model_repo
rm -rf ./tensorrt_llm/1/*
cp -r ${trt_engine_path}/* ./tensorrt_llm/1
cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1
python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
--world_size=${model_tp_size} \
--world_size="${model_tp_size}" \
--model_repo=/tensorrtllm_backend/triton_model_repo &

}

launch_tgi_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")

if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
Expand Down Expand Up @@ -129,10 +125,7 @@ launch_tgi_server() {
launch_lmdeploy_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")

server_command="lmdeploy serve api_server $model \
Expand All @@ -149,10 +142,7 @@ launch_sglang_server() {

model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")

if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
Expand Down Expand Up @@ -185,10 +175,7 @@ launch_vllm_server() {

model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")

if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
Expand Down Expand Up @@ -217,19 +204,19 @@ launch_vllm_server() {

main() {

if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
launch_trt_server
fi

if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
launch_tgi_server
fi

if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
launch_lmdeploy_server
fi

if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
launch_sglang_server
fi

Expand Down
12 changes: 6 additions & 6 deletions .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ main() {
fi

# initial annotation
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
#description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"

# download results
cd $VLLM_SOURCE_CODE_LOC/benchmarks
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
mkdir -p results/
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
ls
Expand All @@ -30,15 +30,15 @@ main() {
/workspace/buildkite-agent artifact upload "results.zip"

# upload benchmarking scripts
cd $VLLM_SOURCE_CODE_LOC/
cd "$VLLM_SOURCE_CODE_LOC/"
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"

cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
# upload benchmarking pipeline
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"

cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md


Expand Down Expand Up @@ -75,4 +75,4 @@ main() {
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}

main "$@"
main "$@"
30 changes: 14 additions & 16 deletions .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
echo "GPU type is $gpu_type"
}

Expand Down Expand Up @@ -102,7 +102,7 @@ kill_gpu_processes() {
pkill -f text-generation
pkill -f lmdeploy

while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
}
Expand All @@ -119,8 +119,8 @@ wait_for_server() {
ensure_installed() {
# Ensure that the given command is installed by apt-get
local cmd=$1
if ! which $cmd >/dev/null; then
apt-get update && apt-get install -y $cmd
if ! which "$cmd" >/dev/null; then
apt-get update && apt-get install -y "$cmd"
fi
}

Expand Down Expand Up @@ -173,13 +173,11 @@ run_serving_tests() {
echo "Reuse previous server for test case $test_name"
else
kill_gpu_processes
bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
"$server_params" "$common_params"
fi

wait_for_server

if [ $? -eq 0 ]; then
if wait_for_server; then
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else
Expand All @@ -190,13 +188,13 @@ run_serving_tests() {

# prepare tokenizer
# this is required for lmdeploy.
cd $VLLM_SOURCE_CODE_LOC/benchmarks
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"


# change model name for lmdeploy (it will not follow standard hf name)
Expand Down Expand Up @@ -307,11 +305,11 @@ run_serving_tests() {
prepare_dataset() {

# download sharegpt dataset
cd $VLLM_SOURCE_CODE_LOC/benchmarks
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# duplicate sonnet by 4x, to allow benchmarking with input length 2048
cd $VLLM_SOURCE_CODE_LOC/benchmarks
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
echo "" > sonnet_4x.txt
for _ in {1..4}
do
Expand Down Expand Up @@ -339,17 +337,17 @@ main() {

prepare_dataset

cd $VLLM_SOURCE_CODE_LOC/benchmarks
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"

# run the test
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

# upload benchmark results to buildkite
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
upload_to_buildkite

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
echo "GPU type is $gpu_type"
}

Expand Down Expand Up @@ -93,7 +93,7 @@ kill_gpu_processes() {


# wait until GPU memory usage smaller than 1GB
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done

Expand All @@ -117,7 +117,7 @@ upload_to_buildkite() {
fi

# Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}

Expand Down Expand Up @@ -150,7 +150,7 @@ run_latency_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi

Expand Down Expand Up @@ -206,9 +206,9 @@ run_throughput_tests() {
throughput_args=$(json2args "$throughput_params")

# check if there is enough GPU to run the test
tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi

Expand Down Expand Up @@ -270,15 +270,15 @@ run_serving_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi

# check if server model and client model is aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $testname."
echo "Server model and client model must be the same. Skip testcase $test_name."
continue
fi

Expand All @@ -293,8 +293,7 @@ run_serving_tests() {
server_pid=$!

# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
if wait_for_server; then
echo ""
echo "vllm server is up and running."
else
Expand Down
4 changes: 2 additions & 2 deletions .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ TIMEOUT_SECONDS=10

retries=0
while [ $retries -lt 1000 ]; do
if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
exit 0
fi

Expand All @@ -16,4 +16,4 @@ while [ $retries -lt 1000 ]; do
sleep 5
done

exit 1
exit 1
Loading

0 comments on commit 72fe6ba

Please sign in to comment.