4 changes: 2 additions & 2 deletions ci/L0_backend_vllm/vllm_backend/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -64,7 +64,7 @@ wget -P models/add_sub https://raw.githubusercontent.com/triton-inference-server

 # Invalid model attribute
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_1/
-sed -i 's/"disable_log_requests"/"invalid_attribute"/' models/vllm_invalid_1/1/model.json
+sed -i 's/"enforce_eager"/"invalid_attribute"/' models/vllm_invalid_1/1/model.json

 # Invalid model name
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_2/
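The negative test now mutates `enforce_eager` instead of `disable_log_requests`, which appears to track the removal of the latter from vLLM's engine arguments. As a sketch of the fixture this produces, assuming the sample `model.json` shown at the bottom of this diff:

```bash
# Sketch: resulting negative-test fixture after the sed edit, assuming the
# sample model.json from samples/model_repository/vllm_model/1/model.json.
# "invalid_attribute" is not a recognized vLLM engine argument, so the
# model is expected to fail to load.
cat models/vllm_invalid_1/1/model.json
# {
#     "model":"facebook/opt-125m",
#     "gpu_memory_utilization": 0.5,
#     "invalid_attribute": true
# }
```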
4 changes: 0 additions & 4 deletions ci/L0_multi_gpu_vllm/multi_lora/test.sh
@@ -110,7 +110,6 @@ export SERVER_ENABLE_LORA=true
 model_json=$(cat <<EOF
 {
 "model":"./weights/backbone/gemma-2b",
-"disable_log_requests": true,
 "gpu_memory_utilization": 0.7,
 "tensor_parallel_size": 2,
 "block_size": 16,
@@ -202,7 +201,6 @@ wait $SERVER_PID
 model_json=$(cat <<EOF
 {
 "model":"./weights/backbone/gemma-2b",
-"disable_log_requests": true,
 "gpu_memory_utilization": 0.7,
 "tensor_parallel_size": 2,
 "block_size": 16,
@@ -282,7 +280,6 @@ export SERVER_ENABLE_LORA=false
 model_json=$(cat <<EOF
 {
 "model":"./weights/backbone/gemma-2b",
-"disable_log_requests": true,
 "gpu_memory_utilization": 0.8,
 "tensor_parallel_size": 2,
 "block_size": 16,
@@ -344,7 +341,6 @@ export SERVER_ENABLE_LORA=false
 model_json=$(cat <<EOF
 {
 "model":"./weights/backbone/gemma-2b",
-"disable_log_requests": true,
 "gpu_memory_utilization": 0.8,
 "tensor_parallel_size": 2,
 "block_size": 16,
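After these four deletions the heredocs share the same shape. A representative sketch (the fields past `block_size` are elided in the diff, so only the visible ones appear here):

```bash
# Representative multi-LoRA model_json after this change; remaining fields
# are truncated in the diff and therefore omitted from this sketch.
model_json=$(cat <<EOF
{
    "model":"./weights/backbone/gemma-2b",
    "gpu_memory_utilization": 0.7,
    "tensor_parallel_size": 2,
    "block_size": 16
}
EOF
)
echo "$model_json"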
4 changes: 1 addition & 3 deletions docs/llama_multi_lora_tutorial.md
@@ -1,5 +1,5 @@
 <!--
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -146,7 +146,6 @@ For this tutorial we will use the following set of parameters, specified in the
 ```json
 {
 "model":"/vllm_workspace/weights/backbone/llama-7b-hf",
-"disable_log_requests": "true",
 "gpu_memory_utilization": 0.8,
 "tensor_parallel_size": 2,
 "block_size": 16,
@@ -157,7 +156,6 @@ For this tutorial we will use the following set of parameters, specified in the
 ```

 + `model`: The path to your model repository
-+ `disable_log_requests`: To show logs when launch vllm or not.
 + `gpu_memory_utilization`: The gpu memory allocated for the model weights and vllm *PagedAttention* kv cache manager.
 + `tensor_parallel_size`: The vllm now support the tensor paralism, so you can decide how many gpus you want to use for serving.
 + `block_size`: vLLM kv cache block size.
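The keys in `model.json` are passed through to vLLM's engine arguments, which is why a stale key such as `disable_log_requests` has to be dropped from both the tutorial and the samples (and why the invalid-attribute test above expects a load failure). A hedged way to check which keys your installed vLLM accepts, assuming `AsyncEngineArgs` is still a dataclass as in recent releases:

```bash
# List the engine-argument names the installed vLLM accepts; model.json keys
# should be a subset of these. (Assumes AsyncEngineArgs remains a dataclass.)
python3 -c "
import dataclasses
from vllm.engine.arg_utils import AsyncEngineArgs
print(sorted(f.name for f in dataclasses.fields(AsyncEngineArgs)))
"
```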
1 change: 0 additions & 1 deletion samples/model_repository/vllm_model/1/model.json
@@ -1,6 +1,5 @@
 {
 "model":"facebook/opt-125m",
-"disable_log_requests": true,
 "gpu_memory_utilization": 0.5,
 "enforce_eager": true
 }
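With the sample trimmed, a quick smoke test is to serve it directly. A sketch, assuming a Triton container with the vLLM backend and a working directory at the repository root:

```bash
# Launch Triton against the sample repository; facebook/opt-125m is small
# enough to verify that the trimmed model.json still loads cleanly.
tritonserver --model-repository=$(pwd)/samples/model_repository
```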