2 changes: 1 addition & 1 deletion vllm/engine/protocol.py
@@ -104,7 +104,7 @@
         ...
 
     @abstractmethod
-    async def start_profile(self) -> None:
+    async def start_profile(self, profile_options: dict[str, Any] | None = None) -> None:
Check failure on line 107 in vllm/engine/protocol.py (GitHub Actions / pre-commit, Ruff E501):
vllm/engine/protocol.py:107:89: E501 Line too long (89 > 88)
"""Start profiling the engine"""
...
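One way the flagged signature could be wrapped to stay under Ruff's 88-character limit; a minimal sketch only, not part of the PR, with the surrounding class name assumed for illustration:

```python
from abc import ABC, abstractmethod
from typing import Any


class EngineClient(ABC):  # class name assumed for illustration
    @abstractmethod
    async def start_profile(
        self, profile_options: dict[str, Any] | None = None
    ) -> None:
        """Start profiling the engine"""
        ...
```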

4 changes: 2 additions & 2 deletions vllm/entrypoints/llm.py
@@ -1484,8 +1484,8 @@ def ensure_str(prompt: SingletonPrompt):
             lora_request,
         )
 
-    def start_profile(self) -> None:
-        self.llm_engine.start_profile()
+    def start_profile(self, profile_options: dict[str, Any] | None = None) -> None:
+        self.llm_engine.start_profile(profile_options=profile_options)
 
     def stop_profile(self) -> None:
         self.llm_engine.stop_profile()
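For offline use, the extended `LLM.start_profile` could be driven roughly as below. This is a sketch only; the option key shown is an illustrative assumption, since the PR does not document which keys the profiler backend accepts, and a profiler must already be enabled via the environment (e.g. VLLM_TORCH_PROFILER_DIR or USE_PROTON).

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m")  # any small model works for a smoke test

# profile_options is forwarded unchanged to the engine; the key here is a
# hypothetical placeholder for whatever the configured profiler understands.
llm.start_profile(profile_options={"name": "decode_run"})
outputs = llm.generate(["Hello, my name is"])
llm.stop_profile()

for out in outputs:
    print(out.outputs[0].text)
```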
28 changes: 22 additions & 6 deletions vllm/entrypoints/openai/api_server.py
@@ -1201,16 +1201,32 @@ async def invocations(raw_request: Request):
     return JSONResponse(content=res.model_dump(), status_code=res.error.code)
 
 
-if envs.VLLM_TORCH_PROFILER_DIR:
-    logger.warning(
-        "Torch Profiler is enabled in the API server. This should ONLY be "
-        "used for local development!"
-    )
+if envs.VLLM_TORCH_PROFILER_DIR or envs.USE_PROTON:
+    if envs.VLLM_TORCH_PROFILER_DIR:
+        logger.warning(
+            "Torch Profiler is enabled in the API server. This should ONLY be "
+            "used for local development!"
+        )
+    if envs.USE_PROTON:
+        logger.warning(
+            "Proton profiler is enabled in the API server. This should ONLY be "
+            "used for local development!"
+        )
 
     @router.post("/start_profile")
     async def start_profile(raw_request: Request):
         logger.info("Starting profiler...")
-        await engine_client(raw_request).start_profile()
+        profile_options = None
+        try:
+            payload = await raw_request.json()
+        except json.JSONDecodeError:
+            payload = None
+        except Exception:
+            payload = None
+        else:
+            if isinstance(payload, dict):
+                profile_options = payload
Review comment on lines +1220 to +1228 (Contributor, severity: high):

The exception handling for parsing the JSON payload is too broad and contains a redundant except block: the bare `except Exception:` duplicates the `json.JSONDecodeError` handler and silently sets `payload = None` for any other error, which can hide unexpected failures during request processing. It's better to handle only the expected `json.JSONDecodeError`.
Suggested change:

-        try:
-            payload = await raw_request.json()
-        except json.JSONDecodeError:
-            payload = None
-        except Exception:
-            payload = None
-        else:
-            if isinstance(payload, dict):
-                profile_options = payload
+        try:
+            payload = await raw_request.json()
+            if isinstance(payload, dict):
+                profile_options = payload
+        except json.JSONDecodeError:
+            # It's okay if the request has no body or is not valid JSON.
+            pass

+        await engine_client(raw_request).start_profile(profile_options=profile_options)
         logger.info("Profiler started.")
         return Response(status_code=200)
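A client-side sketch of exercising the new endpoint. The server address and the option key are assumptions, and the JSON body is optional since the handler tolerates requests without one:

```python
import json
import urllib.request

BASE = "http://localhost:8000"  # assumed local API server address

# POST /start_profile with an optional JSON body; any dict payload is passed
# through to the engine as profile_options (the key below is hypothetical).
req = urllib.request.Request(
    f"{BASE}/start_profile",
    data=json.dumps({"name": "prefill_batch"}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(resp.status)  # expect 200 once the profiler has started
```

The existing /stop_profile route can be called the same way, without a body, to end the capture.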

18 changes: 18 additions & 0 deletions vllm/envs.py
@@ -93,6 +93,14 @@
     VLLM_FORCE_AOT_LOAD: bool = False
     VLLM_TORCH_PROFILER_WITH_STACK: bool = True
     VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
+    USE_PROTON: bool = False
+    PROTON_PROFILE_NAME: str | None = None
+    PROTON_PROFILE_NAME_PREFIX: str | None = None
+    PROTON_PROFILE_CONTEXT: str | None = None
+    PROTON_PROFILE_DATA: str | None = None
+    PROTON_PROFILE_BACKEND: str | None = None
+    PROTON_PROFILE_MODE: str | None = None
+    PROTON_PROFILE_HOOK: str | None = None
     VLLM_USE_TRITON_AWQ: bool = False
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False
@@ -803,6 +811,16 @@ def get_vllm_port() -> int | None:
"VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool(
os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"
),
"USE_PROTON": lambda: bool(
os.getenv("USE_PROTON", "0").lower() not in ("0", "false")
),
"PROTON_PROFILE_NAME": lambda: os.getenv("PROTON_PROFILE_NAME"),
"PROTON_PROFILE_NAME_PREFIX": lambda: os.getenv("PROTON_PROFILE_NAME_PREFIX"),
"PROTON_PROFILE_CONTEXT": lambda: os.getenv("PROTON_PROFILE_CONTEXT"),
"PROTON_PROFILE_DATA": lambda: os.getenv("PROTON_PROFILE_DATA"),
"PROTON_PROFILE_BACKEND": lambda: os.getenv("PROTON_PROFILE_BACKEND"),
"PROTON_PROFILE_MODE": lambda: os.getenv("PROTON_PROFILE_MODE"),
"PROTON_PROFILE_HOOK": lambda: os.getenv("PROTON_PROFILE_HOOK"),
# If set, vLLM will use Triton implementations of AWQ.
"VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
# If set, allow loading or unloading lora adapters in runtime,
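As written, the USE_PROTON getter treats any value other than "0" or "false" (case-insensitive) as enabled, unlike the `!= "0"` / `int(...)` style used by neighboring flags. A small sketch of that parsing rule:

```python
import os


def use_proton_enabled() -> bool:
    # Mirrors the getter above: only "0" and "false" (any casing) disable it.
    return os.getenv("USE_PROTON", "0").lower() not in ("0", "false")


os.environ["USE_PROTON"] = "1"
assert use_proton_enabled()

os.environ["USE_PROTON"] = "False"
assert not use_proton_enabled()

os.environ["USE_PROTON"] = "yes"
assert use_proton_enabled()  # any other value counts as enabled
```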