Commit 8c85305

[Docs] Enable fail_on_warning for the docs build in CI (#25580)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
1 parent: f84a472

File tree: 20 files changed, +81 −87 lines

.readthedocs.yaml

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@ build:
 
 mkdocs:
   configuration: mkdocs.yaml
+  fail_on_warning: true
 
 # Optionally declare the Python requirements required to build your docs
 python:

docs/features/nixl_connector_usage.md

Lines changed: 4 additions & 4 deletions

@@ -9,7 +9,7 @@ NixlConnector is a high-performance KV cache transfer connector for vLLM's disag
 Install the NIXL library: `uv pip install nixl`, as a quick start.
 
 - Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
-- The specified required NIXL version can be found in [requirements/kv_connectors.txt](../../requirements/kv_connectors.txt) and other relevant config files
+- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
 
 ### Transport Configuration
 

@@ -154,6 +154,6 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
 
 Refer to these example scripts in the vLLM repository:
 
-- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
-- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
-- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
+- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
+- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
+- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)

docs/mkdocs/hooks/generate_argparse.py

Lines changed: 3 additions & 2 deletions

@@ -32,8 +32,9 @@ def auto_mock(module, attr, max_mocks=50):
     for _ in range(max_mocks):
         try:
             # First treat attr as an attr, then as a submodule
-            return getattr(importlib.import_module(module), attr,
-                           importlib.import_module(f"{module}.{attr}"))
+            with patch("importlib.metadata.version", return_value="0.0.0"):
+                return getattr(importlib.import_module(module), attr,
+                               importlib.import_module(f"{module}.{attr}"))
         except importlib.metadata.PackageNotFoundError as e:
             raise e
         except ModuleNotFoundError as e:

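For context, a minimal standalone sketch of the pattern this change applies: patching `importlib.metadata.version` during import so that modules which look up a package version at import time do not raise `PackageNotFoundError` when that package is only mocked in the docs build environment. The helper name and module below are purely illustrative, not vLLM code.

```python
# Illustrative sketch only (not vLLM code): the same pattern auto_mock now uses.
import importlib
from unittest.mock import patch


def import_with_stub_version(module_name: str):
    """Import a module while importlib.metadata.version() returns a placeholder,
    so version lookups performed at import time cannot raise
    PackageNotFoundError for packages that are only mocked."""
    with patch("importlib.metadata.version", return_value="0.0.0"):
        return importlib.import_module(module_name)


if __name__ == "__main__":
    # "json" is just a stand-in; a real use case is a module that queries
    # its own distribution version on import.
    print(import_with_stub_version("json").__name__)
```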
docs/models/generative_models.md

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL
 
 In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
 Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
-which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text.
+which are then passed through [Sampler][vllm.v1.sample.sampler.Sampler] to obtain the final text.
 
 ## Configuration
 

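For context, the flow this doc page describes (model forward pass, per-token log probabilities, then the sampler producing text) is what `LLM.generate` drives end to end. A minimal usage sketch, with the model name chosen purely as an example:

```python
# Minimal sketch of vLLM's offline generative API; the model name is only an example.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.8, max_tokens=32)

outputs = llm.generate(["The capital of France is"], params)
for output in outputs:
    # The sampler has already turned per-token log probabilities into text.
    print(output.outputs[0].text)
```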
docs/models/supported_models.md

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ _*Vision-language models currently accept only image inputs. Support for video i
 
 If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
 
-- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature)
+- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
 - Any combination of the following vLLM parallelisation schemes:
     - Pipeline parallel
     - Tensor parallel

docs/usage/README.md

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 # Using vLLM
 
-First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment.
+First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.
 
 Then, vLLM supports the following usage patterns:
 

examples/online_serving/dashboards/grafana/README.md

Lines changed: 2 additions & 2 deletions

@@ -11,9 +11,9 @@ vLLM performance and metrics.
 
 ## Dashboard Descriptions
 
-- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and
+- **performance_statistics.json**: Tracks performance metrics including latency and
   throughput for your vLLM service.
-- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key
+- **query_statistics.json**: Tracks query performance, request volume, and key
   performance indicators for your vLLM service.
 
 ## Deployment Options

examples/online_serving/dashboards/perses/README.md

Lines changed: 2 additions & 2 deletions

@@ -21,9 +21,9 @@ deployment methods:
 
 ## Dashboard Descriptions
 
-- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency
+- **performance_statistics.yaml**: Performance metrics with aggregated latency
   statistics
-- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics
+- **query_statistics.yaml**: Query performance and deployment metrics
 
 ## Deployment Options
 

vllm/attention/ops/common.py

Lines changed: 19 additions & 17 deletions

@@ -18,12 +18,14 @@ def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr,
     final attention output.
 
     Args:
-        output: [ B, H, D ]
-        lses : [ N, B, H ]
-        cp, batch, q_heads, v_head_dim
-    Return:
-        output: [ B, H, D ]
-        lse : [ B, H ]
+        outputs_ptr (triton.PointerType):
+            Pointer to input tensor of shape [ B, H, D ]
+        lses_ptr (triton.PointerType):
+            Pointer to input tensor of shape [ N, B, H ]
+        new_output_ptr (triton.PointerType):
+            Pointer to output tensor of shape [ B, H, D ]
+        vlse_ptr (triton.PointerType):
+            Pointer to output tensor of shape [ B, H ]
     """
     batch_idx = tl.program_id(axis=0).to(tl.int64)
     head_idx = tl.program_id(axis=1).to(tl.int64)

@@ -81,19 +83,19 @@ def call_kernel(self, kernel, grid, *regular_args, **const_args):
         self.inner_kernel[grid](*regular_args)
 
 
-def correct_attn_out(out: torch.Tensor, lses: torch.Tensor, cp_rank: int,
-                     ctx: CPTritonContext):
-    """
-    Apply the all-gathered lses to correct each local rank's attention
-    output. we still need perform a cross-rank reduction to obtain the
-    final attention output.
+def correct_attn_out(
+        out: torch.Tensor, lses: torch.Tensor, cp_rank: int,
+        ctx: CPTritonContext) -> tuple[torch.Tensor, torch.Tensor]:
+    """Correct the attention output using the all-gathered lses.
 
     Args:
-        output: [ B, H, D ]
-        lses : [ N, B, H ]
-    Return:
-        output: [ B, H, D ]
-        lse : [ B, H ]
+        out: Tensor of shape [ B, H, D ]
+        lses: Tensor of shape [ N, B, H ]
+        cp_rank: Current rank in the context-parallel group
+        ctx: Triton context to avoid recompilation
+
+    Returns:
+        Tuple of (out, lse) with corrected attention and final log-sum-exp.
     """
     if ctx is None:
         ctx = CPTritonContext()

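For readers of the updated docstrings, here is a minimal PyTorch sketch of the correction they describe: each rank rescales its partial attention output by exp(lse_rank − lse_global) so that a later cross-rank sum reproduces attention over the full sequence. This is an illustration of the log-sum-exp math only, not the Triton kernel above, and the function name is hypothetical.

```python
# Illustrative reference only (not the Triton kernel above); assumes each
# context-parallel rank produced a partial output and its LSEs were all-gathered.
import torch


def correct_attn_out_reference(
        out: torch.Tensor, lses: torch.Tensor,
        cp_rank: int) -> tuple[torch.Tensor, torch.Tensor]:
    """out: [B, H, D] partial attention output of this rank.
    lses: [N, B, H] log-sum-exp values gathered from all N ranks."""
    lse = torch.logsumexp(lses, dim=0)       # [B, H] global log-sum-exp
    scale = torch.exp(lses[cp_rank] - lse)   # [B, H] weight of this rank's partial sum
    corrected = out * scale.unsqueeze(-1)    # [B, H, D]
    return corrected, lse
```

Summing the corrected outputs across ranks (for example with an all-reduce) then yields the attention output computed over the full sequence.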
vllm/inputs/data.py

Lines changed: 2 additions & 2 deletions

@@ -287,8 +287,8 @@ class EncoderDecoderInputs(TypedDict):
 
 SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
 """
-A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be
-passed to [`vllm.sequence.Sequence`][].
+A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be
+passed to [`Sequence`][collections.abc.Sequence].
 """
 
 ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
