64 changes: 64 additions & 0 deletions .github/workflows/build-docker-deterministic.yml
@@ -0,0 +1,64 @@
name: Build Docker Image with Deterministic Sampling

on:
  push:
    branches:
      - feat/sequence-check-vllm
  workflow_dispatch: # Allows manual trigger

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Free up disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo docker system prune -af --volumes
          df -h

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ghcr.io/${{ github.repository }}
          tags: |
            type=raw,value=v0.9.1-deterministic
            type=raw,value=latest-deterministic
            type=sha,prefix={{branch}}-

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./Dockerfile.quick
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          platforms: linux/amd64
          no-cache: true

      - name: Image built successfully
        run: |
          echo "Docker image built and pushed successfully!"
          echo "Image tags:"
          echo "${{ steps.meta.outputs.tags }}"
          echo ""
          echo "You can pull the image with:"
          echo "docker pull ghcr.io/${{ github.repository }}:v0.9.1-deterministic"
5 changes: 5 additions & 0 deletions Dockerfile.quick
@@ -0,0 +1,5 @@
FROM vllm/vllm-openai:v0.9.1
RUN python3 -m pip uninstall nvidia-nccl-cu12 -y && python3 -m pip install nvidia-nccl-cu12==2.26.2.post1
ENV VLLM_USE_V1=0

COPY ./vllm /usr/local/lib/python3.12/dist-packages/vllm
49 changes: 49 additions & 0 deletions tests/entrypoints/openai/test_validation_utils.py
@@ -0,0 +1,49 @@
import pytest
from vllm.entrypoints.openai.validation_utils import (
    generate_run_seed,
    compute_derived_seed,
)


def test_generate_run_seed():
    run_seed = generate_run_seed(42, "chatcmpl-abc123")
    assert run_seed != ""
    assert len(run_seed) == 64


def test_generate_run_seed_deterministic():
    run_seed1 = generate_run_seed(42, "chatcmpl-abc123")
    run_seed2 = generate_run_seed(42, "chatcmpl-abc123")
    assert run_seed1 == run_seed2


def test_generate_run_seed_different_inference_ids():
    run_seed1 = generate_run_seed(42, "chatcmpl-abc123")
    run_seed2 = generate_run_seed(42, "chatcmpl-xyz789")
    assert run_seed1 != run_seed2


def test_generate_run_seed_none():
    run_seed = generate_run_seed(None, "chatcmpl-abc123")
    assert run_seed == ""


def test_compute_derived_seed():
    derived_seed, run_seed = compute_derived_seed(42, "chatcmpl-abc123")
    assert derived_seed is not None
    assert isinstance(derived_seed, int)
    assert derived_seed > 0
    assert len(run_seed) == 64


def test_compute_derived_seed_deterministic():
    derived_seed1, run_seed1 = compute_derived_seed(42, "chatcmpl-abc123")
    derived_seed2, run_seed2 = compute_derived_seed(42, "chatcmpl-abc123")
    assert derived_seed1 == derived_seed2
    assert run_seed1 == run_seed2


def test_compute_derived_seed_none():
    derived_seed, run_seed = compute_derived_seed(None, "chatcmpl-abc123")
    assert derived_seed is None
    assert run_seed == ""
1 change: 1 addition & 0 deletions vllm/entrypoints/openai/protocol.py
@@ -1421,6 +1421,7 @@ class ChatCompletionLogProbsContent(ChatCompletionLogProb):

class ChatCompletionLogProbs(OpenAIBaseModel):
    content: Optional[list[ChatCompletionLogProbsContent]] = None
    run_seed: Optional[str] = None


class ChatCompletionResponseChoice(OpenAIBaseModel):
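For reference, a minimal sketch (not part of this PR) of how the new field surfaces in the response model. It assumes only the ChatCompletionLogProbs class shown above and pydantic's standard model_dump_json; the placeholder hex string stands in for a real run seed.

# Illustrative sketch: run_seed rides along with the logprobs payload.
from vllm.entrypoints.openai.protocol import ChatCompletionLogProbs

logprobs = ChatCompletionLogProbs(content=[], run_seed="ab" * 32)  # placeholder 64-char hex digest
print(logprobs.model_dump_json())
# Expected shape (abridged): {"content": [], "run_seed": "abab...ab"}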
24 changes: 18 additions & 6 deletions vllm/entrypoints/openai/serving_chat.py
@@ -34,6 +34,7 @@
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
    MistralToolCall)
from vllm.entrypoints.openai.validation_utils import generate_run_seed, compute_derived_seed
from vllm.logger import init_logger
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.reasoning import ReasoningParser, ReasoningParserManager
@@ -207,6 +208,12 @@ async def create_chat_completion(
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        original_seed = request.seed
        run_seed_str = ""
        if request.seed is not None:
            derived_seed, run_seed_str = compute_derived_seed(request.seed, request_id)
            request.seed = derived_seed

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
@@ -259,18 +266,20 @@
        result_generator, = generators

        # Streaming response
        if request.stream:
            return self.chat_completion_stream_generator(
                request, result_generator, request_id, model_name,
                conversation, tokenizer, request_metadata)

        try:
            if request.stream:
                return self.chat_completion_stream_generator(
                    request, result_generator, request_id, model_name,
                    conversation, tokenizer, request_metadata)

            return await self.chat_completion_full_generator(
                request, result_generator, request_id, model_name,
                conversation, tokenizer, request_metadata)
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))
        finally:
            request.seed = original_seed

def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
if request.add_generation_prompt:
@@ -572,6 +581,7 @@ async def chat_completion_stream_generator(
                            num_output_top_logprobs=request.top_logprobs,
                            return_as_token_id=request.
                            return_tokens_as_token_ids,
                            run_seed=run_seed_str,
                        )
                    else:
                        logprobs = None
@@ -954,6 +964,7 @@ async def chat_completion_full_generator(
                    num_output_top_logprobs=request.top_logprobs,
                    tokenizer=tokenizer,
                    return_as_token_id=request.return_tokens_as_token_ids,
                    run_seed=run_seed_str,
                )
            else:
                logprobs = None
@@ -1144,6 +1155,7 @@ def _create_chat_logprobs(
        tokenizer: AnyTokenizer,
        num_output_top_logprobs: Optional[int] = None,
        return_as_token_id: Optional[bool] = None,
        run_seed: Optional[str] = None,
    ) -> ChatCompletionLogProbs:
        """Create OpenAI-style logprobs."""
        logprobs_content: list[ChatCompletionLogProbsContent] = []
@@ -1183,7 +1195,7 @@
                        tokenizer, should_return_as_token_id),
                    ))

        return ChatCompletionLogProbs(content=logprobs_content)
        return ChatCompletionLogProbs(content=logprobs_content, run_seed=run_seed)

    def _should_stream_with_auto_tool_parsing(self,
                                              request: ChatCompletionRequest):
21 changes: 21 additions & 0 deletions vllm/entrypoints/openai/validation_utils.py
@@ -0,0 +1,21 @@
import hashlib
from typing import Optional, Tuple


def generate_run_seed(user_seed: Optional[int], inference_id: str) -> str:
    if user_seed is None:
        return ""

    combined = f"{user_seed}{inference_id}"
    hash_digest = hashlib.sha256(combined.encode()).hexdigest()
    return hash_digest


def compute_derived_seed(user_seed: Optional[int], inference_id: str) -> Tuple[Optional[int], str]:
    if user_seed is None:
        return None, ""

    run_seed = generate_run_seed(user_seed, inference_id)
    derived_seed = int(run_seed[:16], 16) & 0x7FFFFFFFFFFFFFFF

    return derived_seed, run_seed
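A minimal client-side sketch (not part of the diff) of how the derivation above can be replayed to verify a response: given the user-supplied seed and the response id, compute_derived_seed should reproduce both the 64-character run_seed reported in the logprobs and the 63-bit seed the server actually sampled with. The response-field names in the comments are illustrative assumptions, and the placeholder string stands in for a real value.

# Verification sketch (hypothetical usage, not part of the PR).
from vllm.entrypoints.openai.validation_utils import compute_derived_seed

user_seed = 42                      # seed sent in the request
response_id = "chatcmpl-abc123"     # `id` field of the chat completion response
reported_run_seed = "..."           # run_seed taken from the returned logprobs

derived_seed, run_seed = compute_derived_seed(user_seed, response_id)
assert len(run_seed) == 64          # SHA-256 hex digest of f"{seed}{id}"
assert derived_seed == int(run_seed[:16], 16) & 0x7FFFFFFFFFFFFFFF
print("run_seed matches response:", run_seed == reported_run_seed)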