[Frontend] Add /v1/audio/transcriptions OpenAI API endpoint #12909
Changes from all commits
New file: OpenAI transcription client example (@@ -0,0 +1,23 @@)
# SPDX-License-Identifier: Apache-2.0
from openai import OpenAI

from vllm.assets.audio import AudioAsset

mary_had_lamb = AudioAsset('mary_had_lamb').get_local_path()
winning_call = AudioAsset('winning_call').get_local_path()

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
with open(str(mary_had_lamb), "rb") as f:
    transcription = client.audio.transcriptions.create(
        file=f,
        model="openai/whisper-large-v3",
        language="en",
        response_format="text",
        temperature=0.0)
    print("transcription result:", transcription)
Requirements update (@@ -8,12 +8,11 @@ py-cpuinfo)
transformers >= 4.48.2  # Required for Bamba model and Transformers backend.
tokenizers >= 0.19.1  # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
-fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
-fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
+fastapi[standard] >= 0.107.0, < 0.113.0; python_version < '3.9'
+fastapi[standard] >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
aiohttp
openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
uvicorn[standard]
-pydantic >= 2.9 # Required for fastapi >= 0.113.0
+pydantic >= 2.9
prometheus_client >= 0.18.0
pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0

Review comment on the fastapi[standard] lines (truncated): "By including the …"
Review comment on the pydantic >= 2.9 line (truncated): "This is also required by …"
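A quick sanity check of the updated bounds, sketched under the assumption that importlib.metadata and packaging are available in the environment (neither is referenced by this PR):

# Hypothetical environment check for the new requirement bounds.
from importlib.metadata import version

from packaging.version import Version

assert Version(version("fastapi")) >= Version("0.107.0")
assert Version(version("pydantic")) >= Version("2.9")
assert Version(version("openai")) >= Version("1.52.0")
print("fastapi/pydantic/openai satisfy the updated constraints")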
New file: transcription API correctness test (@@ -0,0 +1,166 @@)
# SPDX-License-Identifier: Apache-2.0
"""
Evaluate Transcription API correctness by computing the Word Error Rate (WER)
on a given ASR dataset. When provided, it will also compare the WER against
a baseline.
This simulates real-world usage of the API and makes sure that the frontend
and AsyncLLMEngine are working correctly.
"""
import asyncio
import io
import time
from statistics import mean, median
from typing import List

import librosa
import pytest
import soundfile
import torch
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer

from ....utils import RemoteOpenAIServer


def to_bytes(y, sr):
    buffer = io.BytesIO()
    soundfile.write(buffer, y, sr, format="WAV")
    buffer.seek(0)
    return buffer


async def transcribe_audio(client, tokenizer, y, sr):
    # Send loaded audio directly instead of loading from disk,
    # but don't account for that time.
    with to_bytes(y, sr) as f:
        start_time = time.perf_counter()
        transcription = await client.audio.transcriptions.create(
            file=f,
            model=tokenizer.name_or_path,
            language="en",
            temperature=0.0,
        )
        end_time = time.perf_counter()
    # NOTE there's no streaming in transcriptions, so TTFT can't be measured.
    latency = end_time - start_time
    num_output_tokens = len(
        tokenizer(transcription.text, add_special_tokens=False).input_ids)
    return latency, num_output_tokens, transcription.text


async def bound_transcribe(model_name, sem, client, audio, reference):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Use semaphore to limit concurrent requests.
    async with sem:
        result = await transcribe_audio(client, tokenizer, *audio)
    # Normalize *English* output/reference for evaluation.
    out = tokenizer.normalize(result[2])
    ref = tokenizer.normalize(reference)
    return result[:2] + (out, ref)


async def process_dataset(model, client, data, concurrent_request):
    sem = asyncio.Semaphore(concurrent_request)

    # Warmup call as the first `librosa.load` server-side is quite slow.
    audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
    _ = await bound_transcribe(model, sem, client, (audio, sr), "")

    tasks: List[asyncio.Task] = []
    for sample in data:
        audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
        task = asyncio.create_task(
            bound_transcribe(model, sem, client, (audio, sr), sample["text"]))
        tasks.append(task)
    return await asyncio.gather(*tasks)


def print_performance_metrics(results, total_time):
    latencies = [res[0] for res in results]
    total_tokens = sum([res[1] for res in results])

    total = len(results)
    print(f"Total Requests: {total}")
    print(f"Successful Requests: {len(latencies)}")
    print(f"Average Latency: {mean(latencies):.4f} seconds")
    print(f"Median Latency: {median(latencies):.4f} seconds")
    perc = sorted(latencies)[int(len(latencies) * 0.95) - 1]
    print(f"95th Percentile Latency: {perc:.4f} seconds")
    # Throughput
    req_throughput = len(latencies) / total_time
    print(f"Estimated req_Throughput: {req_throughput:.2f} requests/s")
    throughput = total_tokens / total_time
    print(f"Estimated Throughput: {throughput:.2f} tok/s")


def add_duration(sample):
    y, sr = sample['audio']["array"], sample['audio']["sampling_rate"]
    sample['duration_ms'] = librosa.get_duration(y=y, sr=sr) * 1000
    return sample


def load_hf_dataset(dataset_repo: str, split='validation', **hf_kwargs):
    ## Load and filter the dataset
    dataset = load_dataset(dataset_repo, split=split, **hf_kwargs)
    if 'duration_ms' not in dataset[0]:
        # compute duration to filter
        dataset = dataset.map(add_duration)

    # Whisper max supported duration
    dataset = dataset.filter(lambda example: example['duration_ms'] < 30000)
    return dataset


def run_evaluation(model: str,
                   client,
                   dataset,
                   max_concurrent_reqs: int,
                   n_examples: int = -1,
                   print_metrics: bool = True):
    if n_examples > 0:
        dataset = dataset.select(range(n_examples))
    start = time.perf_counter()
    results = asyncio.run(
        process_dataset(model, client, dataset, max_concurrent_reqs))
    end = time.perf_counter()
    total_time = end - start
    print(f"Total Test Time: {total_time:.4f} seconds")
    if print_metrics:
        print_performance_metrics(results, total_time)
    # Compute WER
    predictions = [res[2] for res in results]
    references = [res[3] for res in results]
    wer = load("wer")
    wer_score = 100 * wer.compute(references=references,
                                  predictions=predictions)
    print("WER:", wer_score)
    return wer_score


# Alternatives: "openai/whisper-large-v2", "openai/whisper-large-v3-turbo", ...
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"])
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
@pytest.mark.parametrize(
    "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"])
# NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize("expected_wer", [12.744980])
def test_wer_correctness(model_name,
                         dataset_repo,
                         expected_wer,
                         n_examples=-1,
                         max_concurrent_request=None):
    with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server:
        dataset = load_hf_dataset(dataset_repo)

        if not max_concurrent_request:
            # No max concurrency
            max_concurrent_request = n_examples if n_examples > 0 \
                else len(dataset)

        client = remote_server.get_async_client()
        wer = run_evaluation(model_name, client, dataset,
                             max_concurrent_request, n_examples)
        if expected_wer:
            torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2)
Review comment: nit: we should put the instructions to host the model with the server so this client code will work "out of the box".
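One way to address the nit, sketched here under the assumption that the existing vllm.entrypoints.openai.api_server entrypoint serves Whisper models once this endpoint lands; the module path and --model flag below are the standard vLLM server invocation, not something this PR defines.

# Hypothetical launcher for the client example above: start the
# OpenAI-compatible server on the default port 8000, wait for it to come up,
# then run the client script against http://localhost:8000/v1.
import subprocess
import sys

server = subprocess.Popen([
    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
    "--model", "openai/whisper-large-v3",
])
# ... run the client example once the server is reachable, then shut it down:
# server.terminate()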