Feat/max error rate #171
Documentation diff (benchmark CLI options):

```diff
@@ -147,6 +147,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative
 - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.
+
+- `--max-error-rate`: The maximum error rate after which a benchmark will stop. Applicable only for finite deterministic scenarios, i.e. `rate_type` is `constant` and `--max-seconds` exists, OR `--max-requests` exists, OR the dataset is finite. If `--max-error-rate` is `None` or not applicable, benchmarks will continue regardless of error rate.

 - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.

 - `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results.
```
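As an illustrative sketch of the stop condition the `--max-error-rate` flag describes (function and parameter names here are hypothetical, not guidellm's actual implementation), assuming the rate is computed over finished requests only:

```python
from typing import Optional


def should_stop(errored: int, completed: int, max_error_rate: Optional[float]) -> bool:
    """Hypothetical check: stop once the observed error rate exceeds the cap."""
    # Flag unset or not applicable: never stop on errors.
    if max_error_rate is None:
        return False
    # Incomplete (in-flight) requests are deliberately excluded from the rate.
    finished = errored + completed
    # Nothing has finished yet, so there is no meaningful rate to compare.
    if finished == 0:
        return False
    return errored / finished > max_error_rate
```

For example, 6 errors out of 100 finished requests against a 5% cap would trigger a stop, while exactly 5 out of 100 would not.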
Benchmark model diff:

```diff
@@ -90,6 +90,9 @@ class BenchmarkArgs(StandardBaseModel):
     max_duration: Optional[float] = Field(
         description="The maximum duration in seconds to run this benchmark, if any."
     )
+    max_error: Optional[float] = Field(
+        description="Maximum error rate or const after which a benchmark will stop."
+    )
     warmup_number: Optional[int] = Field(
         description=(
             "The number of requests to run for the warmup phase of this benchmark, "
@@ -213,6 +216,15 @@ class BenchmarkRunStats(StandardBaseModel):
             "it was completed."
         )
     )
+    error_rate: float = Field(
+        description=(
+            "The number of errored requests divided by the number "
+            "of successful and errored requests. "
+            "This can be higher than max_error "
+            "(if applicable) because it does not take into "
+            "account incomplete requests."
+        )
+    )


 class BenchmarkMetrics(StandardBaseModel):
```

Review discussion on `error_rate`:

> "I'm not following this point; the error rate we calculate and compare to should never include incomplete, right?"

> "Correct."

> "Isn't max_error also not looking at incomplete requests, though? How would it be higher for error_rate vs max_error, since those should be based off the same calculations, right?"
Request loader diff:

```diff
@@ -19,11 +19,16 @@
 __all__ = [
     "GenerativeRequestLoader",
     "GenerativeRequestLoaderDescription",
+    "GetInfiniteDatasetLengthError",
     "RequestLoader",
     "RequestLoaderDescription",
 ]


+class GetInfiniteDatasetLengthError(Exception):
+    pass


 class RequestLoaderDescription(StandardBaseModel):
     type_: Literal["request_loader"] = "request_loader"
@@ -120,7 +125,11 @@ def __len__(self) -> int:
     if self.iter_type == "finite":
         return self.num_unique_items()

-    raise ValueError(f"Unable to determine length of dataset: {self.data}")
+    if self.iter_type != "infinite":
+        raise ValueError(f"Invalid iter_type {self.iter_type}")
+    raise GetInfiniteDatasetLengthError(
+        f"Dataset {self.data} is infinite and thus unable to determine length"
+    )

 @property
 def description(self) -> GenerativeRequestLoaderDescription:
```

Review comment on `GetInfiniteDatasetLengthError`:

> "I think this exception and logic is no longer needed, since if we can get the length then we go through a specific logic route, and if we can't, then we fall back on the window."
Scheduler diff:

```diff
@@ -1,3 +1,4 @@
+from collections import deque
 from typing import (
     Generic,
     Literal,
@@ -16,6 +17,9 @@
 ]


+RequestStatus = Literal["success", "error"]


 class SchedulerRunInfo(StandardBaseModel):
     """
     Information about the current run of the scheduler.
@@ -46,12 +50,15 @@ class SchedulerRunInfo(StandardBaseModel):
     end_number: float
     processes: int
     strategy: SchedulingStrategy
+    last_requests_statuses: deque[RequestStatus]
+    max_error: Optional[float] = None

     created_requests: int = 0
     queued_requests: int = 0
     scheduled_requests: int = 0
     processing_requests: int = 0
     completed_requests: int = 0
     errored_requests: int = 0


 class SchedulerRequestInfo(StandardBaseModel):
```

Review comment on `last_requests_statuses`:

> "I think we can simplify the logic here so we don't need to add this in, especially since this list can become very long and it is not JSON serializable, so it would break any pydantic serialization in the event someone wants to save that state. See note later down where that logic is located."
Additional review comments:

> "We'll need these reenabled. If you ever need to push and skip pre-commit, you can pass the --no-verify flag with your git commit command."

> "Also, @sjmonson just landed a diff that might clear up some of the issues you were seeing."
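The `--no-verify` tip above can be demonstrated in a throwaway repository; `--no-verify` skips git's pre-commit and commit-msg hooks for a single commit without disabling them globally (the paths, identity, and commit message here are illustrative):

```shell
# Create a scratch repo and make one commit with hooks skipped.
repo="$(mktemp -d)"
cd "$repo"
git init -q
git config user.email "ci@example.com"   # illustrative identity
git config user.name "ci"
echo demo > file.txt
git add file.txt
git commit --no-verify -q -m "WIP: skip hooks once"
git log --oneline
```

Subsequent commits made without the flag run the hooks as usual.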