5 changes: 5 additions & 0 deletions .gitignore
@@ -1,3 +1,8 @@
# Output files
benchmarks.json
benchmarks.yaml
benchmarks.csv

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
26 changes: 24 additions & 2 deletions src/guidellm/__init__.py
@@ -21,7 +21,29 @@
hf_logging.set_verbosity_error()
logging.getLogger("transformers").setLevel(logging.ERROR)

from .config import settings
from .config import (
settings,
DatasetSettings,
Environment,
LoggingSettings,
OpenAISettings,
print_config,
Settings,
reload_settings,
)
from .logger import configure_logger, logger

__all__ = ["configure_logger", "logger", "settings", "generate_benchmark_report"]
__all__ = [
# Config
"DatasetSettings",
"Environment",
"LoggingSettings",
"OpenAISettings",
"print_config",
"Settings",
"reload_settings",
"settings",
# Logger
"logger",
"configure_logger",
]
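
For context, a minimal sketch of the widened import surface this change exposes at the package root; only names that appear in the diff are used, and nothing beyond the plain imports is asserted:

# Sketch: the config symbols now re-exported from the guidellm package root.
# Assumes an install of this branch; call signatures are not asserted here.
from guidellm import (
    DatasetSettings,
    Environment,
    LoggingSettings,
    OpenAISettings,
    Settings,
    print_config,
    reload_settings,
    settings,
)

# settings is expected to be the live Settings instance built at import time.
print(isinstance(settings, Settings))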
11 changes: 11 additions & 0 deletions src/guidellm/__main__.py
@@ -210,6 +210,15 @@ def cli():
callback=parse_json,
help="A JSON string of extra data to save with the output benchmarks",
)
@click.option(
"--output-sampling",
type=int,
help=(
"The number of samples to save in the output file. "
"If None (default), will save all samples."
),
default=None,
)
@click.option(
"--random-seed",
default=42,
@@ -237,6 +246,7 @@ def benchmark(
disable_console_outputs,
output_path,
output_extras,
output_sampling,
random_seed,
):
asyncio.run(
@@ -261,6 +271,7 @@
output_console=not disable_console_outputs,
output_path=output_path,
output_extras=output_extras,
output_sampling=output_sampling,
random_seed=random_seed,
)
)
48 changes: 43 additions & 5 deletions src/guidellm/benchmark/__init__.py
@@ -1,7 +1,19 @@
from .aggregator import AggregatorT, BenchmarkAggregator, GenerativeBenchmarkAggregator
from .benchmark import Benchmark, BenchmarkT, GenerativeBenchmark
from .benchmark import (
Benchmark,
BenchmarkArgs,
BenchmarkMetrics,
BenchmarkRunStats,
BenchmarkT,
GenerativeBenchmark,
GenerativeMetrics,
GenerativeTextErrorStats,
GenerativeTextResponseStats,
StatusBreakdown,
)
from .benchmarker import Benchmarker, BenchmarkerResult, GenerativeBenchmarker
from .entrypoints import benchmark_generative_text
from .output import GenerativeBenchmarksConsole, GenerativeBenchmarksReport
from .profile import (
AsyncProfile,
ConcurrentProfile,
@@ -12,17 +24,39 @@
ThroughputProfile,
create_profile,
)
from .progress import (
BenchmarkerProgressDisplay,
BenchmarkerTaskProgressState,
GenerativeTextBenchmarkerProgressDisplay,
GenerativeTextBenchmarkerTaskProgressState,
)

__all__ = [
# Aggregator
"AggregatorT",
"BenchmarkT",
"Benchmark",
"BenchmarkAggregator",
"GenerativeBenchmark",
"GenerativeBenchmarkAggregator",
# Benchmark
"Benchmark",
"BenchmarkArgs",
"BenchmarkMetrics",
"BenchmarkRunStats",
"BenchmarkT",
"GenerativeBenchmark",
"GenerativeMetrics",
"GenerativeTextErrorStats",
"GenerativeTextResponseStats",
"StatusBreakdown",
# Benchmarker
"Benchmarker",
"BenchmarkerResult",
"GenerativeBenchmarker",
# Entry points
"benchmark_generative_text",
# Output
"GenerativeBenchmarksConsole",
"GenerativeBenchmarksReport",
# Profile
"AsyncProfile",
"ConcurrentProfile",
"Profile",
@@ -31,5 +65,9 @@
"SynchronousProfile",
"ThroughputProfile",
"create_profile",
"benchmark_generative_text",
# Progress
"BenchmarkerProgressDisplay",
"BenchmarkerTaskProgressState",
"GenerativeTextBenchmarkerProgressDisplay",
"GenerativeTextBenchmarkerTaskProgressState",
]
68 changes: 39 additions & 29 deletions src/guidellm/benchmark/benchmark.py
@@ -457,7 +457,12 @@ def time_per_output_token_ms(self) -> Optional[float]:  # type: ignore[override]
This includes the time to generate the first token and all other tokens.
None if the output_tokens is None or 0.
"""
if self.output_tokens is None or self.output_tokens == 0:
if (
self.output_tokens is None
or self.output_tokens == 0
or self.first_token_time is None
or self.last_token_time is None
):
return None

return super().time_per_output_token_ms
@@ -614,41 +619,46 @@ def duration(self) -> float:
),
)

def create_sampled(self, sample_size: int) -> "GenerativeBenchmark":
def set_sample_size(self, sample_size: Optional[int]) -> "GenerativeBenchmark":
"""
Create a new benchmark instance with a random sample of the completed and
errored requests based on the given sample sizes. If the sample sizes are
larger than the total number of requests, the sample sizes are capped at
the total number of requests.
Set the sample size for the benchmark. This will randomly sample the
requests for each status type to the given sample size or the maximum
number of requests for that status type, whichever is smaller.
This is applied to requests.successful, requests.errored, and
requests.incomplete.
If None, no sampling is applied and the state is kept.

:param sample_size: The number of requests to sample for each status type.
:return: A new benchmark instance with the sampled requests.
:raises ValueError: If the sample sizes are negative.
:return: The benchmark with the sampled requests.
:raises ValueError: If the sample size is invalid.
"""
if sample_size < 0:
raise ValueError(f"Sample size must be non-negative, given {sample_size}")

sample_size = min(sample_size, len(self.requests.successful))
error_sample_size = min(sample_size, len(self.requests.errored))
incomplete_sample_size = min(sample_size, len(self.requests.incomplete))
if sample_size is not None:
if sample_size < 0 or not isinstance(sample_size, int):
raise ValueError(
f"Sample size must be non-negative integer, given {sample_size}"
)

sampled_instance = self.model_copy()
sampled_instance.requests.successful = random.sample(
self.requests.successful, sample_size
)
sampled_instance.requests.errored = random.sample(
self.requests.errored, error_sample_size
)
sampled_instance.requests.incomplete = random.sample(
self.requests.incomplete, incomplete_sample_size
)
sampled_instance.request_samples = StatusBreakdown(
successful=len(sampled_instance.requests.successful),
incomplete=len(sampled_instance.requests.incomplete),
errored=len(sampled_instance.requests.errored),
)
sample_size = min(sample_size, len(self.requests.successful))
error_sample_size = min(sample_size, len(self.requests.errored))
incomplete_sample_size = min(sample_size, len(self.requests.incomplete))

self.requests.successful = random.sample(
self.requests.successful, sample_size
)
self.requests.errored = random.sample(
self.requests.errored, error_sample_size
)
self.requests.incomplete = random.sample(
self.requests.incomplete, incomplete_sample_size
)
self.request_samples = StatusBreakdown(
successful=len(self.requests.successful),
incomplete=len(self.requests.incomplete),
errored=len(self.requests.errored),
)

return sampled_instance
return self

@staticmethod
def from_stats(
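
A small standalone sketch of the capping-and-sampling behavior the set_sample_size docstring describes, with plain lists standing in for the requests.successful, requests.errored, and requests.incomplete collections (the helper name is hypothetical):

import random
from typing import List, Optional

def sample_requests(requests: List[str], sample_size: Optional[int]) -> List[str]:
    # None keeps every request; otherwise the sample size is capped at the
    # number of requests available for that status type.
    if sample_size is None:
        return requests
    if not isinstance(sample_size, int) or sample_size < 0:
        raise ValueError(f"Sample size must be non-negative integer, given {sample_size}")
    return random.sample(requests, min(sample_size, len(requests)))

successful = [f"req-{i}" for i in range(10)]
errored = ["req-err-0", "req-err-1"]
print(len(sample_requests(successful, 5)))     # 5
print(len(sample_requests(errored, 5)))        # 2, capped at the available requests
print(len(sample_requests(successful, None)))  # 10, no sampling applied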
29 changes: 20 additions & 9 deletions src/guidellm/benchmark/entrypoints.py
@@ -1,17 +1,16 @@
from pathlib import Path
from typing import Any, Dict, Iterable, List, Literal, Optional, Union
from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union

from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
from transformers import ( # type: ignore[import]
PreTrainedTokenizerBase,
)

from guidellm.backend import Backend, BackendType
from guidellm.benchmark.benchmark import GenerativeBenchmark
from guidellm.benchmark.benchmarker import GenerativeBenchmarker
from guidellm.benchmark.output import (
GenerativeBenchmarksConsole,
save_generative_benchmarks,
GenerativeBenchmarksReport,
)
from guidellm.benchmark.profile import ProfileType, create_profile
from guidellm.benchmark.progress import GenerativeTextBenchmarkerProgressDisplay
@@ -48,8 +47,9 @@ async def benchmark_generative_text(
output_console: bool,
output_path: Optional[Union[str, Path]],
output_extras: Optional[Dict[str, Any]],
output_sampling: Optional[int],
random_seed: int,
) -> List[GenerativeBenchmark]:
) -> Tuple[GenerativeBenchmarksReport, Optional[Path]]:
console = GenerativeBenchmarksConsole(enabled=show_progress)
console.print_line("Creating backend...")
backend = Backend.create(
@@ -100,7 +100,7 @@ async def benchmark_generative_text(
if show_progress
else None
)
benchmarks = []
report = GenerativeBenchmarksReport()

async for result in benchmarker.run(
profile=profile,
@@ -115,15 +115,26 @@
if result.type_ == "benchmark_compiled":
if result.current_benchmark is None:
raise ValueError("Current benchmark is None")
benchmarks.append(result.current_benchmark)
report.benchmarks.append(
result.current_benchmark.set_sample_size(output_sampling)
)

if output_console:
console.benchmarks = benchmarks
orig_enabled = console.enabled
console.enabled = True
console.benchmarks = report.benchmarks
console.print_benchmarks_metadata()
console.print_benchmarks_info()
console.print_benchmarks_stats()
console.enabled = orig_enabled

if output_path:
save_generative_benchmarks(benchmarks=benchmarks, path=output_path)
console.print_line("\nSaving benchmarks report...")
saved_path = report.save_file(output_path)
console.print_line(f"Benchmarks report saved to {saved_path}")
else:
saved_path = None

return benchmarks
console.print_line("\nBenchmarking complete.")

return report, saved_path
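
Finally, a hedged sketch of the report-assembly pattern the entrypoint now follows; the loop body is left as a comment because the benchmark objects come from the benchmarker run, and inferring the output format from the file extension is an assumption:

from guidellm.benchmark import GenerativeBenchmarksReport

report = GenerativeBenchmarksReport()
# For each compiled benchmark yielded by the run, optionally down-sample it and
# collect it into the report (an output_sampling of None keeps every request):
#     report.benchmarks.append(benchmark.set_sample_size(output_sampling))
saved_path = report.save_file("benchmarks.json")  # format assumed to follow the extension
print(f"Benchmarks report saved to {saved_path}")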