5 changes: 5 additions & 0 deletions .gitignore
@@ -1,3 +1,8 @@
# Output files
benchmarks.json
benchmarks.yaml
benchmarks.csv

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
26 changes: 24 additions & 2 deletions src/guidellm/__init__.py
@@ -21,7 +21,29 @@
hf_logging.set_verbosity_error()
logging.getLogger("transformers").setLevel(logging.ERROR)

from .config import settings
from .config import (
settings,
DatasetSettings,
Environment,
LoggingSettings,
OpenAISettings,
print_config,
Settings,
reload_settings,
)
from .logger import configure_logger, logger

__all__ = ["configure_logger", "logger", "settings", "generate_benchmark_report"]
__all__ = [
# Config
"DatasetSettings",
"Environment",
"LoggingSettings",
"OpenAISettings",
"print_config",
"Settings",
"reload_settings",
"settings",
# Logger
"logger",
"configure_logger",
]
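
For context, a minimal sketch of the widened import surface this change exposes at the package root; only names that appear in the diff are used, and nothing beyond the plain imports is asserted:

# Sketch: the config symbols now re-exported from the guidellm package root.
# Assumes an install of this branch; call signatures are not asserted here.
from guidellm import (
    DatasetSettings,
    Environment,
    LoggingSettings,
    OpenAISettings,
    Settings,
    print_config,
    reload_settings,
    settings,
)

# settings is expected to be the live Settings instance built at import time.
print(isinstance(settings, Settings))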
11 changes: 11 additions & 0 deletions src/guidellm/__main__.py
@@ -210,6 +210,15 @@ def cli():
callback=parse_json,
help="A JSON string of extra data to save with the output benchmarks",
)
@click.option(
"--output-sampling",
type=int,
help=(
"The number of samples to save in the output file. "
"If None (default), will save all samples."
),
default=None,
)
@click.option(
"--random-seed",
default=42,
@@ -237,6 +246,7 @@ def benchmark(
disable_console_outputs,
output_path,
output_extras,
output_sampling,
random_seed,
):
asyncio.run(
@@ -261,6 +271,7 @@
output_console=not disable_console_outputs,
output_path=output_path,
output_extras=output_extras,
output_sampling=output_sampling,
random_seed=random_seed,
)
)
48 changes: 43 additions & 5 deletions src/guidellm/benchmark/__init__.py
@@ -1,7 +1,19 @@
from .aggregator import AggregatorT, BenchmarkAggregator, GenerativeBenchmarkAggregator
from .benchmark import Benchmark, BenchmarkT, GenerativeBenchmark
from .benchmark import (
Benchmark,
BenchmarkArgs,
BenchmarkMetrics,
BenchmarkRunStats,
BenchmarkT,
GenerativeBenchmark,
GenerativeMetrics,
GenerativeTextErrorStats,
GenerativeTextResponseStats,
StatusBreakdown,
)
from .benchmarker import Benchmarker, BenchmarkerResult, GenerativeBenchmarker
from .entrypoints import benchmark_generative_text
from .output import GenerativeBenchmarksConsole, GenerativeBenchmarksReport
from .profile import (
AsyncProfile,
ConcurrentProfile,
@@ -12,17 +24,39 @@
ThroughputProfile,
create_profile,
)
from .progress import (
BenchmarkerProgressDisplay,
BenchmarkerTaskProgressState,
GenerativeTextBenchmarkerProgressDisplay,
GenerativeTextBenchmarkerTaskProgressState,
)

__all__ = [
# Aggregator
"AggregatorT",
"BenchmarkT",
"Benchmark",
"BenchmarkAggregator",
"GenerativeBenchmark",
"GenerativeBenchmarkAggregator",
# Benchmark
"Benchmark",
"BenchmarkArgs",
"BenchmarkMetrics",
"BenchmarkRunStats",
"BenchmarkT",
"GenerativeBenchmark",
"GenerativeMetrics",
"GenerativeTextErrorStats",
"GenerativeTextResponseStats",
"StatusBreakdown",
# Benchmarker
"Benchmarker",
"BenchmarkerResult",
"GenerativeBenchmarker",
# Entry points
"benchmark_generative_text",
# Output
"GenerativeBenchmarksConsole",
"GenerativeBenchmarksReport",
# Profile
"AsyncProfile",
"ConcurrentProfile",
"Profile",
@@ -31,5 +65,9 @@
"SynchronousProfile",
"ThroughputProfile",
"create_profile",
"benchmark_generative_text",
# Progress
"BenchmarkerProgressDisplay",
"BenchmarkerTaskProgressState",
"GenerativeTextBenchmarkerProgressDisplay",
"GenerativeTextBenchmarkerTaskProgressState",
]
68 changes: 39 additions & 29 deletions src/guidellm/benchmark/benchmark.py
@@ -457,7 +457,12 @@ def time_per_output_token_ms(self) -> Optional[float]:  # type: ignore[override]
This includes the time to generate the first token and all other tokens.
None if the output_tokens is None or 0.
"""
if self.output_tokens is None or self.output_tokens == 0:
if (
self.output_tokens is None
or self.output_tokens == 0
or self.first_token_time is None
or self.last_token_time is None
):
return None

return super().time_per_output_token_ms
@@ -614,41 +619,46 @@ def duration(self) -> float:
),
)

def create_sampled(self, sample_size: int) -> "GenerativeBenchmark":
def set_sample_size(self, sample_size: Optional[int]) -> "GenerativeBenchmark":
"""
Create a new benchmark instance with a random sample of the completed and
errored requests based on the given sample sizes. If the sample sizes are
larger than the total number of requests, the sample sizes are capped at
the total number of requests.
Set the sample size for the benchmark. This will randomly sample the
requests for each status type to the given sample size or the maximum
number of requests for that status type, whichever is smaller.
This is applied to requests.successful, requests.errored, and
requests.incomplete.
If None, no sampling is applied and the state is kept.

:param sample_size: The number of requests to sample for each status type.
:return: A new benchmark instance with the sampled requests.
:raises ValueError: If the sample sizes are negative.
:return: The benchmark with the sampled requests.
:raises ValueError: If the sample size is invalid.
"""
if sample_size < 0:
raise ValueError(f"Sample size must be non-negative, given {sample_size}")

sample_size = min(sample_size, len(self.requests.successful))
error_sample_size = min(sample_size, len(self.requests.errored))
incomplete_sample_size = min(sample_size, len(self.requests.incomplete))
if sample_size is not None:
if sample_size < 0 or not isinstance(sample_size, int):
raise ValueError(
f"Sample size must be non-negative integer, given {sample_size}"
)

sampled_instance = self.model_copy()
sampled_instance.requests.successful = random.sample(
self.requests.successful, sample_size
)
sampled_instance.requests.errored = random.sample(
self.requests.errored, error_sample_size
)
sampled_instance.requests.incomplete = random.sample(
self.requests.incomplete, incomplete_sample_size
)
sampled_instance.request_samples = StatusBreakdown(
successful=len(sampled_instance.requests.successful),
incomplete=len(sampled_instance.requests.incomplete),
errored=len(sampled_instance.requests.errored),
)
sample_size = min(sample_size, len(self.requests.successful))
error_sample_size = min(sample_size, len(self.requests.errored))
incomplete_sample_size = min(sample_size, len(self.requests.incomplete))

self.requests.successful = random.sample(
self.requests.successful, sample_size
)
self.requests.errored = random.sample(
self.requests.errored, error_sample_size
)
self.requests.incomplete = random.sample(
self.requests.incomplete, incomplete_sample_size
)
self.request_samples = StatusBreakdown(
successful=len(self.requests.successful),
incomplete=len(self.requests.incomplete),
errored=len(self.requests.errored),
)

return sampled_instance
return self

@staticmethod
def from_stats(
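
A small standalone sketch of the capping-and-sampling behavior the set_sample_size docstring describes, with plain lists standing in for the requests.successful, requests.errored, and requests.incomplete collections (the helper name is hypothetical):

import random
from typing import List, Optional

def sample_requests(requests: List[str], sample_size: Optional[int]) -> List[str]:
    # None keeps every request; otherwise the sample size is capped at the
    # number of requests available for that status type.
    if sample_size is None:
        return requests
    if not isinstance(sample_size, int) or sample_size < 0:
        raise ValueError(f"Sample size must be non-negative integer, given {sample_size}")
    return random.sample(requests, min(sample_size, len(requests)))

successful = [f"req-{i}" for i in range(10)]
errored = ["req-err-0", "req-err-1"]
print(len(sample_requests(successful, 5)))     # 5
print(len(sample_requests(errored, 5)))        # 2, capped at the available requests
print(len(sample_requests(successful, None)))  # 10, no sampling applied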
29 changes: 20 additions & 9 deletions src/guidellm/benchmark/entrypoints.py
@@ -1,17 +1,16 @@
from pathlib import Path
from typing import Any, Dict, Iterable, List, Literal, Optional, Union
from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union

from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
from transformers import ( # type: ignore[import]
PreTrainedTokenizerBase,
)

from guidellm.backend import Backend, BackendType
from guidellm.benchmark.benchmark import GenerativeBenchmark
from guidellm.benchmark.benchmarker import GenerativeBenchmarker
from guidellm.benchmark.output import (
GenerativeBenchmarksConsole,
save_generative_benchmarks,
GenerativeBenchmarksReport,
)
from guidellm.benchmark.profile import ProfileType, create_profile
from guidellm.benchmark.progress import GenerativeTextBenchmarkerProgressDisplay
@@ -48,8 +47,9 @@ async def benchmark_generative_text(
output_console: bool,
output_path: Optional[Union[str, Path]],
output_extras: Optional[Dict[str, Any]],
output_sampling: Optional[int],
random_seed: int,
) -> List[GenerativeBenchmark]:
) -> Tuple[GenerativeBenchmarksReport, Optional[Path]]:
console = GenerativeBenchmarksConsole(enabled=show_progress)
console.print_line("Creating backend...")
backend = Backend.create(
@@ -100,7 +100,7 @@ async def benchmark_generative_text(
if show_progress
else None
)
benchmarks = []
report = GenerativeBenchmarksReport()

async for result in benchmarker.run(
profile=profile,
@@ -115,15 +115,26 @@
if result.type_ == "benchmark_compiled":
if result.current_benchmark is None:
raise ValueError("Current benchmark is None")
benchmarks.append(result.current_benchmark)
report.benchmarks.append(
result.current_benchmark.set_sample_size(output_sampling)
)

if output_console:
console.benchmarks = benchmarks
orig_enabled = console.enabled
console.enabled = True
console.benchmarks = report.benchmarks
console.print_benchmarks_metadata()
console.print_benchmarks_info()
console.print_benchmarks_stats()
console.enabled = orig_enabled

if output_path:
save_generative_benchmarks(benchmarks=benchmarks, path=output_path)
console.print_line("\nSaving benchmarks report...")
saved_path = report.save_file(output_path)
console.print_line(f"Benchmarks report saved to {saved_path}")
else:
saved_path = None

return benchmarks
console.print_line("\nBenchmarking complete.")

return report, saved_path
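
Finally, a hedged sketch of the report-assembly pattern the entrypoint now follows; the loop body is left as a comment because the benchmark objects come from the benchmarker run, and inferring the output format from the file extension is an assumption:

from guidellm.benchmark import GenerativeBenchmarksReport

report = GenerativeBenchmarksReport()
# For each compiled benchmark yielded by the run, optionally down-sample it and
# collect it into the report (an output_sampling of None keeps every request):
#     report.benchmarks.append(benchmark.set_sample_size(output_sampling))
saved_path = report.save_file("benchmarks.json")  # format assumed to follow the extension
print(f"Benchmarks report saved to {saved_path}")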