Commit 24f753a

Finalize implementation, fix bugs, and ensure unit tests are passing / in place
1 parent 31239d9 commit 24f753a

22 files changed, +1177 −1288 lines changed

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -49,12 +49,14 @@ dev = [
     "tox~=4.16.0",
 
     # testing
+    "lorem~=0.1.1",
     "pytest~=8.2.2",
     "pytest-asyncio~=0.23.8",
     "pytest-cov~=5.0.0",
     "pytest-mock~=3.14.0",
     "pytest-rerunfailures~=14.0",
     "requests-mock~=1.12.1",
+    "respx~=0.22.0",
 
     # code quality
     "mypy~=1.10.1",

src/guidellm/backend/__init__.py

Lines changed: 10 additions & 10 deletions
@@ -1,21 +1,21 @@
 from .backend import (
     Backend,
     BackendType,
-    StreamingRequestArgs,
-    StreamingResponse,
-    StreamingResponseTimings,
-    StreamingResponseType,
-    StreamingTextResponseStats,
 )
 from .openai import OpenAIHTTPBackend
+from .response import (
+    RequestArgs,
+    ResponseSummary,
+    StreamingResponseType,
+    StreamingTextResponse,
+)
 
 __all__ = [
+    "StreamingResponseType",
+    "StreamingTextResponse",
+    "RequestArgs",
+    "ResponseSummary",
     "Backend",
     "BackendType",
-    "StreamingResponseType",
-    "StreamingRequestArgs",
-    "StreamingResponseTimings",
-    "StreamingTextResponseStats",
-    "StreamingResponse",
     "OpenAIHTTPBackend",
 ]
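The net effect on the package's public surface: the response models now live in guidellm.backend.response but remain importable from guidellm.backend, as this quick sketch of the re-exported names shows.

```python
# Every name below is listed in the updated __all__ and re-exported by the package.
from guidellm.backend import (
    Backend,
    BackendType,
    OpenAIHTTPBackend,
    RequestArgs,
    ResponseSummary,
    StreamingResponseType,
    StreamingTextResponse,
)
```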

src/guidellm/backend/backend.py

Lines changed: 18 additions & 168 deletions
@@ -5,172 +5,17 @@
 
 from loguru import logger
 from PIL import Image
-from pydantic import BaseModel, Field, computed_field
 
-from guidellm.config import settings
+from guidellm.backend.response import ResponseSummary, StreamingTextResponse
 
 __all__ = [
     "Backend",
     "BackendType",
-    "StreamingResponseType",
-    "StreamingRequestArgs",
-    "StreamingResponseTimings",
-    "StreamingTextResponseStats",
-    "StreamingResponse",
 ]
 
 
 BackendType = Literal["openai_http"]
 
-StreamingResponseType = Literal["start", "iter", "final"]
-
-
-class StreamingRequestArgs(BaseModel):
-    """
-    A model representing the arguments for a streaming request to a backend.
-    Biases towards an HTTP request, but can be used for other types of backends.
-
-    :param target: The target URL or function for the request.
-    :param headers: The headers, if any, included in the request such as authorization.
-    :param payload: The payload / arguments for the request including the prompt /
-        content and other configurations.
-    :param timeout: The timeout for the request in seconds, if any.
-    :param http2: Whether HTTP/2 was used for the request, if applicable.
-    """
-
-    target: str
-    headers: Dict[str, str]
-    payload: Dict[str, Any]
-    timeout: Optional[float] = None
-    http2: Optional[bool] = None
-
-
-class StreamingResponseTimings(BaseModel):
-    """
-    A model representing the performance timings for a streaming response
-    from a backend. Includes the start time of the request, the end time of
-    the request if completed, the delta time for the latest iteration,
-    and the list of timing values for each iteration.
-
-    :param request_start: The absolute start time of the request in seconds.
-    :param values: The list of absolute timing values for each iteration in seconds,
-        if any have occurred so far.
-        The first value is the time the first token was received.
-        The last value is the time the last token was received.
-        All values in between are the times each iteration was received, which
-        may or may not correspond to a token depending on the backend's implementation.
-    :param request_end: The absolute end time of the request in seconds, if completed.
-    :param delta: The time in seconds for the latest iteration, if any.
-    """
-
-    request_start: Optional[float] = None
-    values: List[float] = Field(default_factory=list)
-    request_end: Optional[float] = None
-    delta: Optional[float] = None
-
-
-class StreamingTextResponseStats(BaseModel):
-    """
-    A model representing the statistics for a streaming text response from a backend.
-    request_* values are the numbers passed in to the backend's request implementation,
-    including any measured prompt_tokens along with the number of output_tokens that
-    were requested. response_* values are the numbers returned from the backend's
-    response implementation, if any, including any measured prompt_tokens along with
-    the number of output_tokens that were returned.
-
-    :param request_prompt_tokens: The number of prompt tokens requested for the request.
-    :param request_output_tokens: The number of output tokens requested for the request.
-    :param response_prompt_tokens: The number of prompt tokens returned in the response.
-    :param response_output_tokens: The number of output tokens returned in the response.
-    :param response_stream_iterations: The number of iterations that have been returned
-        from the backend so far, or if at the end, the total number of iterations that
-        were returned.
-    """
-
-    request_prompt_tokens: Optional[int] = None
-    request_output_tokens: Optional[int] = None
-    response_prompt_tokens: Optional[int] = None
-    response_output_tokens: Optional[int] = None
-    response_stream_iterations: int = 0
-
-    @computed_field  # type: ignore[misc]
-    @property
-    def prompt_tokens_count(self) -> Optional[int]:
-        if settings.preferred_prompt_tokens_source == "backend":
-            if not self.response_prompt_tokens:
-                logger.warning(
-                    "preferred_prompt_tokens_source is set to 'backend', "
-                    " but no prompt tokens were returned by the backend. "
-                    "Falling back to local, if available."
-                )
-            return self.response_prompt_tokens or self.request_prompt_tokens
-
-        if settings.preferred_prompt_tokens_source == "local":
-            if not self.request_prompt_tokens:
-                logger.warning(
-                    "preferred_prompt_tokens_source is set to 'local', "
-                    "but no prompt tokens were provided in the request. "
-                    "Falling back to backend, if available."
-                )
-            return self.request_prompt_tokens or self.response_prompt_tokens
-
-        return self.response_prompt_tokens or self.request_prompt_tokens
-
-    @computed_field  # type: ignore[misc]
-    @property
-    def output_tokens_count(self) -> Optional[int]:
-        if settings.preferred_output_tokens_source == "backend":
-            if not self.response_output_tokens:
-                logger.warning(
-                    "preferred_output_tokens_source is set to 'backend', "
-                    "but no output tokens were returned by the backend. "
-                    "Falling back to local, if available."
-                )
-            return self.response_output_tokens or self.request_output_tokens
-
-        if settings.preferred_output_tokens_source == "local":
-            if not self.request_output_tokens:
-                logger.warning(
-                    "preferred_output_tokens_source is set to 'local', "
-                    "but no output tokens were provided in the request. "
-                    "Falling back to backend, if available."
-                )
-            return self.request_output_tokens or self.response_output_tokens
-
-        return self.response_output_tokens or self.request_output_tokens
-
-
-class StreamingResponse(BaseModel):
-    """
-    A model representing a response from a streaming request to a backend.
-    Includes the type of response, the request arguments, the performance timings,
-    the statistics, the delta time for the latest iteration,
-    and the content of the response.
-
-    :param type_: The type of response, either 'start' for the initial response,
-        'iter' for intermediate streaming output, or 'final' for the final result.
-        The response cycle from a backend will always start with a 'start' response,
-        followed by zero or more 'iter' responses, and ending with a 'final' response.
-    :param id_: The unique identifier for the request, if any.
-        Used for tracking purposes.
-    :param request_args: The arguments for the request that generated this response.
-    :param timings: The performance timings for the response.
-    :param stats: The statistics for the response.
-    :param delta: The delta content for the latest iteration, if any.
-    :param content: The returned content for the response, continuously appended to for
-        each iteration.
-    """
-
-    type_: StreamingResponseType = "start"
-    id_: Optional[str] = None
-    request_args: StreamingRequestArgs
-    timings: StreamingResponseTimings = Field(default_factory=StreamingResponseTimings)
-    stats: StreamingTextResponseStats = Field(
-        default_factory=StreamingTextResponseStats
-    )
-    delta: Any = None
-    content: Any = ""
-
 
 class Backend(ABC):
     """
@@ -196,6 +41,11 @@ def register(cls, backend_type: BackendType):
         :return: The decorated backend class.
         :rtype: Type[Backend]
         """
+        if backend_type in cls._registry:
+            raise ValueError(f"Backend type already registered: {backend_type}")
+
+        if not issubclass(cls, Backend):
+            raise TypeError("Only subclasses of Backend can be registered")
 
         def inner_wrapper(wrapped_class: Type["Backend"]):
             cls._registry[backend_type] = wrapped_class
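The two new guards make register fail fast on a duplicate backend type or a non-Backend caller. A small, hypothetical pytest sketch of the duplicate-type case, assuming OpenAIHTTPBackend has already claimed "openai_http" by the time guidellm.backend is imported:

```python
# Hypothetical test sketch; assumes "openai_http" is already present in
# Backend._registry because OpenAIHTTPBackend registers itself on import.
import pytest

from guidellm.backend import Backend


def test_duplicate_backend_type_is_rejected():
    with pytest.raises(ValueError):

        @Backend.register("openai_http")
        class DuplicateBackend(Backend):
            ...
```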
@@ -295,11 +145,11 @@ def available_models(self) -> List[str]:
     async def text_completions(
         self,
         prompt: Union[str, List[str]],
-        id_: Optional[str] = None,
+        request_id: Optional[str] = None,
         prompt_token_count: Optional[int] = None,
         output_token_count: Optional[int] = None,
         **kwargs,
-    ) -> AsyncGenerator[StreamingResponse, None]:
+    ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
         """
         Generate text only completions for the given prompt.
         Does not support multiple modalities, complicated chat interfaces,
@@ -308,16 +158,16 @@ async def text_completions(
         :param prompt: The prompt (or list of prompts) to generate a completion for.
             If a list is supplied, these are concatenated and run through the model
             for a single prompt.
-        :param id_: The unique identifier for the request, if any.
+        :param request_id: The unique identifier for the request, if any.
             Added to logging statements and the response for tracking purposes.
         :param prompt_token_count: The number of tokens measured in the prompt, if any.
             Returned in the response stats for later analysis, if applicable.
         :param output_token_count: If supplied, the number of tokens to enforce
             generation of for the output for this request.
         :param kwargs: Additional keyword arguments to pass with the request.
-        :return: An async generator that yields StreamingResponse objects containing the
-            response content. Will always start with a 'start' response,
-            followed by 0 or more 'iter' responses, and ending with a 'final' response.
+        :return: An async generator that yields a StreamingTextResponse for start,
+            a StreamingTextResponse for each received iteration,
+            and a ResponseSummary for the final response.
         """
         ...
 
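Based only on the new signature and docstring above, callers can now tell streaming iterations apart from the final summary by type. A hedged consumption sketch (the backend constructor arguments are placeholders, not taken from this commit):

```python
# Illustrative only: the OpenAIHTTPBackend constructor arguments are assumed
# placeholders; the isinstance branching follows the documented yield order.
from guidellm.backend import OpenAIHTTPBackend, ResponseSummary, StreamingTextResponse


async def run_prompt() -> None:
    backend = OpenAIHTTPBackend(target="http://localhost:8000")  # hypothetical args
    async for response in backend.text_completions(
        prompt="Write a haiku about latency.",
        request_id="req-123",
        output_token_count=64,
    ):
        if isinstance(response, StreamingTextResponse):
            ...  # the 'start' marker or an intermediate streamed iteration
        elif isinstance(response, ResponseSummary):
            ...  # the final summary for the whole request
```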

@@ -329,12 +179,12 @@ async def chat_completions(
             List[Union[str, Dict[str, Union[str, Dict[str, str]]], Path, Image.Image]],
             Any,
         ],
-        id_: Optional[str] = None,
+        request_id: Optional[str] = None,
         prompt_token_count: Optional[int] = None,
         output_token_count: Optional[int] = None,
         raw_content: bool = False,
         **kwargs,
-    ) -> AsyncGenerator[StreamingResponse, None]:
+    ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
         """
         Generate chat completions for the given content.
         Supports multiple modalities, complicated chat interfaces, and chat templates.
@@ -359,15 +209,15 @@
                 "input_audio": {"data": f"{base64_bytes}", "format": "wav}].
             Additionally, if raw_content=True then the content is passed directly to the
             backend without any processing.
-        :param id_: The unique identifier for the request, if any.
+        :param request_id: The unique identifier for the request, if any.
             Added to logging statements and the response for tracking purposes.
         :param prompt_token_count: The number of tokens measured in the prompt, if any.
             Returned in the response stats for later analysis, if applicable.
         :param output_token_count: If supplied, the number of tokens to enforce
             generation of for the output for this request.
         :param kwargs: Additional keyword arguments to pass with the request.
-        :return: An async generator that yields StreamingResponse objects containing the
-            response content. Will always start with a 'start' response,
-            followed by 0 or more 'iter' responses, and ending with a 'final' response.
+        :return: An async generator that yields a StreamingTextResponse for start,
+            a StreamingTextResponse for each received iteration,
+            and a ResponseSummary for the final response.
         """
         ...
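For the multimodal content forms referenced in the docstring context above, a rough sketch of what a chat_completions payload might look like; the content-dict schema is inferred from the docstring fragment (and the OpenAI-style chat format), not verified against the backend:

```python
# Sketch of a mixed-modality content list; the dict shapes are assumptions based
# on the docstring excerpt above, and the file path is a placeholder.
import base64
from pathlib import Path

audio_b64 = base64.b64encode(Path("clip.wav").read_bytes()).decode()

content = [
    "Describe the attached image and audio clip.",
    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    {"type": "input_audio", "input_audio": {"data": audio_b64, "format": "wav"}},
]

# Consumed like text_completions: StreamingTextResponse iterations, then a
# final ResponseSummary.
# async for response in backend.chat_completions(content, request_id="req-456"):
#     ...
```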

0 commit comments
