 from loguru import logger
 from PIL import Image
-from pydantic import BaseModel, Field, computed_field
-from guidellm.config import settings
+from guidellm.backend.response import ResponseSummary, StreamingTextResponse
 
 
 __all__ = [
     "Backend",
     "BackendType",
-    "StreamingResponseType",
-    "StreamingRequestArgs",
-    "StreamingResponseTimings",
-    "StreamingTextResponseStats",
-    "StreamingResponse",
 ]
 
 
 BackendType = Literal["openai_http"]
 
-StreamingResponseType = Literal["start", "iter", "final"]
-
-
-class StreamingRequestArgs(BaseModel):
-    """
-    A model representing the arguments for a streaming request to a backend.
-    Biases towards an HTTP request, but can be used for other types of backends.
-
-    :param target: The target URL or function for the request.
-    :param headers: The headers, if any, included in the request such as authorization.
-    :param payload: The payload / arguments for the request including the prompt /
-        content and other configurations.
-    :param timeout: The timeout for the request in seconds, if any.
-    :param http2: Whether HTTP/2 was used for the request, if applicable.
-    """
-
-    target: str
-    headers: Dict[str, str]
-    payload: Dict[str, Any]
-    timeout: Optional[float] = None
-    http2: Optional[bool] = None
-
-
-class StreamingResponseTimings(BaseModel):
-    """
-    A model representing the performance timings for a streaming response
-    from a backend. Includes the start time of the request, the end time of
-    the request if completed, the delta time for the latest iteration,
-    and the list of timing values for each iteration.
-
-    :param request_start: The absolute start time of the request in seconds.
-    :param values: The list of absolute timing values for each iteration in seconds,
-        if any have occurred so far.
-        The first value is the time the first token was received.
-        The last value is the time the last token was received.
-        All values in between are the times each iteration was received, which
-        may or may not correspond to a token depending on the backend's implementation.
-    :param request_end: The absolute end time of the request in seconds, if completed.
-    :param delta: The time in seconds for the latest iteration, if any.
-    """
-
-    request_start: Optional[float] = None
-    values: List[float] = Field(default_factory=list)
-    request_end: Optional[float] = None
-    delta: Optional[float] = None
-
-
-class StreamingTextResponseStats(BaseModel):
-    """
-    A model representing the statistics for a streaming text response from a backend.
-    request_* values are the numbers passed in to the backend's request implementation,
-    including any measured prompt_tokens along with the number of output_tokens that
-    were requested. response_* values are the numbers returned from the backend's
-    response implementation, if any, including any measured prompt_tokens along with
-    the number of output_tokens that were returned.
-
-    :param request_prompt_tokens: The number of prompt tokens requested for the request.
-    :param request_output_tokens: The number of output tokens requested for the request.
-    :param response_prompt_tokens: The number of prompt tokens returned in the response.
-    :param response_output_tokens: The number of output tokens returned in the response.
-    :param response_stream_iterations: The number of iterations that have been returned
-        from the backend so far, or if at the end, the total number of iterations that
-        were returned.
-    """
-
-    request_prompt_tokens: Optional[int] = None
-    request_output_tokens: Optional[int] = None
-    response_prompt_tokens: Optional[int] = None
-    response_output_tokens: Optional[int] = None
-    response_stream_iterations: int = 0
-
-    @computed_field  # type: ignore[misc]
-    @property
-    def prompt_tokens_count(self) -> Optional[int]:
-        if settings.preferred_prompt_tokens_source == "backend":
-            if not self.response_prompt_tokens:
-                logger.warning(
-                    "preferred_prompt_tokens_source is set to 'backend', "
-                    "but no prompt tokens were returned by the backend. "
-                    "Falling back to local, if available."
-                )
-            return self.response_prompt_tokens or self.request_prompt_tokens
-
-        if settings.preferred_prompt_tokens_source == "local":
-            if not self.request_prompt_tokens:
-                logger.warning(
-                    "preferred_prompt_tokens_source is set to 'local', "
-                    "but no prompt tokens were provided in the request. "
-                    "Falling back to backend, if available."
-                )
-            return self.request_prompt_tokens or self.response_prompt_tokens
-
-        return self.response_prompt_tokens or self.request_prompt_tokens
-
-    @computed_field  # type: ignore[misc]
-    @property
-    def output_tokens_count(self) -> Optional[int]:
-        if settings.preferred_output_tokens_source == "backend":
-            if not self.response_output_tokens:
-                logger.warning(
-                    "preferred_output_tokens_source is set to 'backend', "
-                    "but no output tokens were returned by the backend. "
-                    "Falling back to local, if available."
-                )
-            return self.response_output_tokens or self.request_output_tokens
-
-        if settings.preferred_output_tokens_source == "local":
-            if not self.request_output_tokens:
-                logger.warning(
-                    "preferred_output_tokens_source is set to 'local', "
-                    "but no output tokens were provided in the request. "
-                    "Falling back to backend, if available."
-                )
-            return self.request_output_tokens or self.response_output_tokens
-
-        return self.response_output_tokens or self.request_output_tokens
-
-
-class StreamingResponse(BaseModel):
-    """
-    A model representing a response from a streaming request to a backend.
-    Includes the type of response, the request arguments, the performance timings,
-    the statistics, the delta time for the latest iteration,
-    and the content of the response.
-
-    :param type_: The type of response, either 'start' for the initial response,
-        'iter' for intermediate streaming output, or 'final' for the final result.
-        The response cycle from a backend will always start with a 'start' response,
-        followed by zero or more 'iter' responses, and ending with a 'final' response.
-    :param id_: The unique identifier for the request, if any.
-        Used for tracking purposes.
-    :param request_args: The arguments for the request that generated this response.
-    :param timings: The performance timings for the response.
-    :param stats: The statistics for the response.
-    :param delta: The delta content for the latest iteration, if any.
-    :param content: The returned content for the response, continuously appended to for
-        each iteration.
-    """
-
-    type_: StreamingResponseType = "start"
-    id_: Optional[str] = None
-    request_args: StreamingRequestArgs
-    timings: StreamingResponseTimings = Field(default_factory=StreamingResponseTimings)
-    stats: StreamingTextResponseStats = Field(
-        default_factory=StreamingTextResponseStats
-    )
-    delta: Any = None
-    content: Any = ""
-
 
 
 class Backend(ABC):
     """
@@ -196,6 +41,11 @@ def register(cls, backend_type: BackendType):
         :return: The decorated backend class.
         :rtype: Type[Backend]
         """
+        if backend_type in cls._registry:
+            raise ValueError(f"Backend type already registered: {backend_type}")
+
+        if not issubclass(cls, Backend):
+            raise TypeError("Only subclasses of Backend can be registered")
 
         def inner_wrapper(wrapped_class: Type["Backend"]):
             cls._registry[backend_type] = wrapped_class
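For reference, the decorator is used roughly as follows. This is a hedged sketch: the subclass name and body are illustrative, though `"openai_http"` is the one `BackendType` this module declares.

```python
# Hedged sketch of Backend.register as a class decorator; assumes Backend is
# importable from this module. The subclass name and body are placeholders.
@Backend.register("openai_http")
class ExampleOpenAIHTTPBackend(Backend):
    ...  # abstract methods elided for the sketch

# With the new guards, repeating the registration now fails fast:
#   ValueError: Backend type already registered: openai_http
```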
@@ -295,11 +145,11 @@ def available_models(self) -> List[str]:
     async def text_completions(
         self,
         prompt: Union[str, List[str]],
-        id_: Optional[str] = None,
+        request_id: Optional[str] = None,
         prompt_token_count: Optional[int] = None,
         output_token_count: Optional[int] = None,
         **kwargs,
-    ) -> AsyncGenerator[StreamingResponse, None]:
+    ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
         """
         Generate text only completions for the given prompt.
         Does not support multiple modalities, complicated chat interfaces,
@@ -308,16 +158,16 @@ async def text_completions(
         :param prompt: The prompt (or list of prompts) to generate a completion for.
             If a list is supplied, these are concatenated and run through the model
             for a single prompt.
-        :param id_: The unique identifier for the request, if any.
+        :param request_id: The unique identifier for the request, if any.
             Added to logging statements and the response for tracking purposes.
         :param prompt_token_count: The number of tokens measured in the prompt, if any.
             Returned in the response stats for later analysis, if applicable.
         :param output_token_count: If supplied, the number of tokens to enforce
             generation of for the output for this request.
         :param kwargs: Additional keyword arguments to pass with the request.
-        :return: An async generator that yields StreamingResponse objects containing the
-            response content. Will always start with a 'start' response,
-            followed by 0 or more 'iter' responses, and ending with a 'final' response.
+        :return: An async generator that yields a StreamingTextResponse for start,
+            a StreamingTextResponse for each received iteration,
+            and a ResponseSummary for the final response.
         """
         ...
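Under the new contract a caller discriminates the two yielded types rather than checking the old `type_` field. A hedged consumption sketch, assuming an already-constructed `Backend` instance and that `Backend` and `ResponseSummary` import from `guidellm.backend` as this diff suggests:

```python
# Hedged sketch: iterate the generator and split on the two yielded types.
# Field access on the responses is deliberately avoided, since their models
# live in guidellm.backend.response, outside this diff.
async def consume_text_completions(backend: Backend, prompt: str) -> None:
    async for response in backend.text_completions(prompt, request_id="example"):
        if isinstance(response, ResponseSummary):
            print("final:", response)  # last item: summary of the whole request
        else:  # StreamingTextResponse: the 'start' marker or a streamed iteration
            print("update:", response)
```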
@@ -329,12 +179,12 @@ async def chat_completions(
             List[Union[str, Dict[str, Union[str, Dict[str, str]]], Path, Image.Image]],
             Any,
         ],
-        id_: Optional[str] = None,
+        request_id: Optional[str] = None,
         prompt_token_count: Optional[int] = None,
         output_token_count: Optional[int] = None,
         raw_content: bool = False,
         **kwargs,
-    ) -> AsyncGenerator[StreamingResponse, None]:
+    ) -> AsyncGenerator[Union[StreamingTextResponse, ResponseSummary], None]:
         """
         Generate chat completions for the given content.
         Supports multiple modalities, complicated chat interfaces, and chat templates.
@@ -359,15 +209,15 @@ async def chat_completions(
             "input_audio": {"data": f"{base64_bytes}", "format": "wav"}].
         Additionally, if raw_content=True then the content is passed directly to the
         backend without any processing.
-        :param id_: The unique identifier for the request, if any.
+        :param request_id: The unique identifier for the request, if any.
             Added to logging statements and the response for tracking purposes.
         :param prompt_token_count: The number of tokens measured in the prompt, if any.
             Returned in the response stats for later analysis, if applicable.
         :param output_token_count: If supplied, the number of tokens to enforce
             generation of for the output for this request.
         :param kwargs: Additional keyword arguments to pass with the request.
-        :return: An async generator that yields StreamingResponse objects containing the
-            response content. Will always start with a 'start' response,
-            followed by 0 or more 'iter' responses, and ending with a 'final' response.
+        :return: An async generator that yields a StreamingTextResponse for start,
+            a StreamingTextResponse for each received iteration,
+            and a ResponseSummary for the final response.
         """
         ...
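A hedged sketch of a multimodal `chat_completions` call, mirroring the `input_audio` content shape the docstring shows; the audio bytes, request id, and prompt text are placeholders:

```python
import base64

# Hedged sketch of a multimodal chat_completions call. The input_audio dict
# mirrors the shape from the docstring; the audio payload is a placeholder,
# not real audio. Assumes Backend and ResponseSummary import as above.
async def consume_chat(backend: Backend) -> None:
    base64_bytes = base64.b64encode(b"...").decode()
    content = [
        "Transcribe the following audio:",
        {"input_audio": {"data": f"{base64_bytes}", "format": "wav"}},
    ]
    async for response in backend.chat_completions(content, request_id="chat-1"):
        if isinstance(response, ResponseSummary):
            print("final:", response)
```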