@@ -33,17 +33,18 @@ def tokenizer_mock():
33
33
)
34
34
return tokenizer
35
35
36
+
36
37
@pytest .mark .smoke
37
38
@patch (f"{ process_dataset .__module__ } .guidellm_load_dataset" )
38
39
@patch (f"{ process_dataset .__module__ } .check_load_processor" )
39
40
@patch (f"{ process_dataset .__module__ } .Dataset" )
40
41
@patch (f"{ process_dataset .__module__ } .IntegerRangeSampler" )
41
42
def test_strategy_handler_called (
42
- mock_sampler ,
43
- mock_dataset_class ,
44
- mock_check_processor ,
45
- mock_load_dataset ,
46
- tokenizer_mock ,
43
+ mock_sampler ,
44
+ mock_dataset_class ,
45
+ mock_check_processor ,
46
+ mock_load_dataset ,
47
+ tokenizer_mock ,
47
48
):
48
49
mock_handler = MagicMock (return_value = "processed_prompt" )
49
50
with patch .dict (STRATEGY_HANDLERS , {ShortPromptStrategy .IGNORE : mock_handler }):
@@ -68,18 +69,21 @@ def test_strategy_handler_called(
68
69
mock_load_dataset .assert_called_once ()
69
70
mock_check_processor .assert_called_once ()
70
71
72
+
71
73
@pytest.mark.sanity
def test_handle_ignore_strategy_too_short(tokenizer_mock):
    """Ignore strategy yields ``None`` for the prompt "short" with a limit of 10.

    Also checks that the tokenizer's ``encode`` was last called with the prompt.
    """
    prompt = "short"
    outcome = handle_ignore_strategy(prompt, 10, tokenizer_mock)
    tokenizer_mock.encode.assert_called_with(prompt)
    assert outcome is None
76
78
79
+
77
80
@pytest.mark.sanity
def test_handle_ignore_strategy_sufficient_length(tokenizer_mock):
    """Ignore strategy returns "long prompt" unchanged with a limit of 5.

    Also checks that the tokenizer's ``encode`` was last called with the prompt.
    """
    prompt = "long prompt"
    outcome = handle_ignore_strategy(prompt, 5, tokenizer_mock)
    tokenizer_mock.encode.assert_called_with(prompt)
    assert outcome == "long prompt"
82
85
86
+
83
87
@pytest .mark .sanity
84
88
def test_handle_concatenate_strategy_enough_prompts (tokenizer_mock ):
85
89
dataset_iter = iter ([{"prompt" : "longer" }])
@@ -88,6 +92,7 @@ def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock):
88
92
)
89
93
assert result == "short\n longer"
90
94
95
+
91
96
@pytest .mark .sanity
92
97
def test_handle_concatenate_strategy_not_enough_prompts (tokenizer_mock ):
93
98
dataset_iter : Iterator = iter ([])
@@ -96,35 +101,39 @@ def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock):
96
101
)
97
102
assert result is None
98
103
104
+
99
105
@pytest.mark.sanity
def test_handle_pad_strategy(tokenizer_mock):
    """Pad strategy turns "short" into "shortppppp" for a limit of 10 and pad "p"."""
    prompt, pad_char = "short", "p"
    padded = handle_pad_strategy(prompt, 10, tokenizer_mock, pad_char)
    assert padded == "shortppppp"
103
109
110
+
104
111
@pytest.mark.sanity
def test_handle_error_strategy_valid_prompt(tokenizer_mock):
    """Error strategy returns "valid prompt" unchanged with a limit of 5.

    Also checks that the tokenizer's ``encode`` was last called with the prompt.
    """
    prompt = "valid prompt"
    outcome = handle_error_strategy(prompt, 5, tokenizer_mock)
    tokenizer_mock.encode.assert_called_with(prompt)
    assert outcome == "valid prompt"
109
116
117
+
110
118
@pytest.mark.sanity
def test_handle_error_strategy_too_short_prompt(tokenizer_mock):
    """Error strategy raises ``PromptTooShortError`` for "short" with a limit of 10."""
    prompt = "short"
    expect_too_short = pytest.raises(PromptTooShortError)
    with expect_too_short:
        handle_error_strategy(prompt, 10, tokenizer_mock)
114
122
123
+
115
124
@pytest .mark .smoke
116
125
@patch ("guidellm.preprocess.dataset.save_dataset_to_file" )
117
126
@patch ("guidellm.preprocess.dataset.Dataset" )
118
127
@patch ("guidellm.preprocess.dataset.guidellm_load_dataset" )
119
128
@patch ("guidellm.preprocess.dataset.check_load_processor" )
120
129
@patch ("guidellm.preprocess.dataset.IntegerRangeSampler" )
121
130
def test_process_dataset_non_empty (
122
- mock_sampler ,
123
- mock_check_processor ,
124
- mock_load_dataset ,
125
- mock_dataset_class ,
126
- mock_save_to_file ,
127
- tokenizer_mock ,
131
+ mock_sampler ,
132
+ mock_check_processor ,
133
+ mock_load_dataset ,
134
+ mock_dataset_class ,
135
+ mock_save_to_file ,
136
+ tokenizer_mock ,
128
137
):
129
138
from guidellm .preprocess .dataset import process_dataset
130
139
@@ -159,17 +168,18 @@ def test_process_dataset_non_empty(
159
168
assert "output_tokens_count" in item
160
169
assert len (tokenizer_mock .encode (item ["prompt" ])) <= 3
161
170
171
+
162
172
@pytest .mark .sanity
163
173
@patch (f"{ process_dataset .__module__ } .Dataset" )
164
174
@patch (f"{ process_dataset .__module__ } .guidellm_load_dataset" )
165
175
@patch (f"{ process_dataset .__module__ } .check_load_processor" )
166
176
@patch (f"{ process_dataset .__module__ } .IntegerRangeSampler" )
167
177
def test_process_dataset_empty_after_processing (
168
- mock_sampler ,
169
- mock_check_processor ,
170
- mock_load_dataset ,
171
- mock_dataset_class ,
172
- tokenizer_mock ,
178
+ mock_sampler ,
179
+ mock_check_processor ,
180
+ mock_load_dataset ,
181
+ mock_dataset_class ,
182
+ tokenizer_mock ,
173
183
):
174
184
mock_dataset = [{"prompt" : "" }]
175
185
mock_load_dataset .return_value = (mock_dataset , {"prompt_column" : "prompt" })
@@ -188,19 +198,20 @@ def test_process_dataset_empty_after_processing(
188
198
mock_check_processor .assert_called_once ()
189
199
mock_dataset_class .from_list .assert_not_called ()
190
200
201
+
191
202
@pytest .mark .smoke
192
203
@patch (f"{ process_dataset .__module__ } .push_dataset_to_hub" )
193
204
@patch (f"{ process_dataset .__module__ } .Dataset" )
194
205
@patch (f"{ process_dataset .__module__ } .guidellm_load_dataset" )
195
206
@patch (f"{ process_dataset .__module__ } .check_load_processor" )
196
207
@patch (f"{ process_dataset .__module__ } .IntegerRangeSampler" )
197
208
def test_process_dataset_push_to_hub_called (
198
- mock_sampler ,
199
- mock_check_processor ,
200
- mock_load_dataset ,
201
- mock_dataset_class ,
202
- mock_push ,
203
- tokenizer_mock ,
209
+ mock_sampler ,
210
+ mock_check_processor ,
211
+ mock_load_dataset ,
212
+ mock_dataset_class ,
213
+ mock_push ,
214
+ tokenizer_mock ,
204
215
):
205
216
mock_dataset = [{"prompt" : "abc" }]
206
217
mock_load_dataset .return_value = (mock_dataset , {"prompt_column" : "prompt" })
@@ -221,19 +232,20 @@ def test_process_dataset_push_to_hub_called(
221
232
)
222
233
mock_push .assert_called_once_with ("id123" , mock_dataset_obj )
223
234
235
+
224
236
@pytest .mark .sanity
225
237
@patch (f"{ process_dataset .__module__ } .push_dataset_to_hub" )
226
238
@patch (f"{ process_dataset .__module__ } .Dataset" )
227
239
@patch (f"{ process_dataset .__module__ } .guidellm_load_dataset" )
228
240
@patch (f"{ process_dataset .__module__ } .check_load_processor" )
229
241
@patch (f"{ process_dataset .__module__ } .IntegerRangeSampler" )
230
242
def test_process_dataset_push_to_hub_not_called (
231
- mock_sampler ,
232
- mock_check_processor ,
233
- mock_load_dataset ,
234
- mock_dataset_class ,
235
- mock_push ,
236
- tokenizer_mock ,
243
+ mock_sampler ,
244
+ mock_check_processor ,
245
+ mock_load_dataset ,
246
+ mock_dataset_class ,
247
+ mock_push ,
248
+ tokenizer_mock ,
237
249
):
238
250
mock_dataset = [{"prompt" : "abc" }]
239
251
mock_load_dataset .return_value = (mock_dataset , {"prompt_column" : "prompt" })
@@ -253,13 +265,15 @@ def test_process_dataset_push_to_hub_not_called(
253
265
)
254
266
mock_push .assert_not_called ()
255
267
268
+
256
269
@pytest.mark.regression
def test_push_dataset_to_hub_success():
    """Check that ``push_dataset_to_hub`` pushes with the id and the HF_TOKEN value.

    With ``HF_TOKEN`` set to ``"token"``, the mocked dataset's ``push_to_hub``
    must be called exactly once with ``"dataset_id"`` and ``token="token"``.
    """
    mock_dataset = MagicMock(spec=Dataset)
    # patch.dict restores os.environ on exit, so the fake token does not leak
    # into the rest of the test session.
    with patch.dict(os.environ, {"HF_TOKEN": "token"}):
        push_dataset_to_hub("dataset_id", mock_dataset)
    mock_dataset.push_to_hub.assert_called_once_with("dataset_id", token="token")
262
275
276
+
263
277
@pytest .mark .regression
264
278
def test_push_dataset_to_hub_error_no_env ():
265
279
if "HF_TOKEN" in os .environ :
@@ -268,13 +282,15 @@ def test_push_dataset_to_hub_error_no_env():
268
282
with pytest .raises (ValueError , match = "hub_dataset_id and HF_TOKEN" ):
269
283
push_dataset_to_hub ("dataset_id" , mock_dataset )
270
284
285
+
271
286
@pytest.mark.regression
def test_push_dataset_to_hub_error_no_id():
    """Check that ``push_dataset_to_hub`` rejects a ``None`` dataset id.

    Even with ``HF_TOKEN`` set, passing ``None`` as the id must raise a
    ``ValueError`` whose message matches ``"hub_dataset_id and HF_TOKEN"``.
    """
    mock_dataset = MagicMock(spec=Dataset)
    # patch.dict restores os.environ on exit, so the fake token does not leak
    # into the rest of the test session.
    with patch.dict(os.environ, {"HF_TOKEN": "token"}):
        with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"):
            push_dataset_to_hub(None, mock_dataset)
277
292
293
+
278
294
@pytest .mark .regression
279
295
@patch .object (Path , "mkdir" )
280
296
def test_save_dataset_to_file_csv (mock_mkdir ):
@@ -284,6 +300,7 @@ def test_save_dataset_to_file_csv(mock_mkdir):
284
300
mock_dataset .to_csv .assert_called_once_with (output_path )
285
301
mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
286
302
303
+
287
304
@pytest .mark .regression
288
305
@patch .object (Path , "mkdir" )
289
306
def test_save_dataset_to_file_csv_capitalized (mock_mkdir ):
@@ -293,6 +310,7 @@ def test_save_dataset_to_file_csv_capitalized(mock_mkdir):
293
310
mock_dataset .to_csv .assert_called_once_with (output_path )
294
311
mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
295
312
313
+
296
314
@pytest .mark .regression
297
315
@patch .object (Path , "mkdir" )
298
316
def test_save_dataset_to_file_json (mock_mkdir ):
@@ -302,6 +320,7 @@ def test_save_dataset_to_file_json(mock_mkdir):
302
320
mock_dataset .to_json .assert_called_once_with (output_path )
303
321
mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
304
322
323
+
305
324
@pytest .mark .regression
306
325
@patch .object (Path , "mkdir" )
307
326
def test_save_dataset_to_file_json_capitalized (mock_mkdir ):
@@ -311,6 +330,7 @@ def test_save_dataset_to_file_json_capitalized(mock_mkdir):
311
330
mock_dataset .to_json .assert_called_once_with (output_path )
312
331
mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
313
332
333
+
314
334
@pytest .mark .regression
315
335
@patch .object (Path , "mkdir" )
316
336
def test_save_dataset_to_file_jsonl (mock_mkdir ):
@@ -320,6 +340,7 @@ def test_save_dataset_to_file_jsonl(mock_mkdir):
320
340
mock_dataset .to_json .assert_called_once_with (output_path )
321
341
mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
322
342
343
+
323
344
@pytest .mark .regression
324
345
@patch .object (Path , "mkdir" )
325
346
def test_save_dataset_to_file_jsonl_capitalized (mock_mkdir ):
@@ -329,6 +350,7 @@ def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir):
329
350
mock_dataset .to_json .assert_called_once_with (output_path )
330
351
mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
331
352
353
+
332
354
@pytest .mark .regression
333
355
@patch .object (Path , "mkdir" )
334
356
def test_save_dataset_to_file_parquet (mock_mkdir ):
@@ -338,6 +360,7 @@ def test_save_dataset_to_file_parquet(mock_mkdir):
338
360
mock_dataset .to_parquet .assert_called_once_with (output_path )
339
361
mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
340
362
363
+
341
364
@pytest .mark .regression
342
365
@patch .object (Path , "mkdir" )
343
366
def test_save_dataset_to_file_unsupported_type (mock_mkdir ):
0 commit comments