Skip to content

Commit c48472a

Browse files
committed
Fixed CR comments
1 parent 448d609 commit c48472a

File tree

6 files changed

+149
-125
lines changed

6 files changed

+149
-125
lines changed

src/guidellm/preprocess/dataset.py

Lines changed: 14 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -13,13 +13,7 @@
1313

1414
from guidellm.dataset import load_dataset as guidellm_load_dataset
1515
from guidellm.utils import IntegerRangeSampler, check_load_processor
16-
17-
SUPPORTED_TYPES = {
18-
".json",
19-
".jsonl",
20-
".csv",
21-
".parquet",
22-
}
16+
from guidellm.utils.hf_datasets import SUPPORTED_TYPES, save_dataset_to_file
2317

2418

2519
class PromptTooShortError(Exception):
@@ -94,6 +88,7 @@ def handle_pad_strategy(
9488
min_prompt_tokens: int,
9589
tokenizer: PreTrainedTokenizerBase,
9690
pad_char: str,
91+
pad_multiplier: int = 2,
9792
**_kwargs,
9893
) -> str:
9994
"""
@@ -103,13 +98,18 @@ def handle_pad_strategy(
10398
:param min_prompt_tokens: Desired minimum token count.
10499
:param tokenizer: Tokenizer used to count tokens.
105100
:param pad_char: Character used for padding.
101+
:param pad_multiplier: Multiplier for padding character length.
106102
:return: Padded prompt string.
107103
"""
108104

109-
while len(tokenizer.encode(current_prompt)) < min_prompt_tokens:
110-
current_prompt += pad_char
111-
return current_prompt
112-
105+
tokens = tokenizer.encode(current_prompt)
106+
pad_count = 1
107+
prompt = current_prompt
108+
while len(tokens) < min_prompt_tokens:
109+
prompt += pad_char * pad_count
110+
tokens = tokenizer.encode(prompt)
111+
pad_count *= pad_multiplier
112+
return prompt
113113

114114
def handle_error_strategy(
115115
current_prompt: str,
@@ -221,31 +221,6 @@ def parse_config_file(data: Union[str, Path]) -> "TokensConfig":
221221
return TokensConfig(**config_dict)
222222

223223

224-
def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None:
225-
"""
226-
Saves a HuggingFace Dataset to file in a supported format.
227-
228-
:param dataset: Dataset to save.
229-
:param output_path: Output file path (.json, .jsonl, .csv, .parquet).
230-
:raises ValueError: If the file extension is not supported.
231-
"""
232-
output_path = Path(output_path)
233-
output_path.parent.mkdir(parents=True, exist_ok=True)
234-
suffix = output_path.suffix.lower()
235-
236-
if suffix == ".csv":
237-
dataset.to_csv(output_path)
238-
elif suffix in {".json", ".jsonl"}:
239-
dataset.to_json(output_path)
240-
elif suffix == ".parquet":
241-
dataset.to_parquet(output_path)
242-
else:
243-
raise ValueError(
244-
f"Unsupported file suffix '{suffix}' in output_path'{output_path}'."
245-
f" Only {SUPPORTED_TYPES} are supported."
246-
)
247-
248-
249224
def _validate_output_suffix(output_path: Union[str, Path]) -> None:
250225
output_path = Path(output_path)
251226
suffix = output_path.suffix.lower()
@@ -351,8 +326,8 @@ def process_dataset(
351326
if prompt_text is None:
352327
continue
353328

354-
if len(tokenizer.encode(prompt_text)) > target_prompt_len:
355-
tokens = tokenizer.encode(prompt_text)
329+
tokens = tokenizer.encode(prompt_text)
330+
if len(tokens) > target_prompt_len:
356331
prompt_text = tokenizer.decode(tokens[:target_prompt_len])
357332

358333
processed_prompt = prompt_row.copy()
@@ -370,7 +345,7 @@ def process_dataset(
370345

371346
processed_dataset = Dataset.from_list(processed_prompts)
372347
save_dataset_to_file(processed_dataset, output_path)
373-
logger.info(f"Conversion complete. Dataset saved to: {output_path}")
348+
logger.info(f"Conversion completed. Dataset saved to: {output_path}")
374349

375350
if push_to_hub:
376351
push_dataset_to_hub(hub_dataset_id, processed_dataset)

src/guidellm/utils/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,8 @@
11
from .colors import Colors
2+
from .hf_datasets import (
3+
save_dataset_to_file,
4+
SUPPORTED_TYPES,
5+
)
26
from .hf_transformers import (
37
check_load_processor,
48
)
@@ -22,6 +26,8 @@
2226
"filter_text",
2327
"is_puncutation",
2428
"load_text",
29+
"save_dataset_to_file",
2530
"split_text",
2631
"split_text_list_by_length",
32+
"SUPPORTED_TYPES",
2733
]

src/guidellm/utils/hf_datasets.py

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,36 @@
1+
from pathlib import Path
2+
from typing import Union
3+
4+
from datasets import Dataset
5+
6+
SUPPORTED_TYPES = {
7+
".json",
8+
".jsonl",
9+
".csv",
10+
".parquet",
11+
}
12+
13+
14+
def save_dataset_to_file(dataset: Dataset, output_path: Union[str, Path]) -> None:
15+
"""
16+
Saves a HuggingFace Dataset to file in a supported format.
17+
18+
:param dataset: Dataset to save.
19+
:param output_path: Output file path (.json, .jsonl, .csv, .parquet).
20+
:raises ValueError: If the file extension is not supported.
21+
"""
22+
output_path = Path(output_path)
23+
output_path.parent.mkdir(parents=True, exist_ok=True)
24+
suffix = output_path.suffix.lower()
25+
26+
if suffix == ".csv":
27+
dataset.to_csv(output_path)
28+
elif suffix in {".json", ".jsonl"}:
29+
dataset.to_json(output_path)
30+
elif suffix == ".parquet":
31+
dataset.to_parquet(output_path)
32+
else:
33+
raise ValueError(
34+
f"Unsupported file suffix '{suffix}' in output_path'{output_path}'."
35+
f" Only {SUPPORTED_TYPES} are supported."
36+
)

tests/unit/preprocess/test_dataset.py

Lines changed: 6 additions & 86 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
import os
2-
from pathlib import Path
32
from typing import TYPE_CHECKING
43
from unittest.mock import MagicMock, patch
54

@@ -20,7 +19,6 @@
2019
handle_pad_strategy,
2120
process_dataset,
2221
push_dataset_to_hub,
23-
save_dataset_to_file,
2422
)
2523

2624

@@ -105,7 +103,7 @@ def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock):
105103
@pytest.mark.sanity
106104
def test_handle_pad_strategy(tokenizer_mock):
107105
result = handle_pad_strategy("short", 10, tokenizer_mock, "p")
108-
assert result == "shortppppp"
106+
assert result.startswith("shortppppp")
109107

110108

111109
@pytest.mark.sanity
@@ -122,11 +120,11 @@ def test_handle_error_strategy_too_short_prompt(tokenizer_mock):
122120

123121

124122
@pytest.mark.smoke
125-
@patch("guidellm.preprocess.dataset.save_dataset_to_file")
126-
@patch("guidellm.preprocess.dataset.Dataset")
127-
@patch("guidellm.preprocess.dataset.guidellm_load_dataset")
128-
@patch("guidellm.preprocess.dataset.check_load_processor")
129-
@patch("guidellm.preprocess.dataset.IntegerRangeSampler")
123+
@patch(f"{process_dataset.__module__}.save_dataset_to_file")
124+
@patch(f"{process_dataset.__module__}.Dataset")
125+
@patch(f"{process_dataset.__module__}.guidellm_load_dataset")
126+
@patch(f"{process_dataset.__module__}.check_load_processor")
127+
@patch(f"{process_dataset.__module__}.IntegerRangeSampler")
130128
def test_process_dataset_non_empty(
131129
mock_sampler,
132130
mock_check_processor,
@@ -291,81 +289,3 @@ def test_push_dataset_to_hub_error_no_id():
291289
push_dataset_to_hub(None, mock_dataset)
292290

293291

294-
@pytest.mark.regression
295-
@patch.object(Path, "mkdir")
296-
def test_save_dataset_to_file_csv(mock_mkdir):
297-
mock_dataset = MagicMock(spec=Dataset)
298-
output_path = Path("some/path/output.csv")
299-
save_dataset_to_file(mock_dataset, output_path)
300-
mock_dataset.to_csv.assert_called_once_with(output_path)
301-
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
302-
303-
304-
@pytest.mark.regression
305-
@patch.object(Path, "mkdir")
306-
def test_save_dataset_to_file_csv_capitalized(mock_mkdir):
307-
mock_dataset = MagicMock(spec=Dataset)
308-
output_path = Path("some/path/output.CSV")
309-
save_dataset_to_file(mock_dataset, output_path)
310-
mock_dataset.to_csv.assert_called_once_with(output_path)
311-
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
312-
313-
314-
@pytest.mark.regression
315-
@patch.object(Path, "mkdir")
316-
def test_save_dataset_to_file_json(mock_mkdir):
317-
mock_dataset = MagicMock(spec=Dataset)
318-
output_path = Path("some/path/output.json")
319-
save_dataset_to_file(mock_dataset, output_path)
320-
mock_dataset.to_json.assert_called_once_with(output_path)
321-
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
322-
323-
324-
@pytest.mark.regression
325-
@patch.object(Path, "mkdir")
326-
def test_save_dataset_to_file_json_capitalized(mock_mkdir):
327-
mock_dataset = MagicMock(spec=Dataset)
328-
output_path = Path("some/path/output.JSON")
329-
save_dataset_to_file(mock_dataset, output_path)
330-
mock_dataset.to_json.assert_called_once_with(output_path)
331-
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
332-
333-
334-
@pytest.mark.regression
335-
@patch.object(Path, "mkdir")
336-
def test_save_dataset_to_file_jsonl(mock_mkdir):
337-
mock_dataset = MagicMock(spec=Dataset)
338-
output_path = Path("some/path/output.jsonl")
339-
save_dataset_to_file(mock_dataset, output_path)
340-
mock_dataset.to_json.assert_called_once_with(output_path)
341-
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
342-
343-
344-
@pytest.mark.regression
345-
@patch.object(Path, "mkdir")
346-
def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir):
347-
mock_dataset = MagicMock(spec=Dataset)
348-
output_path = Path("some/path/output.JSONL")
349-
save_dataset_to_file(mock_dataset, output_path)
350-
mock_dataset.to_json.assert_called_once_with(output_path)
351-
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
352-
353-
354-
@pytest.mark.regression
355-
@patch.object(Path, "mkdir")
356-
def test_save_dataset_to_file_parquet(mock_mkdir):
357-
mock_dataset = MagicMock(spec=Dataset)
358-
output_path = Path("some/path/output.parquet")
359-
save_dataset_to_file(mock_dataset, output_path)
360-
mock_dataset.to_parquet.assert_called_once_with(output_path)
361-
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
362-
363-
364-
@pytest.mark.regression
365-
@patch.object(Path, "mkdir")
366-
def test_save_dataset_to_file_unsupported_type(mock_mkdir):
367-
mock_dataset = MagicMock(spec=Dataset)
368-
output_path = Path("some/path/output.txt")
369-
with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"):
370-
save_dataset_to_file(mock_dataset, output_path)
371-
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)

tests/unit/utils/__init__.py

Whitespace-only changes.

tests/unit/utils/test_hf_datasets.py

Lines changed: 87 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,87 @@
1+
from pathlib import Path
2+
from unittest.mock import patch, MagicMock
3+
4+
import pytest
5+
from datasets import Dataset
6+
7+
from guidellm.utils import save_dataset_to_file
8+
9+
10+
@pytest.mark.regression
11+
@patch.object(Path, "mkdir")
12+
def test_save_dataset_to_file_csv(mock_mkdir):
13+
mock_dataset = MagicMock(spec=Dataset)
14+
output_path = Path("some/path/output.csv")
15+
save_dataset_to_file(mock_dataset, output_path)
16+
mock_dataset.to_csv.assert_called_once_with(output_path)
17+
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
18+
19+
20+
@pytest.mark.regression
21+
@patch.object(Path, "mkdir")
22+
def test_save_dataset_to_file_csv_capitalized(mock_mkdir):
23+
mock_dataset = MagicMock(spec=Dataset)
24+
output_path = Path("some/path/output.CSV")
25+
save_dataset_to_file(mock_dataset, output_path)
26+
mock_dataset.to_csv.assert_called_once_with(output_path)
27+
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
28+
29+
30+
@pytest.mark.regression
31+
@patch.object(Path, "mkdir")
32+
def test_save_dataset_to_file_json(mock_mkdir):
33+
mock_dataset = MagicMock(spec=Dataset)
34+
output_path = Path("some/path/output.json")
35+
save_dataset_to_file(mock_dataset, output_path)
36+
mock_dataset.to_json.assert_called_once_with(output_path)
37+
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
38+
39+
40+
@pytest.mark.regression
41+
@patch.object(Path, "mkdir")
42+
def test_save_dataset_to_file_json_capitalized(mock_mkdir):
43+
mock_dataset = MagicMock(spec=Dataset)
44+
output_path = Path("some/path/output.JSON")
45+
save_dataset_to_file(mock_dataset, output_path)
46+
mock_dataset.to_json.assert_called_once_with(output_path)
47+
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
48+
49+
50+
@pytest.mark.regression
51+
@patch.object(Path, "mkdir")
52+
def test_save_dataset_to_file_jsonl(mock_mkdir):
53+
mock_dataset = MagicMock(spec=Dataset)
54+
output_path = Path("some/path/output.jsonl")
55+
save_dataset_to_file(mock_dataset, output_path)
56+
mock_dataset.to_json.assert_called_once_with(output_path)
57+
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
58+
59+
60+
@pytest.mark.regression
61+
@patch.object(Path, "mkdir")
62+
def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir):
63+
mock_dataset = MagicMock(spec=Dataset)
64+
output_path = Path("some/path/output.JSONL")
65+
save_dataset_to_file(mock_dataset, output_path)
66+
mock_dataset.to_json.assert_called_once_with(output_path)
67+
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
68+
69+
70+
@pytest.mark.regression
71+
@patch.object(Path, "mkdir")
72+
def test_save_dataset_to_file_parquet(mock_mkdir):
73+
mock_dataset = MagicMock(spec=Dataset)
74+
output_path = Path("some/path/output.parquet")
75+
save_dataset_to_file(mock_dataset, output_path)
76+
mock_dataset.to_parquet.assert_called_once_with(output_path)
77+
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)
78+
79+
80+
@pytest.mark.regression
81+
@patch.object(Path, "mkdir")
82+
def test_save_dataset_to_file_unsupported_type(mock_mkdir):
83+
mock_dataset = MagicMock(spec=Dataset)
84+
output_path = Path("some/path/output.txt")
85+
with pytest.raises(ValueError, match=r"Unsupported file suffix '.txt'.*"):
86+
save_dataset_to_file(mock_dataset, output_path)
87+
mock_mkdir.assert_called_once_with(parents=True, exist_ok=True)

0 commit comments

Comments
 (0)