Skip to content

Commit e6353e2

Browse files
committed
PR feedback
1 parent 534ce4f commit e6353e2

File tree

4 files changed

+10
-10
lines changed

4 files changed

+10
-10
lines changed

src/oumi/builders/data.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
)
2525
from oumi.datasets.trl_dpo_preprocessor import trl_dpo_chat_preprocessor_fn
2626
from oumi.datasets.ultrachat_200k import trl_sft_ultrachat_200k_preprocessor_fn
27-
from oumi.utils.hf_datasets_utils import is_disk_cached_hf_dataset
27+
from oumi.utils.hf_datasets_utils import is_cached_to_disk_hf_dataset
2828
from oumi.utils.logging import logger
2929

3030
DatasetType = TypeVar("DatasetType", datasets.Dataset, datasets.IterableDataset)
@@ -371,7 +371,7 @@ def _load_dataset(
371371
return dataset.to_hf()
372372

373373
dataset_name_or_path: Path = Path(dataset_params.dataset_name)
374-
if is_disk_cached_hf_dataset(dataset_name_or_path):
374+
if is_cached_to_disk_hf_dataset(dataset_name_or_path):
375375
return datasets.Dataset.load_from_disk(dataset_name_or_path)
376376
else:
377377
return datasets.load_dataset(

src/oumi/core/datasets/base_dataset.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from oumi.core.tokenizers import BaseTokenizer
1111
from oumi.core.types.turn import Conversation
12-
from oumi.utils.hf_datasets_utils import is_disk_cached_hf_dataset
12+
from oumi.utils.hf_datasets_utils import is_cached_to_disk_hf_dataset
1313
from oumi.utils.logging import logger
1414

1515

@@ -135,7 +135,7 @@ def _load_data(self) -> pd.DataFrame:
135135
and dataset_path.is_file()
136136
):
137137
result = self._load_parquet_dataset(self.dataset_name_or_path)
138-
elif is_disk_cached_hf_dataset(self.dataset_name_or_path):
138+
elif is_cached_to_disk_hf_dataset(self.dataset_name_or_path):
139139
result = self._load_dataset_from_disk(self.dataset_name_or_path)
140140
else:
141141
raise ValueError(

src/oumi/utils/hf_datasets_utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from oumi.utils.logging import logger
55

66

7-
def is_disk_cached_hf_dataset(dataset_name_or_path: Union[str, Path]) -> bool:
7+
def is_cached_to_disk_hf_dataset(dataset_name_or_path: Union[str, Path]) -> bool:
88
"""Detects whether a dataset was saved using `dataset.save_to_disk()`.
99
1010
Such datasets should be loaded using `datasets.Dataset.load_from_disk()`

tests/utils/test_hf_datasets_utils.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import datasets
55

6-
from oumi.utils.hf_datasets_utils import is_disk_cached_hf_dataset
6+
from oumi.utils.hf_datasets_utils import is_cached_to_disk_hf_dataset
77

88

99
def test_is_saved_to_disk_hf_dataset():
@@ -12,16 +12,16 @@ def test_is_saved_to_disk_hf_dataset():
1212
{"pokemon": ["bulbasaur", "squirtle"], "type": ["grass", "water"]}
1313
)
1414
ds_dir = Path(output_temp_dir) / "toy_dataset"
15-
assert not is_disk_cached_hf_dataset(ds_dir)
15+
assert not is_cached_to_disk_hf_dataset(ds_dir)
1616

1717
ds_dir.mkdir(parents=True, exist_ok=True)
18-
assert not is_disk_cached_hf_dataset(ds_dir)
18+
assert not is_cached_to_disk_hf_dataset(ds_dir)
1919

2020
ds.save_to_disk(ds_dir, num_shards=2)
21-
assert is_disk_cached_hf_dataset(ds_dir)
21+
assert is_cached_to_disk_hf_dataset(ds_dir)
2222

2323
for filename in ("dataset_info.json", "state.json"):
2424
sub_path: Path = Path(ds_dir) / filename
2525
assert sub_path.exists() and sub_path.is_file()
2626
sub_path.unlink()
27-
assert not is_disk_cached_hf_dataset(ds_dir)
27+
assert not is_cached_to_disk_hf_dataset(ds_dir)

0 commit comments

Comments
 (0)