Skip to content

Commit b05cb72

Browse files
authored
Merge branch 'main' into jgreer013/async_inference_writes
2 parents 283895a + 09ab5ab commit b05cb72

File tree

5 files changed

+90
-13
lines changed

5 files changed

+90
-13
lines changed

src/oumi/builders/data.py

+14-8
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import copy
22
import warnings
3+
from pathlib import Path
34
from typing import Callable, List, Optional, Sequence, TypeVar, Union, cast
45

56
import datasets
@@ -23,6 +24,7 @@
2324
)
2425
from oumi.datasets.trl_dpo_preprocessor import trl_dpo_chat_preprocessor_fn
2526
from oumi.datasets.ultrachat_200k import trl_sft_ultrachat_200k_preprocessor_fn
27+
from oumi.utils.hf_datasets_utils import is_cached_to_disk_hf_dataset
2628
from oumi.utils.logging import logger
2729

2830
DatasetType = TypeVar("DatasetType", datasets.Dataset, datasets.IterableDataset)
@@ -368,11 +370,15 @@ def _load_dataset(
368370
)
369371
return dataset.to_hf()
370372

371-
return datasets.load_dataset(
372-
dataset_params.dataset_name,
373-
name=dataset_params.subset,
374-
split=dataset_params.split,
375-
streaming=stream,
376-
trust_remote_code=dataset_params.trust_remote_code,
377-
**dataset_params.dataset_kwargs,
378-
)
373+
dataset_name_or_path: Path = Path(dataset_params.dataset_name)
374+
if is_cached_to_disk_hf_dataset(dataset_name_or_path):
375+
return datasets.Dataset.load_from_disk(dataset_name_or_path)
376+
else:
377+
return datasets.load_dataset(
378+
dataset_params.dataset_name,
379+
name=dataset_params.subset,
380+
split=dataset_params.split,
381+
streaming=stream,
382+
trust_remote_code=dataset_params.trust_remote_code,
383+
**dataset_params.dataset_kwargs,
384+
)

src/oumi/core/datasets/base_dataset.py

+17-4
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
import gc
2-
import os
32
from abc import ABC, abstractmethod
3+
from pathlib import Path
44
from typing import Literal, Optional, Union, cast
55

66
import datasets
@@ -9,6 +9,7 @@
99

1010
from oumi.core.tokenizers import BaseTokenizer
1111
from oumi.core.types.turn import Conversation
12+
from oumi.utils.hf_datasets_utils import is_cached_to_disk_hf_dataset
1213
from oumi.utils.logging import logger
1314

1415

@@ -125,11 +126,17 @@ def _load_data(self) -> pd.DataFrame:
125126
Returns:
126127
dict: The loaded dataset.
127128
"""
128-
if os.path.exists(self.dataset_name_or_path):
129-
if self.dataset_name_or_path.endswith(".jsonl"):
129+
dataset_path = Path(self.dataset_name_or_path)
130+
if dataset_path.exists():
131+
if self.dataset_name_or_path.endswith(".jsonl") and dataset_path.is_file():
130132
result = self._load_jsonl_dataset(self.dataset_name_or_path)
131-
elif self.dataset_name_or_path.endswith(".parquet"):
133+
elif (
134+
self.dataset_name_or_path.endswith(".parquet")
135+
and dataset_path.is_file()
136+
):
132137
result = self._load_parquet_dataset(self.dataset_name_or_path)
138+
elif is_cached_to_disk_hf_dataset(self.dataset_name_or_path):
139+
result = self._load_dataset_from_disk(self.dataset_name_or_path)
133140
else:
134141
raise ValueError(
135142
f"File format not supported for {self.dataset_name_or_path}"
@@ -202,6 +209,12 @@ def _load_jsonl_dataset(self, path: str) -> pd.DataFrame:
202209
def _load_parquet_dataset(self, path: str) -> pd.DataFrame:
203210
return pd.read_parquet(path)
204211

212+
def _load_dataset_from_disk(self, path: str) -> pd.DataFrame:
213+
dataset: datasets.Dataset = datasets.Dataset.load_from_disk(path)
214+
result = dataset.to_pandas()
215+
del dataset
216+
return cast(pd.DataFrame, result)
217+
205218

206219
class BaseLMSftDataset(BaseMapDataset, ABC):
207220
"""In-memory dataset for SFT data.

src/oumi/utils/hf_datasets_utils.py

+31
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,31 @@
1+
from pathlib import Path
2+
from typing import Union
3+
4+
from oumi.utils.logging import logger
5+
6+
7+
def is_cached_to_disk_hf_dataset(dataset_name_or_path: Union[str, Path]) -> bool:
8+
"""Detects whether a dataset was saved using `dataset.save_to_disk()`.
9+
10+
Such datasets should be loaded using `datasets.Dataset.load_from_disk()`.
11+
12+
Returns:
13+
Whether the dataset was saved using `dataset.save_to_disk()` method.
14+
"""
15+
if not dataset_name_or_path:
16+
return False
17+
18+
dataset_path: Path = Path(dataset_name_or_path)
19+
20+
if dataset_path.exists() and dataset_path.is_dir():
21+
for file_name in ("dataset_info.json", "state.json"):
22+
file_path: Path = dataset_path / file_name
23+
if not (file_path.exists() and file_path.is_file()):
24+
logger.warning(
25+
f"The dataset {str(dataset_path)} is missing "
26+
f"a required file: {file_name}."
27+
)
28+
return False
29+
return True
30+
31+
return False

tests/utils/test_debugging_utils.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ def test_nvidia_gpu_memory_utilization():
2222
if num_devices > 0:
2323
for device_index in range(0, num_devices):
2424
memory_mib = get_nvidia_gpu_memory_utilization(device_index)
25-
assert memory_mib > 1024 # Must have at least 1 GB
25+
assert memory_mib > 1 # Must have at least 1 MB
2626
assert memory_mib < 1024 * 1024 # No known GPU has 1 TB of VRAM yet.
2727
log_nvidia_gpu_memory_utilization(device_index)
2828

tests/utils/test_hf_datasets_utils.py

+27
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,27 @@
1+
import tempfile
2+
from pathlib import Path
3+
4+
import datasets
5+
6+
from oumi.utils.hf_datasets_utils import is_cached_to_disk_hf_dataset
7+
8+
9+
def test_is_saved_to_disk_hf_dataset():
10+
with tempfile.TemporaryDirectory() as output_temp_dir:
11+
ds = datasets.Dataset.from_dict(
12+
{"pokemon": ["bulbasaur", "squirtle"], "type": ["grass", "water"]}
13+
)
14+
ds_dir = Path(output_temp_dir) / "toy_dataset"
15+
assert not is_cached_to_disk_hf_dataset(ds_dir)
16+
17+
ds_dir.mkdir(parents=True, exist_ok=True)
18+
assert not is_cached_to_disk_hf_dataset(ds_dir)
19+
20+
ds.save_to_disk(ds_dir, num_shards=2)
21+
assert is_cached_to_disk_hf_dataset(ds_dir)
22+
23+
for filename in ("dataset_info.json", "state.json"):
24+
sub_path: Path = Path(ds_dir) / filename
25+
assert sub_path.exists() and sub_path.is_file()
26+
sub_path.unlink()
27+
assert not is_cached_to_disk_hf_dataset(ds_dir)

0 commit comments

Comments (0)