Commit a6331a4

Merge branch 'main' into workers-skip-speedup
2 parents 932fd2d + c16d00e commit a6331a4

File tree: 10 files changed (+154, −15 lines)

README.md

Lines changed: 25 additions & 1 deletion
@@ -265,14 +265,38 @@ dataset = ld.StreamingDataset("s3://my-bucket/my-data", storage_options=aws_stor
 gcp_storage_options={
     "project": os.environ['PROJECT_ID'],
 }
-dataset = ld.StreamingDataset("gcp://my-bucket/my-data", storage_options=gcp_storage_options)
+dataset = ld.StreamingDataset("gs://my-bucket/my-data", storage_options=gcp_storage_options)
 
 # Read data from Azure
 azure_storage_options={
     "account_url": f"https://{os.environ['AZURE_ACCOUNT_NAME']}.blob.core.windows.net",
     "credential": os.environ['AZURE_ACCOUNT_ACCESS_KEY']
 }
 dataset = ld.StreamingDataset("azure://my-bucket/my-data", storage_options=azure_storage_options)
+
+# Read data from Hugging Face
+hf_storage_options={
+    "use_auth_token": os.environ['HF_TOKEN']
+}
+dataset = StreamingDataset("hf://datasets/my-org/my-repo", storage_options=hf_storage_options)
+# Read from a nested directory
+dataset = StreamingDataset("hf://datasets/my-org/my-repo/dataset-1", storage_options=hf_storage_options)
+```
+
+### Upload Data to Hugging Face
+
+To upload data to Hugging Face, you can use the `huggingface-cli` command. Below is the command format:
+> For more information, checkout the [Hugging Face documentation](https://huggingface.co/docs/datasets/main/en/share#huggingface-cli-upload).
+
+```sh
+$ huggingface-cli upload [dataset_repo_id] [local_path] [path_in_repo] --repo-type dataset --token=[HF_TOKEN]
+# Example: Uploading to the root of the repository
+# huggingface-cli upload my-org/my-repo ./my-data --repo-type dataset --token=hf_****
+
+# Example: Uploading to a nested directory within the repository
+# huggingface-cli upload my-org/my-repo ./my-data dataset-1 --repo-type dataset --token=hf_****
+
+# Note: If already logged in, you can skip the token
 ```
 
 </details>
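Not part of this diff, but worth noting next to the CLI instructions above: the same upload can be done from Python with `huggingface_hub`'s `HfApi`. A minimal sketch, assuming the `my-org/my-repo` dataset repo already exists and `./my-data` holds the optimized chunks:

```python
# Hypothetical programmatic equivalent of the `huggingface-cli upload` command above.
from huggingface_hub import HfApi

api = HfApi(token="hf_****")  # or omit the token after `huggingface-cli login`
api.upload_folder(
    folder_path="./my-data",   # local directory with the optimized dataset
    repo_id="my-org/my-repo",  # assumed to be an existing dataset repo
    repo_type="dataset",
    path_in_repo="dataset-1",  # drop this argument to upload to the repo root
)
```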

requirements/test.txt

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 coverage ==7.5.3
 cryptography==42.0.8
+huggingface-hub==0.24.5
 mosaicml-streaming==0.8.0
 pytest ==8.3.*
 pytest-cov ==5.0.0

src/litdata/constants.py

Lines changed: 8 additions & 7 deletions
@@ -26,17 +26,18 @@
 _DEFAULT_CACHE_DIR = os.path.join(Path.home(), ".lightning", "chunks")
 
 # This is required for full pytree serialization / deserialization support
-_TORCH_GREATER_EQUAL_2_1_0 = RequirementCache("torch>=2.1.0")
-_VIZ_TRACKER_AVAILABLE = RequirementCache("viztracer")
-_LIGHTNING_CLOUD_AVAILABLE = RequirementCache("lightning-cloud")
+_AZURE_STORAGE_AVAILABLE = RequirementCache("azure.storage.blob")
 _BOTO3_AVAILABLE = RequirementCache("boto3")
-_TORCH_AUDIO_AVAILABLE = RequirementCache("torchaudio")
-_ZSTD_AVAILABLE = RequirementCache("zstd")
 _CRYPTOGRAPHY_AVAILABLE = RequirementCache("cryptography")
 _GOOGLE_STORAGE_AVAILABLE = RequirementCache("google.cloud.storage")
-_AZURE_STORAGE_AVAILABLE = RequirementCache("azure.storage.blob")
-_TQDM_AVAILABLE = RequirementCache("tqdm")
+_HUGGINGFACE_HUB_AVAILABLE = RequirementCache("huggingface-hub")
+_LIGHTNING_CLOUD_AVAILABLE = RequirementCache("lightning-cloud")
 _LIGHTNING_SDK_AVAILABLE = RequirementCache("lightning_sdk")
+_TORCH_AUDIO_AVAILABLE = RequirementCache("torchaudio")
+_TORCH_GREATER_EQUAL_2_1_0 = RequirementCache("torch>=2.1.0")
+_TQDM_AVAILABLE = RequirementCache("tqdm")
+_VIZ_TRACKER_AVAILABLE = RequirementCache("viztracer")
+_ZSTD_AVAILABLE = RequirementCache("zstd")
 
 # DON'T CHANGE ORDER
 _TORCH_DTYPES_MAPPING = {
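The block above is mostly an alphabetical reordering; the substantive change is the new `_HUGGINGFACE_HUB_AVAILABLE` flag. For readers unfamiliar with the pattern, a `RequirementCache` is truthy when the requirement is importable and its string form carries an install hint, which is how `HFDownloader` below guards the optional dependency. A minimal sketch, assuming `lightning_utilities` is installed (the library these flags come from):

```python
# Minimal sketch of the RequirementCache pattern used in constants.py.
from lightning_utilities.core.imports import RequirementCache

_HUGGINGFACE_HUB_AVAILABLE = RequirementCache("huggingface-hub")

if not _HUGGINGFACE_HUB_AVAILABLE:
    # str() of the cache yields a readable message, e.g. a pip install hint.
    raise ModuleNotFoundError(str(_HUGGINGFACE_HUB_AVAILABLE))
```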

src/litdata/streaming/dataset.py

Lines changed: 3 additions & 3 deletions
@@ -197,13 +197,13 @@ def set_num_workers(self, num_workers: int) -> None:
         self.num_workers = num_workers or 1
 
     def get_len(self, num_workers: int, batch_size: int) -> int:
-        self.num_workers = num_workers
-        self.batch_size = batch_size
+        self.set_num_workers(num_workers)
+        self.set_batch_size(batch_size)
         worker_env = _WorkerEnv.detect()
         if self.shuffler is None:
             cache = self._create_cache(worker_env=worker_env)
             self.shuffler = self._create_shuffler(cache)
-        return self.shuffler.get_len(self.distributed_env, num_workers, batch_size, self.current_epoch)
+        return self.shuffler.get_len(self.distributed_env, self.num_workers, self.batch_size, self.current_epoch)
 
     def __iter__(self) -> "StreamingDataset":
         # When the StreamingDataset is used within map or optimize, let's refetch the distributed env.
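The point of routing through the setters is visible in the context line above: `set_num_workers` normalizes a falsy value to 1, so `get_len` now sizes the dataset with the same worker count iteration will actually use. A rough sketch of the behaviour this enables (mirrored by the new `test_dataloader_no_workers` test further down):

```python
# Rough sketch; "./optimized-data" is a hypothetical local dataset directory.
from litdata import StreamingDataLoader, StreamingDataset

dataset = StreamingDataset("./optimized-data", shuffle=True)
dataloader = StreamingDataLoader(dataset)  # num_workers defaults to 0, normalized to 1 internally
assert len(dataloader) == len(dataset)     # lengths now agree without explicit workers
```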

src/litdata/streaming/downloader.py

Lines changed: 57 additions & 1 deletion
@@ -20,7 +20,12 @@
 
 from filelock import FileLock, Timeout
 
-from litdata.constants import _AZURE_STORAGE_AVAILABLE, _GOOGLE_STORAGE_AVAILABLE, _INDEX_FILENAME
+from litdata.constants import (
+    _AZURE_STORAGE_AVAILABLE,
+    _GOOGLE_STORAGE_AVAILABLE,
+    _HUGGINGFACE_HUB_AVAILABLE,
+    _INDEX_FILENAME,
+)
 from litdata.streaming.client import S3Client
 
 
@@ -164,6 +169,56 @@ def download_file(self, remote_filepath: str, local_filepath: str) -> None:
         pass
 
 
+class HFDownloader(Downloader):
+    def __init__(
+        self, remote_dir: str, cache_dir: str, chunks: List[Dict[str, Any]], storage_options: Optional[Dict] = {}
+    ):
+        if not _HUGGINGFACE_HUB_AVAILABLE:
+            raise ModuleNotFoundError(str(_HUGGINGFACE_HUB_AVAILABLE))
+
+        super().__init__(remote_dir, cache_dir, chunks, storage_options)
+
+    def download_file(self, remote_filepath: str, local_filepath: str) -> None:
+        """Download a file from the Hugging Face Hub.
+
+        The remote_filepath should be in the format `hf://<repo_type>/<repo_org>/<repo_name>/path`. For more
+        information, see
+        https://huggingface.co/docs/huggingface_hub/en/guides/hf_file_system#integrations.
+
+        """
+        from huggingface_hub import hf_hub_download
+
+        obj = parse.urlparse(remote_filepath)
+
+        if obj.scheme != "hf":
+            raise ValueError(f"Expected obj.scheme to be `hf`, instead, got {obj.scheme} for remote={remote_filepath}")
+
+        if os.path.exists(local_filepath):
+            return
+
+        try:
+            with FileLock(local_filepath + ".lock", timeout=3 if obj.path.endswith(_INDEX_FILENAME) else 0):
+                # Adapted from https://github.com/mosaicml/streaming/blob/main/streaming/base/storage/download.py#L292
+                # expected URL format: hf://datasets/<repo_org>/<repo_name>/path
+                _, _, _, repo_org, repo_name, path = remote_filepath.split("/", 5)
+                downloaded_path = hf_hub_download(
+                    repo_id=f"{repo_org}/{repo_name}",
+                    filename=path,
+                    local_dir=self._cache_dir,
+                    repo_type="dataset",
+                    **self._storage_options,
+                )
+
+                # Move the downloaded file to the expected location if it's not already there.
+                if downloaded_path != local_filepath and os.path.exists(downloaded_path):
+                    os.rename(downloaded_path, local_filepath)
+                    os.rmdir(os.path.dirname(downloaded_path))
+
+        except Timeout:
+            # another process is responsible to download that file, continue
+            pass
+
+
 class LocalDownloader(Downloader):
     def download_file(self, remote_filepath: str, local_filepath: str) -> None:
         if not os.path.exists(remote_filepath):
 
@@ -183,6 +238,7 @@ def download_file(self, remote_filepath: str, local_filepath: str) -> None:
     "s3://": S3Downloader,
     "gs://": GCPDownloader,
     "azure://": AzureDownloader,
+    "hf://": HFDownloader,
     "local:": LocalDownloaderWithCache,
     "": LocalDownloader,
 }
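To make the URL handling in `HFDownloader.download_file` concrete, here is what the `split("/", 5)` produces for a hypothetical chunk file (the file names are made up):

```python
# Hypothetical URLs, parsed exactly as in HFDownloader.download_file above.
remote_filepath = "hf://datasets/my-org/my-repo/chunk-0-0.bin"
_, _, _, repo_org, repo_name, path = remote_filepath.split("/", 5)
assert (repo_org, repo_name, path) == ("my-org", "my-repo", "chunk-0-0.bin")

# A nested dataset keeps the sub-path in `path`, which hf_hub_download accepts as `filename`.
nested = "hf://datasets/my-org/my-repo/dataset-1/chunk-0-0.bin"
assert nested.split("/", 5)[-1] == "dataset-1/chunk-0-0.bin"
```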

src/litdata/streaming/resolver.py

Lines changed: 2 additions & 1 deletion
@@ -52,7 +52,8 @@ def _resolve_dir(dir_path: Optional[Union[str, Dir]]) -> Dir:
 
     assert isinstance(dir_path, str)
 
-    if dir_path.startswith("s3://") or dir_path.startswith("gs://") or dir_path.startswith("azure://"):
+    cloud_prefixes = ("s3://", "gs://", "azure://", "hf://")
+    if dir_path.startswith(cloud_prefixes):
         return Dir(path=None, url=dir_path)
 
     if dir_path.startswith("local:"):
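The two-line version works because `str.startswith` accepts a tuple of prefixes, so a single call replaces the chained `or` checks:

```python
# str.startswith with a tuple returns True if any prefix matches.
cloud_prefixes = ("s3://", "gs://", "azure://", "hf://")
assert "hf://datasets/my-org/my-repo".startswith(cloud_prefixes)
assert not "local:/cache/my-data".startswith(cloud_prefixes)
```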

tests/conftest.py

Lines changed: 10 additions & 0 deletions
@@ -64,6 +64,16 @@ def azure_mock(monkeypatch):
     return azure
 
 
+@pytest.fixture()
+def huggingface_mock(monkeypatch):
+    huggingface_hub = ModuleType("huggingface_hub")
+    monkeypatch.setitem(sys.modules, "huggingface_hub", huggingface_hub)
+    hf_hub_download = ModuleType("hf_hub_download")
+    monkeypatch.setitem(sys.modules, "huggingface_hub.hf_hub_download", hf_hub_download)
+    huggingface_hub.hf_hub_download = hf_hub_download
+    return huggingface_hub
+
+
 @pytest.fixture()
 def lightning_cloud_mock(monkeypatch):
     lightning_cloud = ModuleType("lightning_cloud")

tests/processing/test_functions.py

Lines changed: 5 additions & 1 deletion
@@ -176,7 +176,7 @@ def test_optimize_append_overwrite(tmpdir):
     assert ds[:] == [(i, i**2, i**3) for i in range(0, 5)]
 
 
-@pytest.mark.skipif(sys.platform == "win32" or sys.platform == "darwin", reason="too slow")
+@pytest.mark.skipif(sys.platform == "win32", reason="too slow")
 def test_optimize_checkpoint_in_none_and_append_mode(tmpdir):
     output_dir = str(tmpdir / "output_dir")
 
@@ -188,6 +188,7 @@ def test_optimize_checkpoint_in_none_and_append_mode(tmpdir):
         chunk_size=1,
         num_workers=2,
         use_checkpoint=True,
+        start_method="fork",
     )
 
     # check that the checkpoints are created
 
@@ -201,6 +202,7 @@ def test_optimize_checkpoint_in_none_and_append_mode(tmpdir):
         chunk_size=1,
         num_workers=2,
         use_checkpoint=True,
+        start_method="fork",
     )
 
     ds = StreamingDataset(output_dir)
 
@@ -221,6 +223,7 @@ def test_optimize_checkpoint_in_none_and_append_mode(tmpdir):
         num_workers=2,
         use_checkpoint=True,
         mode="append",
+        start_method="fork",
     )
 
     # check that the checkpoints are created
 
@@ -240,6 +243,7 @@ def test_optimize_checkpoint_in_none_and_append_mode(tmpdir):
         num_workers=2,
         use_checkpoint=True,
         mode="append",
+        start_method="fork",
    )
 
     ds = StreamingDataset(output_dir)
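The `start_method="fork"` additions (and dropping the macOS skip) presumably rely on forked workers starting much faster than spawned ones, since spawn launches a fresh interpreter and re-imports the test module in every worker process. Fork is unavailable on Windows, which is why the `win32` skip stays. A small illustration of what each platform offers:

```python
import multiprocessing as mp

# On Linux this typically prints ['fork', 'spawn', 'forkserver'];
# on macOS 'fork' is still available even though the default has been 'spawn' since Python 3.8;
# on Windows only 'spawn' is supported.
print(mp.get_all_start_methods())
```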

tests/streaming/test_dataloader.py

Lines changed: 16 additions & 1 deletion
@@ -3,7 +3,7 @@
 import pytest
 import torch
 from litdata.constants import _VIZ_TRACKER_AVAILABLE
-from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader
+from litdata.streaming import Cache, CombinedStreamingDataset, StreamingDataLoader, StreamingDataset
 from litdata.streaming import dataloader as streaming_dataloader_module
 from torch import tensor
 
@@ -187,3 +187,18 @@ def test_custom_collate_multiworker():
 
     # Try calling the state_dict. No error should follow
     _state_dict = dataloader.state_dict()
+
+
+def test_dataloader_no_workers(tmpdir):
+    cache = Cache(input_dir=str(tmpdir), chunk_bytes="64MB")
+    for i in range(1000):
+        cache[i] = i
+
+    cache.done()
+    cache.merge()
+
+    dataset = StreamingDataset(str(tmpdir), shuffle=True)
+    dataloader = StreamingDataLoader(dataset)
+    assert len(dataset) == 1000
+    assert len(dataloader) == 1000
+    assert len(dataset) == 1000

tests/streaming/test_downloader.py

Lines changed: 27 additions & 0 deletions
@@ -1,10 +1,12 @@
+import contextlib
 import os
 from unittest import mock
 from unittest.mock import MagicMock
 
 from litdata.streaming.downloader import (
     AzureDownloader,
     GCPDownloader,
+    HFDownloader,
     LocalDownloaderWithCache,
     S3Downloader,
     shutil,
 
@@ -72,6 +74,31 @@ def test_azure_downloader(tmpdir, monkeypatch, azure_mock):
     mock_blob_data.readinto.assert_called()
 
 
+@mock.patch("litdata.streaming.downloader._HUGGINGFACE_HUB_AVAILABLE", True)
+def test_hf_downloader(tmpdir, monkeypatch, huggingface_mock):
+    mock_hf_hub_download = MagicMock()
+    huggingface_mock.hf_hub_download = mock_hf_hub_download
+
+    # Initialize the downloader
+    storage_options = {}
+    downloader = HFDownloader("hf://datasets/random_org/random_repo", tmpdir, [], storage_options)
+    local_filepath = os.path.join(tmpdir, "a.txt")
+
+    # ignore filenotfound error for this test TODO: write a better test
+    with contextlib.suppress(FileNotFoundError):
+        downloader.download_file("hf://datasets/random_org/random_repo/a.txt", local_filepath)
+        # Assert that the correct methods were called
+        huggingface_mock.hf_hub_download.assert_called_once()
+        huggingface_mock.hf_hub_download.assert_called_with(
+            repo_id="random_org/random_repo", filename="a.txt", local_dir=tmpdir, repo_type="dataset"
+        )
+
+    # Test that the file is not downloaded if it already exists
+    with contextlib.suppress(FileNotFoundError):
+        downloader.download_file("hf://datasets/random_org/random_repo/a.txt", local_filepath)
+        huggingface_mock.hf_hub_download.assert_not_called()
+
+
 def test_download_with_cache(tmpdir, monkeypatch):
     # Create a file to download/cache
     with open("a.txt", "w") as f:
