
Feat: Using fsspec to download files #348

Merged

48 commits (changes shown from 5 commits)
96ec15d
fsspec basic setup done and working for s3
deependujha Sep 1, 2024
45b59ae
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 1, 2024
74dae21
fix storage option in fsspec
deependujha Sep 2, 2024
fcb4d95
pass down `storage_options` in dataset utilities
deependujha Sep 2, 2024
3080c2c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 2, 2024
c31e259
tested successfully on S3 and GS for (mode= none | append | overwrite…
deependujha Sep 3, 2024
0c761b1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 3, 2024
2377983
fixed mypy errors and lock files when uploading/downloading
deependujha Sep 4, 2024
ffbf51d
update
deependujha Sep 4, 2024
de8b83b
fixed test `test_try_create_cache_dir`
deependujha Sep 4, 2024
e712327
fixed test: `test_reader_chunk_removal`
deependujha Sep 4, 2024
e118ba9
all tests passed
deependujha Sep 4, 2024
d3450dc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 4, 2024
ed0fff8
update
deependujha Sep 4, 2024
08236e8
update
deependujha Sep 4, 2024
12b049b
boto3 stop bothering me
deependujha Sep 4, 2024
d560d91
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 4, 2024
27644d3
update
deependujha Sep 4, 2024
bf06cf9
update
deependujha Sep 4, 2024
bdc13f4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 4, 2024
909b5cb
update
deependujha Sep 4, 2024
f661ae1
Merge branch 'main' into feat/using-fsspec-to-download-files
deependujha Sep 4, 2024
5671d11
tested on azure and made sure `storage_option` is working in all cases
deependujha Sep 5, 2024
2beebc9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 5, 2024
dd2e742
update
deependujha Sep 5, 2024
f555069
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 5, 2024
87a9556
update
deependujha Sep 5, 2024
8e9d448
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 5, 2024
555eb19
use s5cmd to download files if available
deependujha Sep 5, 2024
5ef4004
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 5, 2024
67205ea
add default storage_options
deependujha Sep 6, 2024
69fb43d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 6, 2024
b49a126
raise error if cloud is not supported
deependujha Sep 6, 2024
dbe8b0e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 6, 2024
5a81f04
update
deependujha Sep 6, 2024
848484a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 6, 2024
4d62fdd
fix windows error related to urllib parse scheme
deependujha Sep 6, 2024
6b961f7
Merge branch 'main' into feat/using-fsspec-to-download-files
deependujha Sep 6, 2024
e544d09
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 6, 2024
31f6e5a
Merge branch 'main' into feat/using-fsspec-to-download-files
bhimrazy Sep 16, 2024
5d1ec46
Merge branch 'main' into feat/using-fsspec-to-download-files
bhimrazy Sep 17, 2024
e68076d
cleanup commented code
deependujha Sep 18, 2024
79a3ad8
Merge branch 'main' into feat/using-fsspec-to-download-files
deependujha Sep 18, 2024
2036e37
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2024
e230ceb
update
deependujha Sep 18, 2024
e60f9ae
readme updated
deependujha Sep 18, 2024
feb5d48
increase test_dataset_resume_on_future_chunk timeout time to 120 seconds
deependujha Sep 18, 2024
b5ec077
update
deependujha Sep 18, 2024
4 changes: 4 additions & 0 deletions requirements.txt
@@ -4,3 +4,7 @@ filelock
 numpy
 boto3
 requests
+fsspec
+fsspec[s3]   # aws s3
+fsspec[gs]   # google cloud storage
+fsspec[abfs] # azure blob
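
For context, these extras resolve to the provider-specific fsspec implementations (s3fs for fsspec[s3], gcsfs for fsspec[gs], adlfs for fsspec[abfs]). A minimal sketch of the fsspec API this PR builds on; the bucket name and credential values are placeholders, not values from this PR:

import fsspec

# "s3" is registered by s3fs, "gs" by gcsfs, and "abfs" by adlfs.
fs = fsspec.filesystem("s3", key="MY_ACCESS_KEY", secret="MY_SECRET_KEY")

# Copy a remote file to a local path (placeholder paths).
fs.get("s3://my-bucket/optimized_dataset/index.json", "/tmp/index.json")
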
6 changes: 4 additions & 2 deletions src/litdata/streaming/dataset.py
@@ -155,7 +155,8 @@ def set_epoch(self, current_epoch: int) -> None:
     def _create_cache(self, worker_env: _WorkerEnv) -> Cache:
         if _should_replace_path(self.input_dir.path):
             cache_path = _try_create_cache_dir(
-                input_dir=self.input_dir.path if self.input_dir.path else self.input_dir.url
+                input_dir=self.input_dir.path if self.input_dir.path else self.input_dir.url,
+                storage_options=self.storage_options,
             )
             if cache_path is not None:
                 self.input_dir.path = cache_path
@@ -438,7 +439,8 @@ def _validate_state_dict(self) -> None:
             # In this case, validate the cache folder is the same.
             if _should_replace_path(state["input_dir_path"]):
                 cache_path = _try_create_cache_dir(
-                    input_dir=state["input_dir_path"] if state["input_dir_path"] else state["input_dir_url"]
+                    input_dir=state["input_dir_path"] if state["input_dir_path"] else state["input_dir_url"],
+                    storage_options=self.storage_options,
                 )
                 if cache_path != self.input_dir.path:
                     raise ValueError(
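
With storage_options threaded through the dataset, credentials can be supplied at construction time. A hedged usage sketch; the bucket URI and credential values are placeholders:

from litdata import StreamingDataset

dataset = StreamingDataset(
    "s3://my-bucket/optimized_dataset",  # placeholder URI
    storage_options={"key": "MY_ACCESS_KEY", "secret": "MY_SECRET_KEY"},
)
sample = dataset[0]  # chunk downloads now go through fsspec with these options
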
46 changes: 42 additions & 4 deletions src/litdata/streaming/downloader.py
@@ -18,6 +18,7 @@
 from typing import Any, Dict, List, Optional
 from urllib import parse
 
+import fsspec
 from filelock import FileLock, Timeout
 
 from litdata.constants import _AZURE_STORAGE_AVAILABLE, _GOOGLE_STORAGE_AVAILABLE, _INDEX_FILENAME
@@ -26,12 +27,17 @@
 
 class Downloader(ABC):
     def __init__(
-        self, remote_dir: str, cache_dir: str, chunks: List[Dict[str, Any]], storage_options: Optional[Dict] = {}
+        self,
+        cloud_provider: str,
+        remote_dir: str,
+        cache_dir: str,
+        chunks: List[Dict[str, Any]],
+        storage_options: Optional[Dict] = {},
     ):
         self._remote_dir = remote_dir
         self._cache_dir = cache_dir
         self._chunks = chunks
         self._storage_options = storage_options or {}
+        self.fs = fsspec.filesystem(cloud_provider, **self._storage_options)  # normalized options, safe when None
 
     def download_chunk_from_index(self, chunk_index: int) -> None:
         chunk_filename = self._chunks[chunk_index]["filename"]
@@ -188,10 +194,42 @@ def download_file(self, remote_filepath: str, local_filepath: str) -> None:
 }
 
 
+_DOWNLOADERS = {
+    "s3://": "s3",
+    "gs://": "gs",
+    "azure://": "abfs",
+    "local:": "file",
+    "": "file",
+}
+
+
+class FsspecDownloader(Downloader):
+    def __init__(
+        self,
+        cloud_provider: str,
+        remote_dir: str,
+        cache_dir: str,
+        chunks: List[Dict[str, Any]],
+        storage_options: Optional[Dict] = {},
+    ):
+        remote_dir = remote_dir.replace("local:", "")
+        super().__init__(cloud_provider, remote_dir, cache_dir, chunks, storage_options)
+
+    def download_file(self, remote_filepath: str, local_filepath: str) -> None:
+        if os.path.exists(local_filepath) or remote_filepath == local_filepath:
+            return
+        try:
+            with FileLock(local_filepath + ".lock", timeout=3):
+                self.fs.get(remote_filepath, local_filepath, recursive=True)
+        except Timeout:
+            # Another process is already downloading this file; skip.
+            pass
+
+
 def get_downloader_cls(
     remote_dir: str, cache_dir: str, chunks: List[Dict[str, Any]], storage_options: Optional[Dict] = {}
 ) -> Downloader:
-    for k, cls in _DOWNLOADERS.items():
+    for k, fs_cloud_provider in _DOWNLOADERS.items():
         if str(remote_dir).startswith(k):
-            return cls(remote_dir, cache_dir, chunks, storage_options)
+            return FsspecDownloader(fs_cloud_provider, remote_dir, cache_dir, chunks, storage_options)
     raise ValueError(f"The provided `remote_dir` {remote_dir} doesn't have a downloader associated.")
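
To illustrate the prefix dispatch, a sketch of resolving a GCS directory to an FsspecDownloader; the bucket, cache path, and credential are placeholders (token is the gcsfs credential keyword), and gcsfs must be installed:

downloader = get_downloader_cls(
    "gs://my-bucket/optimized_dataset",  # "gs://" prefix maps to the fsspec "gs" filesystem
    "/cache/optimized_dataset",
    chunks=[],
    storage_options={"token": "service-account.json"},
)
downloader.download_file(
    "gs://my-bucket/optimized_dataset/chunk-0-0.bin",
    "/cache/optimized_dataset/chunk-0-0.bin",
)
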
12 changes: 7 additions & 5 deletions src/litdata/utilities/dataset_utilities.py
@@ -38,7 +38,9 @@ def subsample_streaming_dataset(
 
     # Make sure input_dir contains cache path and remote url
     if _should_replace_path(input_dir.path):
-        cache_path = _try_create_cache_dir(input_dir=input_dir.path if input_dir.path else input_dir.url)
+        cache_path = _try_create_cache_dir(
+            input_dir=input_dir.path if input_dir.path else input_dir.url, storage_options=storage_options
+        )
         if cache_path is not None:
             input_dir.path = cache_path
 
@@ -93,7 +95,7 @@ def _should_replace_path(path: Optional[str]) -> bool:
     return path.startswith("/teamspace/datasets/") or path.startswith("/teamspace/s3_connections/")
 
 
-def _read_updated_at(input_dir: Optional[Dir]) -> str:
+def _read_updated_at(input_dir: Optional[Dir], storage_options: Optional[Dict]) -> str:
     """Read last updated timestamp from index.json file."""
     last_updation_timestamp = "0"
     index_json_content = None
@@ -107,7 +109,7 @@ def _read_updated_at(input_dir: Optional[Dir]) -> str:
         # download index.json file and read last_updation_timestamp
         with tempfile.TemporaryDirectory() as tmp_directory:
             temp_index_filepath = os.path.join(tmp_directory, _INDEX_FILENAME)
-            downloader = get_downloader_cls(input_dir.url, tmp_directory, [])
+            downloader = get_downloader_cls(input_dir.url, tmp_directory, [], storage_options)
             downloader.download_file(os.path.join(input_dir.url, _INDEX_FILENAME), temp_index_filepath)
 
             index_json_content = load_index_file(tmp_directory)
@@ -132,9 +134,9 @@ def _clear_cache_dir_if_updated(input_dir_hash_filepath: str, updated_at_hash: s
     shutil.rmtree(input_dir_hash_filepath)
 
 
-def _try_create_cache_dir(input_dir: Optional[str]) -> Optional[str]:
+def _try_create_cache_dir(input_dir: Optional[str], storage_options: Optional[Dict]) -> Optional[str]:
     resolved_input_dir = _resolve_dir(input_dir)
-    updated_at = _read_updated_at(resolved_input_dir)
+    updated_at = _read_updated_at(resolved_input_dir, storage_options)
 
     if updated_at == "0" and input_dir is not None:
         updated_at = hashlib.md5(input_dir.encode()).hexdigest()  # noqa: S324
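
When the remote index.json timestamp cannot be read ("0"), _try_create_cache_dir falls back to hashing the input_dir string itself, so the same remote directory always maps to the same local cache directory. A small sketch of that fallback; the URI and cache layout are illustrative, not the PR's exact paths:

import hashlib
import os

input_dir = "s3://my-bucket/optimized_dataset"  # placeholder URI
updated_at = hashlib.md5(input_dir.encode()).hexdigest()  # noqa: S324
cache_dir = os.path.join("/cache/chunks", updated_at)  # stable key per remote dir
print(cache_dir)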