
Commit f03ddb7

feat: fast random access for streamingDataset without chunk downloading (#631)
* fast random access for s3 works
* no_chunk_download supports gcloud
* update
* update
* update
* nitpick
* tests
* update
* update
* update
* update
* update
* rename `no_store` to `on_demand_bytes`
* make sure client exists
* fallback for `download_bytes` method
1 parent 82bf020 commit f03ddb7
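
For context, a minimal usage sketch of what this commit enables (the dataset URI is a placeholder; only the `on_demand_bytes` behaviour itself comes from this diff):

from litdata import StreamingDataset

ds = StreamingDataset("s3://my-bucket/optimized-dataset")  # placeholder location

# Random access: `on_demand_bytes` defaults to True, so only the requested
# sample's byte range is fetched from the remote chunk instead of the whole file.
sample = ds[42]

# Iteration: `__iter__` switches `on_demand_bytes` off so full chunks are
# downloaded and cached, which is the better trade-off for sequential reads.
for sample in ds:
    break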

File tree

8 files changed: +246, -33 lines


src/litdata/streaming/cache.py

Lines changed: 3 additions & 0 deletions

@@ -50,6 +50,7 @@ def __init__(
         session_options: Optional[Dict] = {},
         max_pre_download: int = 2,
         msg_queue: Optional[Queue] = None,
+        on_demand_bytes: bool = False,
     ):
         """The Cache enables to optimise dataset format for cloud training. This is done by grouping several elements
         together in order to accelerate fetching.
@@ -70,6 +71,7 @@ def __init__(
             session_options: Additional options for the S3 session.
             max_pre_download: Maximum number of chunks that can be pre-downloaded while filling up the cache.
             msg_queue: Optional message queue to send messages to the main process.
+            on_demand_bytes: If True, fetch only the requested sample's bytes instead of downloading the entire chunk.

         """
         super().__init__()
@@ -100,6 +102,7 @@ def __init__(
             storage_options=storage_options,
             session_options=session_options,
             max_pre_download=max_pre_download,
+            on_demand_bytes=on_demand_bytes,
         )
         self._is_done = False
         self._distributed_env = _DistributedEnv.detect()

src/litdata/streaming/config.py

Lines changed: 22 additions & 0 deletions

@@ -151,6 +151,28 @@ def download_chunk_from_index(self, chunk_index: int, skip_lock: bool = False) -

         self.try_decompress(local_chunkpath)

+    def download_chunk_bytes_from_index(self, chunk_index: int, offset: int, length: int) -> bytes:
+        assert self._chunks is not None
+        chunk_filename = self._chunks[chunk_index]["filename"]
+
+        local_chunkpath = os.path.join(self._cache_dir, chunk_filename)
+
+        if os.path.exists(local_chunkpath):
+            with open(local_chunkpath, "rb") as f:
+                f.seek(offset)
+                return f.read(length)
+
+        if self._compressor is not None:
+            raise ValueError(
+                "The `download_chunk_bytes_from_index` method is not supported for compressed chunks. "
+                "Please, use `download_chunk_from_index` instead."
+            )
+
+        if self._downloader is None:
+            raise RuntimeError("The downloader is not initialized. Please, initialize it before downloading chunks.")
+
+        return self._downloader.download_chunk_bytes_from_index(chunk_index, offset, length)
+
     def try_decompress(self, local_chunkpath: str) -> None:
         if self._compressor is None:
             return
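
The control flow above is: serve the range from the local copy when the chunk is already cached, refuse compressed chunks (a byte slice of a compressed chunk is generally not decodable on its own), and otherwise hand the request to the downloader. A standalone sketch of the same pattern, with hypothetical names and assuming nothing beyond this diff:

import os
from typing import Callable

def read_item_bytes(
    local_path: str,
    offset: int,
    length: int,
    fetch_range: Callable[[int, int], bytes],  # e.g. a ranged S3/GCS/HTTP GET
    compressed: bool = False,
) -> bytes:
    """Hypothetical helper mirroring the local-first / remote-range-fallback logic."""
    if os.path.exists(local_path):
        # Chunk already cached locally: read the slice straight from disk.
        with open(local_path, "rb") as f:
            f.seek(offset)
            return f.read(length)
    if compressed:
        raise ValueError("Byte-range reads are not supported for compressed chunks.")
    # Otherwise ask the backend for exactly this byte range.
    return fetch_range(offset, length)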

src/litdata/streaming/dataset.py

Lines changed: 19 additions & 1 deletion

@@ -201,6 +201,19 @@ def __init__(
         if not callable(transform):
             raise ValueError(f"Transform should be a callable. Found {transform}")
         self.transform = transform
+        self._on_demand_bytes = True  # true by default, when iterating, turn this off to store the chunks in the cache
+
+    @property
+    def on_demand_bytes(self) -> bool:
+        return self._on_demand_bytes
+
+    @on_demand_bytes.setter
+    def on_demand_bytes(self, value: bool) -> None:
+        if not isinstance(value, bool):
+            raise ValueError(f"on_demand_bytes should be a boolean. Found {value}")
+        self._on_demand_bytes = value
+        assert self.cache is not None, "Cache must be initialized before setting on_demand_bytes."
+        self.cache._reader.on_demand_bytes = value

     def set_shuffle(self, shuffle: bool) -> None:
         self.shuffle = shuffle
@@ -240,6 +253,7 @@ def _create_cache(self, worker_env: _WorkerEnv) -> Cache:
             storage_options=self.storage_options,
             session_options=self.session_options,
             max_pre_download=self.max_pre_download,
+            on_demand_bytes=self._on_demand_bytes,
         )
         cache._reader._try_load_config()

@@ -287,6 +301,7 @@ def __iter__(self) -> "StreamingDataset":
         self.worker_env = _WorkerEnv.detect()
         self.cache = self._create_cache(worker_env=self.worker_env)
         self.shuffler = self._create_shuffler(self.cache)
+        self.on_demand_bytes = False  # reset on_demand_bytes to False, and store chunks in the cache

         # Handle restart
         if self._state_dict:
@@ -402,14 +417,15 @@ def _resume(self, workers_chunks: List[List[int]], workers_intervals: List[Any])
         # bump the chunk_index
         self.worker_next_chunk_index += 1

-    def __getitem__(self, index: Union[ChunkedIndex, int]) -> Any:
+    def __getitem__(self, index: Union[ChunkedIndex, int, slice]) -> Any:
         if self.cache is None:
             self.worker_env = _WorkerEnv.detect()
             self.cache = self._create_cache(worker_env=self.worker_env)
             self.shuffler = self._create_shuffler(self.cache)
         if isinstance(index, int):
             index = ChunkedIndex(*self.cache._get_chunk_index_from_index(index))
         elif isinstance(index, slice):
+            self.on_demand_bytes = False  # for slices, we always want to store the chunks
             start, stop, step = index.indices(len(self))
             _my_indices = list(range(start, stop, step))
             _my_cache_indices = [ChunkedIndex(*self.cache._get_chunk_index_from_index(idx)) for idx in _my_indices]
@@ -436,6 +452,7 @@ def __next__(self) -> Any:
             self.current_epoch += 1
             self.reset_state_dict()
             logger.debug(_get_log_msg({"name": "iterating_dataset", "ph": "E"}))
+            self.on_demand_bytes = True  # reset on_demand_bytes to True
             raise StopIteration

         # Lazily re-populate the interval to reduce memory usage.
@@ -444,6 +461,7 @@ def __next__(self) -> Any:
         if self.num_chunks is not None and self.worker_next_chunk_index >= self.num_chunks:
             self.current_epoch += 1
             self.reset_state_dict()
+            self.on_demand_bytes = True  # reset on_demand_bytes to True
             raise StopIteration

         # if upcoming_indexes is empty, means either:
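
In practice the flag is toggled automatically (off during iteration and for slices, back on when an epoch ends), but the new property also allows setting it explicitly once a cache exists; a short hedged sketch with a placeholder URI:

from litdata import StreamingDataset

ds = StreamingDataset("s3://my-bucket/optimized-dataset")  # placeholder location

sample = ds[0]              # creates the cache lazily; only this sample's bytes are fetched
ds.on_demand_bytes = False  # the setter forwards the flag to the underlying reader
window = ds[10:20]          # slices force on_demand_bytes off and cache whole chunks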

src/litdata/streaming/downloader.py

Lines changed: 57 additions & 0 deletions

@@ -76,9 +76,28 @@ def download_chunk_from_index(self, chunk_index: int) -> None:

         logger.debug(_get_log_msg({"name": f"download_chunk_from_index_{chunk_index}", "ph": "E"}))

+    def download_chunk_bytes_from_index(self, chunk_index: int, offset: int, length: int) -> bytes:
+        chunk_filename = self._chunks[chunk_index]["filename"]
+        local_chunkpath = os.path.join(self._cache_dir, chunk_filename)
+        remote_chunkpath = os.path.join(self._remote_dir, chunk_filename)
+
+        return self.download_bytes(remote_chunkpath, offset, length, local_chunkpath)
+
     def download_file(self, remote_chunkpath: str, local_chunkpath: str) -> None:
         pass

+    def download_bytes(self, remote_chunkpath: str, offset: int, length: int, local_chunkpath: str) -> bytes:
+        """Download a specific range of bytes from the remote file.
+
+        If this method is not overridden in a subclass, it defaults to downloading the full file
+        by calling `download_file` and then reading the desired byte range from the local copy.
+        """
+        self.download_file(remote_chunkpath, local_chunkpath)
+        # read the specified byte range from the local file
+        with open(local_chunkpath, "rb") as f:
+            f.seek(offset)
+            return f.read(length)
+

 class S3Downloader(Downloader):
     def __init__(
@@ -165,6 +184,24 @@ def download_file(self, remote_filepath: str, local_filepath: str) -> None:
             Config=TransferConfig(use_threads=False),
         )

+    def download_bytes(self, remote_filepath: str, offset: int, length: int, local_chunkpath: str) -> bytes:
+        obj = parse.urlparse(remote_filepath)
+
+        if obj.scheme != "s3":
+            raise ValueError(f"Expected obj.scheme to be `s3`, instead, got {obj.scheme} for remote={remote_filepath}")
+
+        if not hasattr(self, "client"):
+            self._client = S3Client(storage_options=self._storage_options, session_options=self.session_options)
+
+        bucket = obj.netloc
+        key = obj.path.lstrip("/")
+
+        byte_range = f"bytes={offset}-{offset + length - 1}"
+
+        response = self._client.client.get_object(Bucket=bucket, Key=key, Range=byte_range)
+
+        return response["Body"].read()
+

 class GCPDownloader(Downloader):
     def __init__(
@@ -208,6 +245,26 @@ def download_file(self, remote_filepath: str, local_filepath: str) -> None:
         blob = bucket.blob(key)
         blob.download_to_filename(local_filepath)

+    def download_bytes(self, remote_filepath: str, offset: int, length: int, local_chunkpath: str) -> bytes:
+        from google.cloud import storage
+
+        obj = parse.urlparse(remote_filepath)
+
+        if obj.scheme != "gs":
+            raise ValueError(f"Expected scheme 'gs', got '{obj.scheme}' for remote={remote_filepath}")
+
+        bucket_name = obj.netloc
+        key = obj.path.lstrip("/")
+
+        client = storage.Client(**self._storage_options)
+        bucket = client.bucket(bucket_name)
+        blob = bucket.blob(key)
+
+        # GCS uses end as *inclusive*, so end = offset + length - 1
+        end = offset + length - 1
+
+        return blob.download_as_bytes(start=offset, end=end)
+

 class AzureDownloader(Downloader):
     def __init__(
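
The base-class `download_bytes` fallback means a backend that only implements `download_file` still serves byte-range requests, just without the bandwidth saving, while the S3 and GCS overrides issue genuine ranged reads (both ends inclusive, i.e. `bytes=offset-(offset + length - 1)`). A hedged sketch of a hypothetical backend that relies on the fallback:

import shutil

from litdata.streaming.downloader import Downloader


class CopyDownloader(Downloader):
    """Hypothetical backend that only copies whole files (e.g. from a mounted volume)."""

    def download_file(self, remote_chunkpath: str, local_chunkpath: str) -> None:
        shutil.copyfile(remote_chunkpath, local_chunkpath)

    # No `download_bytes` override: the inherited fallback downloads the full file via
    # `download_file`, then reads the requested (offset, length) slice from the local copy.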

src/litdata/streaming/item_loader.py

Lines changed: 24 additions & 0 deletions

@@ -107,6 +107,14 @@ def load_item_from_chunk(
     ) -> Any:
         """Returns an item loaded from a chunk."""

+    def load_item_from_bytes(
+        self,
+        raw_bytes: bytes,
+        chunk_index: int,
+    ) -> Any:
+        """Returns an item loaded from bytes."""
+        raise NotImplementedError("The `load_item_from_bytes` method is not implemented for this item loader.")
+
     @abstractmethod
     def delete(self, chunk_index: int, chunk_filepath: str) -> None:
         """Delete a chunk from the local filesystem."""
@@ -143,6 +151,22 @@ def generate_intervals(self) -> List[Interval]:
     def pre_load_chunk(self, chunk_index: int, chunk_filepath: str) -> None:
         pass

+    def load_item_from_bytes(
+        self,
+        raw_bytes: bytes,
+        chunk_index: int,
+    ) -> bytes:
+        if self._config.get("encryption"):
+            raise ValueError("The `load_item_from_bytes` method does not support encrypted data loading currently.")
+
+        # check for mosaic mds format
+        if "format" in self._config and self._config["format"] == "mds":
+            item_data = self.mds_deserialize(raw_bytes, chunk_index)
+        else:
+            item_data = self.deserialize(raw_bytes)
+
+        return item_data
+
     def load_item_from_chunk(
         self,
         index: int,