Lightning-AI · tchaton · Apr 25, 2025 · Apr 25, 2025
@@ -368,6 +368,12 @@ def _resume(self, workers_chunks: List[List[int]], workers_intervals: List[Any])
         self.worker_chunks = workers_chunks[worker_rank]
         self.worker_intervals = workers_intervals[worker_rank]
 
+        if self.worker_next_chunk_index >= self.num_chunks:
+            # This can happen when interrupting and resuming after some but not all workers are done.
+            # Proceeding would result in an indexing error when attempting to access the next chunk.
+            # To prevent this we exit early and let the worker raise a StopIteration in __next__.
+            return
+
         # replay the indexes for the current chunks
         interval = self.worker_intervals[self.worker_next_chunk_index]
         current_indexes = np.arange(interval[1], interval[2])

@@ -333,3 +333,25 @@ def test_resume_dataloader_with_new_dataset(tmpdir):
     dataloader.load_state_dict(dataloader_state)
     for _ in dataloader:
         assert dataloader.current_epoch == 2, "Current epoch should be 2"
+
+
+def test_resume_dataloader_after_some_workers_are_done(tmpdir):
+    # Regression test for resuming mid-epoch; see https://github.com/Lightning-AI/litData/issues/563
+    dset_path = tmpdir.join("dataset")
+    cache = Cache(input_dir=str(dset_path), chunk_size=1)  # chunk_size=1: one chunk per sample
+    for i in range(3):
+        cache[i] = i
+    cache.done()
+    cache.merge()
+    dset = StreamingDataset(str(dset_path), shuffle=False)
+    dloader = StreamingDataLoader(dset, batch_size=1, num_workers=2, shuffle=False)
+    # worker 0 is assigned samples 0 and 1; worker 1 is assigned sample 2
+    # the workers alternate, so the expected sequence is [0, 2, 1] rather than [0, 1, 2]
+    expected_sequence = [0, 2, 1]
+    for i, x in enumerate(dloader):
+        assert x == expected_sequence[i]
+        if i == 1:  # stop after two batches (samples 0 and 2), leaving only sample 1 unread
+            break
+    dloader.load_state_dict(dloader.state_dict())  # round-trip the state to emulate interrupt + resume
+    for x in dloader:  # NOTE(review): asserts nothing if the resumed loader yields no batches
+        assert x == expected_sequence[2]