
Commit 518a1c3

tchaton and awaelchli authored
Enforce passing item_loader when customizing underlying storage format (#296)
Co-authored-by: awaelchli <aedu.waelchli@gmail.com>
1 parent 44ef1af commit 518a1c3

File tree: 9 files changed (+160, −22 lines)


README.md

Lines changed: 63 additions & 0 deletions
@@ -311,6 +311,69 @@ for batch_idx, batch in enumerate(dataloader):
 </details>
 
 
+<details>
+<summary> ✅ LLM Pre-training </summary>
+&nbsp;
+
+LitData is highly optimized for LLM pre-training. First, tokenize the entire dataset; then you can consume it.
+
+```python
+import json
+from pathlib import Path
+import zstandard as zstd
+from litdata import optimize, TokensLoader
+from tokenizer import Tokenizer
+from functools import partial
+
+# 1. Define a function to convert the text within the jsonl files into tokens
+def tokenize_fn(filepath, tokenizer=None):
+    with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
+        for row in f:
+            data = json.loads(row)
+            if data["meta"]["redpajama_set_name"] == "RedPajamaGithub":
+                continue  # exclude the GitHub data since it overlaps with starcoder
+            text_ids = tokenizer.encode(data["text"], bos=False, eos=True)
+            yield text_ids
+
+if __name__ == "__main__":
+    # 2. Generate the inputs (we are going to optimize all the compressed json files from the SlimPajama dataset)
+    input_dir = "./slimpajama-raw"
+    inputs = [str(file) for file in Path(f"{input_dir}/SlimPajama-627B/train").rglob("*.zst")]
+
+    # 3. Store the optimized data wherever you want, e.g. under "/teamspace/datasets" or "/teamspace/s3_connections"
+    optimize(
+        fn=partial(tokenize_fn, tokenizer=Tokenizer(f"{input_dir}/checkpoints/Llama-2-7b-hf")),  # Note: you can use the HF tokenizer or any other
+        inputs=inputs,
+        output_dir="./slimpajama-optimized",
+        chunk_size=(2049 * 8012),
+        # This is important: it informs LitData that we are encoding a contiguous 1D array (tokens).
+        # LitData then skips storing per-sample metadata, i.e. all the tokens are concatenated to form one large tensor.
+        item_loader=TokensLoader(),
+    )
+```
+
+```python
+import os
+from litdata import StreamingDataset, StreamingDataLoader, TokensLoader
+from tqdm import tqdm
+
+# Increase the block size by one because we also need the next token (the target)
+dataset = StreamingDataset(
+    input_dir="./slimpajama-optimized/train",
+    item_loader=TokensLoader(block_size=2048 + 1),
+    shuffle=True,
+    drop_last=True,
+)
+
+train_dataloader = StreamingDataLoader(dataset, batch_size=8, pin_memory=True, num_workers=os.cpu_count())
+
+# Iterate over the SlimPajama dataset
+for batch in tqdm(train_dataloader):
+    pass
+```
+
+</details>
+
 <details>
 <summary> ✅ Combine datasets</summary>
 &nbsp;

src/litdata/processing/data_processor.py

Lines changed: 9 additions & 0 deletions
@@ -50,6 +50,7 @@
 from litdata.streaming.cache import Dir
 from litdata.streaming.client import S3Client
 from litdata.streaming.dataloader import StreamingDataLoader
+from litdata.streaming.item_loader import BaseItemLoader
 from litdata.streaming.resolver import _resolve_dir
 from litdata.utilities._pytree import tree_flatten, tree_unflatten, treespec_loads
 from litdata.utilities.broadcast import broadcast_object
@@ -399,6 +400,7 @@ def __init__(
         use_checkpoint: bool = False,
         checkpoint_chunks_info: Optional[List[Dict[str, Any]]] = None,
         checkpoint_next_index: Optional[int] = None,
+        item_loader: Optional[BaseItemLoader] = None,
     ) -> None:
         """The BaseWorker is responsible to process the user data."""
         self.worker_index = worker_index
@@ -424,6 +426,7 @@ def __init__(
         self.remove_queue: Queue = Queue()
         self.progress_queue: Queue = progress_queue
         self.error_queue: Queue = error_queue
+        self.item_loader = item_loader
         self._counter = 0
         self._last_time = time()
         self._index_counter = 0
@@ -522,6 +525,7 @@ def _create_cache(self) -> None:
             compression=self.data_recipe.compression,
             encryption=self.data_recipe.encryption,
             writer_chunk_index=self.writer_starting_chunk_index,
+            item_loader=self.item_loader,
         )
         self.cache._reader._rank = _get_node_rank() * self.num_workers + self.worker_index
 
@@ -880,6 +884,7 @@ def __init__(
         reader: Optional[BaseReader] = None,
         state_dict: Optional[Dict[int, int]] = None,
         use_checkpoint: bool = False,
+        item_loader: Optional[BaseItemLoader] = None,
         start_method: Optional[str] = None,
     ):
         """The `DatasetOptimiser` provides an efficient way to process data across multiple machine into chunks to make
@@ -902,6 +907,8 @@ def __init__(
             state_dict: The writer state dict. This is used to decide how to append data to an existing dataset.
             use_checkpoint: Whether to create checkpoints while processing the data, which can be used to resume the
                 processing from the last checkpoint if the process is interrupted. (`Default: False`)
+            item_loader: The item loader that will be used during loading in StreamingDataset. Determines
+                the format in which the data is stored and optimized for loading.
             start_method: The start method used by python multiprocessing package. Default to spawn unless running
                 inside an interactive shell like Ipython.
@@ -937,6 +944,7 @@ def __init__(
         self.use_checkpoint = use_checkpoint
         self.checkpoint_chunks_info: Optional[List[List[Dict[str, Any]]]] = None
         self.checkpoint_next_index: Optional[List[int]] = None
+        self.item_loader = item_loader
 
         self.state_dict = state_dict or {rank: 0 for rank in range(self.num_workers)}
 
@@ -1157,6 +1165,7 @@ def _create_process_workers(self, data_recipe: DataRecipe, workers_user_items: L
                 self.use_checkpoint,
                 self.checkpoint_chunks_info[worker_idx] if self.checkpoint_chunks_info else None,
                 self.checkpoint_next_index[worker_idx] if self.checkpoint_next_index else None,
+                self.item_loader,
             )
             worker.start()
             workers.append(worker)
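
Note: the sketch below illustrates how the new argument is threaded through. Only the `item_loader` keyword comes from this diff; the `input_dir`/`output_dir` arguments and the commented `run()` call are assumptions about the surrounding `DataProcessor` API, shown for illustration only.

```python
from litdata.processing.data_processor import DataProcessor
from litdata.streaming.item_loader import TokensLoader

# DataProcessor forwards `item_loader` to every BaseWorker, and each worker's
# `_create_cache` passes it on to its Cache, so the chunks get written in the
# loader's storage format.
processor = DataProcessor(
    input_dir="./slimpajama-raw",         # assumed argument, for illustration
    output_dir="./slimpajama-optimized",  # assumed argument, for illustration
    item_loader=TokensLoader(),           # new in this commit
)
# processor.run(...)  # a DataRecipe would be passed here; omitted in this sketch
```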

src/litdata/processing/functions.py

Lines changed: 5 additions & 0 deletions
@@ -38,6 +38,7 @@
 )
 from litdata.streaming.client import S3Client
 from litdata.streaming.dataloader import StreamingDataLoader
+from litdata.streaming.item_loader import BaseItemLoader
 from litdata.streaming.resolver import (
     Dir,
     _assert_dir_has_index_file,
@@ -311,6 +312,7 @@ def optimize(
     batch_size: Optional[int] = None,
     mode: Optional[Literal["append", "overwrite"]] = None,
     use_checkpoint: bool = False,
+    item_loader: Optional[BaseItemLoader] = None,
     start_method: Optional[str] = None,
 ) -> None:
     """This function converts a dataset into chunks, possibly in a distributed way.
@@ -341,6 +343,8 @@ def optimize(
             Defaults to None.
         use_checkpoint: Whether to create checkpoints while processing the data, which can be used to resume the
             processing from the last checkpoint if the process is interrupted. (`Default: False`)
+        item_loader: The item loader that will be used during loading in StreamingDataset. Determines
+            the format in which the data is stored and optimized for loading.
         start_method: The start method used by python multiprocessing package. Default to spawn unless running
             inside an interactive shell like Ipython.
@@ -433,6 +437,7 @@ def optimize(
         reader=reader,
         state_dict=state_dict,
         use_checkpoint=use_checkpoint,
+        item_loader=item_loader,
         start_method=start_method,
     )
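
Note: most users will reach this parameter through `optimize()`. A minimal sketch of the new call shape (the tokenizer function and paths are placeholders):

```python
from litdata import optimize, TokensLoader

def tokenize_fn(filepath):
    # Placeholder: yield one 1D token tensor per document found in `filepath`.
    ...

optimize(
    fn=tokenize_fn,
    inputs=["./raw/shard-0.jsonl.zst"],  # placeholder input files
    output_dir="./optimized",
    chunk_size=2049 * 8012,
    # Declares the storage format up front; the same loader class must be used
    # when streaming the chunks back (see the config.py validation below).
    item_loader=TokensLoader(),
)
```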

src/litdata/streaming/cache.py

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ def __init__(
             encryption=encryption,
             serializers=serializers,
             chunk_index=writer_chunk_index or 0,
+            item_loader=item_loader,
         )
         self._reader = BinaryReader(
             self._cache_dir,
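
Note: `Cache` now hands the loader to its `BinaryWriter` (shown above) as well as its `BinaryReader`, so both sides agree on the storage format. A hedged sketch of direct usage; the positional directory argument and `chunk_size` keyword are assumptions about the `Cache` signature, only the `item_loader` keyword is taken from this diff:

```python
from litdata.streaming.cache import Cache
from litdata.streaming.item_loader import TokensLoader

# Writer and reader share one loader, so items round-trip in the same format.
cache = Cache("./cache-dir", chunk_size=2049 * 8012, item_loader=TokensLoader(block_size=2049))
```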

src/litdata/streaming/config.py

Lines changed: 11 additions & 6 deletions
@@ -257,12 +257,17 @@ def __len__(self) -> int:
 
     def _validate_item_loader(self) -> None:
         assert self._config
-        if (
-            len(self._config["data_format"]) == 1
-            and self._config["data_format"][0].startswith("no_header_tensor")
-            and not isinstance(self._item_loader, TokensLoader)
-        ):
-            raise ValueError("Please, use Cache(..., item_loader=TokensLoader(block_size=...))")
+        if "item_loader" in self._config:
+            if self._item_loader.__class__.__name__ != self._config["item_loader"]:
+                item_loader = self._config["item_loader"]
+                raise ValueError(f"Please, use Cache(..., item_loader={item_loader}(...))")
+        else:
+            if (
+                len(self._config["data_format"]) == 1
+                and self._config["data_format"][0].startswith("no_header_tensor")
+                and not isinstance(self._item_loader, TokensLoader)
+            ):
+                raise ValueError("Please, use Cache(..., item_loader=TokensLoader(block_size=...))")
 
 
 def load_subsampled_chunks(subsampled_files: List[str], original_chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
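
Note: the effect of the new branch is that a dataset optimized with one loader can no longer be silently read with another. A hedged sketch, assuming a dataset that was optimized with `TokensLoader` as in the README example above:

```python
from litdata import StreamingDataset, TokensLoader

# Matching loader: streams fine.
ds = StreamingDataset("./optimized", item_loader=TokensLoader(block_size=2049))

# Omitting the loader falls back to the default PyTreeLoader; its class name no
# longer matches the "item_loader" recorded in the dataset config, so validation
# now fails with: ValueError: Please, use Cache(..., item_loader=TokensLoader(...))
ds = StreamingDataset("./optimized")
```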

src/litdata/streaming/item_loader.py

Lines changed: 20 additions & 1 deletion
@@ -100,6 +100,10 @@ def delete(self, chunk_index: int, chunk_filepath: str) -> None:
         """Delete a chunk from the local filesystem."""
         pass
 
+    @abstractmethod
+    def encode_data(self, data: List[bytes], sizes: List[int], flattened: List[Any]) -> Any:
+        pass
+
 
 class PyTreeLoader(BaseItemLoader):
     """The Pytree Loader is the default loader of the Cache object."""
@@ -245,9 +249,16 @@ def _validate_encryption(self, encryption: Optional[Encryption]) -> None:
             if encryption.level != self._config["encryption"]["level"]:
                 raise ValueError("Encryption level mismatch.")
 
+    @classmethod
+    def encode_data(cls, data: List[bytes], sizes: List[int], flattened: List[Any]) -> Tuple[bytes, Optional[int]]:
+        # Concatenate into a single byte array
+        head = np.array(sizes, np.uint32).tobytes()
+        body = b"".join(data)
+        return head + body, None
+
 
 class TokensLoader(BaseItemLoader):
-    def __init__(self, block_size: int):
+    def __init__(self, block_size: Optional[int] = None):
         """The Tokens Loader is an optimized item loader for NLP.
 
         Arguments:
@@ -263,6 +274,7 @@ def __init__(self, block_size: Optional[int] = None):
         self._chunk_filepaths: Dict[str, bool] = {}
 
     def state_dict(self) -> Dict:
+        assert self._block_size
         return {
             "block_size": self._block_size,
         }
@@ -280,6 +292,7 @@ def setup(
             raise ValueError("The provided chunks isn't properly setup.")
 
     def generate_intervals(self) -> List[Interval]:
+        assert self._block_size
         intervals = []
         begin = 0
         end = 0
@@ -324,6 +337,8 @@ def load_item_from_chunk(
         begin: int,
         chunk_bytes: int,
     ) -> torch.Tensor:
+        assert self._block_size
+
         if chunk_filepath in self._chunk_filepaths and not os.path.isfile(chunk_filepath):
             del self._chunk_filepaths[chunk_filepath]
 
@@ -350,3 +365,7 @@ def delete(self, chunk_index: int, chunk_filepath: str) -> None:
         if chunk_index in self._mmaps:
             del self._mmaps[chunk_index]
         os.remove(chunk_filepath)
+
+    @classmethod
+    def encode_data(cls, data: List[bytes], _: List[int], flattened: List[Any]) -> Tuple[bytes, Optional[int]]:
+        return data[0], flattened[0].shape[0]
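
Note: a toy illustration of the two `encode_data` strategies defined above (values chosen arbitrarily; both calls exercise the classmethods from this diff):

```python
import numpy as np
import torch

from litdata.streaming.item_loader import PyTreeLoader, TokensLoader

tokens = torch.tensor([1, 2, 3, 4], dtype=torch.int64)
data = [tokens.numpy().tobytes()]  # 4 tokens x 8 bytes = 32 bytes

# PyTreeLoader prepends a uint32 size header so each sample can be sliced
# back out of the chunk later.
blob, dim = PyTreeLoader.encode_data(data, [len(data[0])], [tokens])
assert blob[:4] == np.array([32], np.uint32).tobytes() and dim is None

# TokensLoader stores the raw bytes with no per-sample header and reports the
# token count instead, since the whole chunk is one contiguous 1D array.
blob, dim = TokensLoader.encode_data(data, [], [tokens])
assert blob == data[0] and dim == 4
```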

src/litdata/streaming/writer.py

Lines changed: 5 additions & 13 deletions
@@ -20,11 +20,11 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
-import torch
 
 from litdata.constants import _INDEX_FILENAME
 from litdata.processing.utilities import get_worker_rank
 from litdata.streaming.compression import _COMPRESSORS, Compressor
+from litdata.streaming.item_loader import BaseItemLoader, PyTreeLoader
 from litdata.streaming.serializers import Serializer, _get_serializers
 from litdata.utilities._pytree import PyTree, tree_flatten, treespec_dumps
 from litdata.utilities.encryption import Encryption, EncryptionLevel
@@ -54,6 +54,7 @@ def __init__(
         follow_tensor_dimension: bool = True,
         serializers: Optional[Dict[str, Serializer]] = None,
         chunk_index: Optional[int] = None,
+        item_loader: Optional[BaseItemLoader] = None,
     ):
         """The BinaryWriter enables to chunk dataset into an efficient streaming format for cloud training.
 
@@ -83,6 +84,7 @@ def __init__(
         self._chunk_bytes = _convert_bytes_to_int(chunk_bytes) if isinstance(chunk_bytes, str) else chunk_bytes
         self._compression = compression
         self._encryption = encryption
+        self._item_loader = item_loader or PyTreeLoader()
 
         self._data_format: Optional[List[str]] = None
         self._data_spec: Optional[PyTree] = None
@@ -148,6 +150,7 @@ def get_config(self) -> Dict[str, Any]:
             "data_format": self._data_format,
             "data_spec": treespec_dumps(self._data_spec) if self._data_spec else None,
             "encryption": self._encryption.state_dict() if self._encryption else None,
+            "item_loader": self._item_loader.__class__.__name__,
         }
 
     def serialize(self, items: Any) -> Tuple[bytes, Optional[int]]:
@@ -156,10 +159,6 @@ def serialize(self, items: Any) -> Tuple[bytes, Optional[int]]:
         # Flatten the items provided by the users
         flattened, data_spec = tree_flatten(items)
 
-        is_single_tensor = (
-            len(flattened) == 1 and isinstance(flattened[0], torch.Tensor) and len(flattened[0].shape) == 1
-        )
-
         # Collect the sizes and associated bytes for each item
         sizes: List[int] = []
         data: List[bytes] = []
@@ -178,14 +177,7 @@ def serialize(self, items: Any) -> Tuple[bytes, Optional[int]]:
         # tiny optimization to avoid looping over all the data format
         self._serialize_with_data_format(flattened, sizes, data, self._data_format)
 
-        # If there is a single element and it is a tensor, enable continuous array.
-        if is_single_tensor:
-            return data[0], flattened[0].shape[0]
-
-        # Concatenate into a single byte array
-        head = np.array(sizes, np.uint32).tobytes()
-        body = b"".join(data)
-        return head + body, None
+        return self._item_loader.encode_data(data, sizes, flattened)
 
     def _serialize(self, item: Any, sizes: List[int], data: List[bytes]) -> str:
         """Serialize a given item and append its size and bytes to the sizes and data array."""

tests/streaming/test_cache.py

Lines changed: 1 addition & 0 deletions
@@ -211,6 +211,7 @@ def test_cache_with_auto_wrapping(tmpdir):
     assert sorted(os.listdir(os.path.join(tmpdir, "cache_1"))) == [
         "chunk-0-0.bin",
         "chunk-0-1.bin",
+        "chunk-0-2.bin",
         "index.json",
     ]
     # Your dataset is optimised for the cloud
