Closed
Description
🐛 Bug
To Reproduce
- Clone this Studio template: https://lightning.ai/lightning-ai/studios/how-to-scrape-web-data-to-finetune-llms?view=public&section=featured
- Run: pip install -U lightning-sdk lightning litdata
- Run main.py
- Run optimize.py (code sample below)
Code sample
import requests, os
from bs4 import BeautifulSoup
from litdata import optimize
from requests.exceptions import SSLError, ConnectionError, ReadTimeout
from urllib3.exceptions import HeaderParsingError
import pandas as pd
from lightning_sdk import Machine
from lit_gpt import Tokenizer
from functools import partial
# 1. List of the text files to tokenize (one file per scraped page,
#    produced by main.py in the Studio template).
input_dir = f"/teamspace/datasets/{os.getenv('LIGHTNING_USERNAME', 'undefined')}/website-data"
files = [os.path.join(input_dir, filepath) for filepath in os.listdir(input_dir)]
# 2. Define the tokenize function
def tokenize_fn(filepath, tokenizer=None):
    """Read one text file and return its token ids.

    NOTE(repro): *returning* the encoded sequence is what triggers the
    ``IndexError: tuple index out of range`` reported below; switching to
    ``yield`` (see the commented-out line) works around it.
    ``tokenizer`` is always supplied via ``functools.partial`` in this
    script, so the ``None`` default is never exercised here.
    """
    with open(filepath, "r") as f:
        text = f.read()
    encoded = tokenizer.encode(text, bos=False, eos=True)
    return encoded
    # yield encoded # <------------ Works if you yield instead of return
# 3. Use the optimize operator to apply the `tokenize_fn` over all the files and write its return into chunks
optimize(
    # Bind the tokenizer once so workers only receive the file path.
    fn=partial(tokenize_fn, tokenizer=Tokenizer("./checkpoints/Llama-2-7b-hf")),
    inputs=files,
    output_dir=f"/teamspace/datasets/{os.getenv('LIGHTNING_USERNAME', 'undefined')}/website-data-optimized2",
    num_workers=1,  # single worker keeps the repro deterministic
    chunk_size=2049 * 1024,  # tokens per chunk: block size (2048 + 1) x 1024 blocks
    reorder_files=True,
)
Error:
Starting 1 workers with 8 items.
Workers are ready ! Starting data processing...
Rank 0 inferred the following `['pickle']` data format. | 0/8 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 616, in _handle_data_chunk_recipe
chunk_filepath = self.cache._add_item(self._index_counter, item_data)
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/streaming/cache.py", line 129, in _add_item
return self._writer.add_item(index, data)
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/streaming/writer.py", line 286, in add_item
data, dim = self.serialize(items)
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/streaming/writer.py", line 170, in serialize
return data[0], flattened[0].shape[0]
IndexError: tuple index out of range
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 409, in run
self._loop()
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 458, in _loop
self._handle_data_chunk_recipe(index)
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 624, in _handle_data_chunk_recipe
raise RuntimeError(f"Failed processing {self.items[index]}") from e
RuntimeError: Failed processing /cache/data/website-data/7.txt
Worker 0 is done.
Traceback (most recent call last):
File "/teamspace/studios/this_studio/optimize.py", line 25, in <module>
optimize(
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/functions.py", line 355, in optimize
data_processor.run(
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 960, in run
self._exit_on_error(error)
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 1020, in _exit_on_error
raise RuntimeError(f"We found the following error {error}.")
RuntimeError: We found the following error Traceback (most recent call last):
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 616, in _handle_data_chunk_recipe
chunk_filepath = self.cache._add_item(self._index_counter, item_data)
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/streaming/cache.py", line 129, in _add_item
return self._writer.add_item(index, data)
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/streaming/writer.py", line 286, in add_item
data, dim = self.serialize(items)
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/streaming/writer.py", line 170, in serialize
return data[0], flattened[0].shape[0]
IndexError: tuple index out of range
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 409, in run
self._loop()
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 458, in _loop
self._handle_data_chunk_recipe(index)
File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 624, in _handle_data_chunk_recipe
raise RuntimeError(f"Failed processing {self.items[index]}") from e
RuntimeError: Failed processing /cache/data/website-data/7.txt
If you change the return statement to yield in the processing function, it works. I failed to make a minimal repro outside this Studio.
Expected behavior
Both return and yield should work depending on whether the user wants to return one or more examples from the preprocessing function.
Environment
- PyTorch Version (e.g., 1.0): 2.2.1
- OS (e.g., Linux): Linux
- How you installed PyTorch (conda, pip, source): pip
- Build command you used (if compiling from source):
- Python version: 3.10
- CUDA/cuDNN version:
- GPU models and configuration:
- Any other relevant information:
Additional context
litdata 0.2.2
lightning 2.2.1
lightning-cloud 0.5.64
lightning_sdk 0.1.2