Skip to content

Commit

Permalink
hotfix
Browse files Browse the repository at this point in the history
  • Loading branch information
Guitaricet committed Jan 4, 2024
1 parent ba7769d commit 5c5c42d
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions pretokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import multiprocessing

from loguru import logger
-from datasets import load_dataset
+from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer


Expand Down Expand Up @@ -54,7 +54,8 @@ def main(args):
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
dataset = load_dataset(args.dataset, args.dataset_config)
if args.take is not None:
-    dataset = dataset.select(range(args.take))
+    dataset_dict = {k: v.select(range(args.take)) for k, v in dataset.items()}
+    dataset = DatasetDict(dataset_dict)

logger.info("Tokenizing and chunking the dataset")
_time = time.time()
Expand Down

0 comments on commit 5c5c42d

Please sign in to comment.