From 3eb7268834f7a553281409d5dfa719493d8f26c9 Mon Sep 17 00:00:00 2001
From: vahe1994
Date: Fri, 12 Jan 2024 18:10:43 +0400
Subject: [PATCH] old C4 fix, by pinning the revision

---
 src/datautils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/datautils.py b/src/datautils.py
index 4a3a0d10..679f9220 100644
--- a/src/datautils.py
+++ b/src/datautils.py
@@ -57,7 +57,11 @@ def get_ptb(nsamples, seqlen, tokenizer, eval_mode=False):
 def get_c4(nsamples, seqlen, tokenizer, eval_mode=False):
     if not eval_mode:
         traindata = load_dataset(
-            "allenai/c4", "allenai--c4", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, split="train"
+            "allenai/c4",
+            "allenai--c4",
+            data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
+            split="train",
+            revision="607bd4c8450a42878aa9ddc051a65a055450ef87",
         )
         trainloader = []
         for _ in range(nsamples):
@@ -80,6 +84,7 @@ def get_c4(nsamples, seqlen, tokenizer, eval_mode=False):
             "allenai--c4",
             data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
             split="validation",
+            revision="607bd4c8450a42878aa9ddc051a65a055450ef87",
         )
         random.seed(0)
         valenc = []