Skip to content

Commit

Permalink
Fix loading of the old C4 dataset by pinning the dataset revision
Browse files Browse the repository at this point in the history
  • Loading branch information
Vahe1994 committed Jan 12, 2024
1 parent f99ff50 commit 3eb7268
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion src/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@ def get_ptb(nsamples, seqlen, tokenizer, eval_mode=False):
def get_c4(nsamples, seqlen, tokenizer, eval_mode=False):
if not eval_mode:
traindata = load_dataset(
"allenai/c4", "allenai--c4", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, split="train"
"allenai/c4",
"allenai--c4",
data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
split="train",
revision="607bd4c8450a42878aa9ddc051a65a055450ef87",
)
trainloader = []
for _ in range(nsamples):
Expand All @@ -80,6 +84,7 @@ def get_c4(nsamples, seqlen, tokenizer, eval_mode=False):
"allenai--c4",
data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
split="validation",
revision="607bd4c8450a42878aa9ddc051a65a055450ef87",
)
random.seed(0)
valenc = []
Expand Down

0 comments on commit 3eb7268

Please sign in to comment.