We read every piece of feedback and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 1cd2772, commit 3f6e2d2 (Copy full SHA for 3f6e2d2)
datastore/get_datastore_code.py
@@ -41,7 +41,7 @@
41
writer = draftretriever.Writer(
42
index_file_path=datastore_path,
43
max_chunk_len=512 * 1024 * 1024,
44
- vocab_size=tokenizer.vocab_size,
+ vocab_size=tokenizer.vocab_size + len(tokenizer.get_added_vocab()),
45
)
46
47
total_length = len(dataset)
@@ -51,4 +51,4 @@
51
token_list = tokenizer.encode(sample['content'])
52
writer.add_entry(token_list)
53
54
-writer.finalize()
+writer.finalize()
0 commit comments