Skip to content

Commit 6cc3a46

Browse files
l0rinc (Lőrinc) and Lőrinc authored
Optimize regular expressions used for splitting by ~20% (#234)
By combining the contractions into a single non-capturing group prefixed by `'`, we can speed up matching by roughly 20%. By using possessive quantifiers in the word and punctuation groups of `cl100k_base`, we avoid some backtracking. The last whitespace groups can also be simplified so that only a single newline is matched explicitly, since the preceding `\s*` already matches any earlier newlines. Overall, the regex matches exactly the same sequence of characters as before, for any case and for Unicode sequences. Co-authored-by: Lőrinc <lorinc.pap@gmail.com>
1 parent db5bda9 commit 6cc3a46

File tree

1 file changed

+8
-5
lines changed

1 file changed

+8
-5
lines changed

tiktoken_ext/openai_public.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@ def gpt2():
1717
return {
1818
"name": "gpt2",
1919
"explicit_n_vocab": 50257,
20-
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
20+
# The pattern in the original GPT-2 release is:
21+
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
22+
# This is equivalent, but executes faster:
23+
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
2124
"mergeable_ranks": mergeable_ranks,
2225
"special_tokens": {ENDOFTEXT: 50256},
2326
}
@@ -31,7 +34,7 @@ def r50k_base():
3134
return {
3235
"name": "r50k_base",
3336
"explicit_n_vocab": 50257,
34-
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
37+
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
3538
"mergeable_ranks": mergeable_ranks,
3639
"special_tokens": {ENDOFTEXT: 50256},
3740
}
@@ -45,7 +48,7 @@ def p50k_base():
4548
return {
4649
"name": "p50k_base",
4750
"explicit_n_vocab": 50281,
48-
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
51+
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
4952
"mergeable_ranks": mergeable_ranks,
5053
"special_tokens": {ENDOFTEXT: 50256},
5154
}
@@ -59,7 +62,7 @@ def p50k_edit():
5962
special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
6063
return {
6164
"name": "p50k_edit",
62-
"pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
65+
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
6366
"mergeable_ranks": mergeable_ranks,
6467
"special_tokens": special_tokens,
6568
}
@@ -79,7 +82,7 @@ def cl100k_base():
7982
}
8083
return {
8184
"name": "cl100k_base",
82-
"pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
85+
"pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""",
8386
"mergeable_ranks": mergeable_ranks,
8487
"special_tokens": special_tokens,
8588
}

0 commit comments

Comments
 (0)