generate_tests.py (from a fork of huggingface/transformers.js)
# Helper file to dynamically generate unit tests
# This is done by running the Python `transformers` library and comparing its outputs with ours.

import json
import os

from transformers import AutoTokenizer, AutoConfig

from scripts.supported_models import SUPPORTED_MODELS

# List of tokenizers where the model isn't yet supported, but the tokenizer is
ADDITIONAL_TOKENIZERS_TO_TEST = {
    'RefinedWebModel': [
        'tiiuae/falcon-7b',
    ],
    'llama': [
        'hf-internal-testing/llama-tokenizer',
    ],
    'mpt': [
        'mosaicml/mpt-7b',
    ],
}
TOKENIZER_TEST_DATA = {
    "shared": [
        "hello world",
        "Hello World",
        "How are you doing?",
        "You should've done this",
        "A\n'll !!to?'d''d of, can't.",
        "def main():\n\tpass",
        "This\n\nis\na\ntest.",
        "let a = obj.toString();\ntoString();",
        "Hi Hello",
        "trailing space ",
        " leading space",
        "生活的真谛是",
        "The company was founded in 2016.",
        "test $1 R2 #3 €4 £5 ¥6 ₣7 ₹8 ₱9 test",
        "I bought an apple for $1.00 at the store.",
    ],
    "custom": {
        "tiiuae/falcon-7b": [
            "12 and 123 and 1234",  # Special case for splitting on 3 numbers
        ],
    },
}
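# For a given tokenizer, the test inputs are the 15 shared texts above plus any
# custom texts registered under that tokenizer's name (currently only
# 'tiiuae/falcon-7b' has an extra case).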
def generate_tokenizer_tests():
    results = {}

    tokenizers_to_test = list(SUPPORTED_MODELS.items()) + \
        list(ADDITIONAL_TOKENIZERS_TO_TEST.items())

    for model_type, tokenizer_names in tokenizers_to_test:
        print(f'Generating tests for {model_type}')
        for tokenizer_name in tokenizer_names:
            print(' -', tokenizer_name)

            try:
                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
            except KeyError:
                # If a KeyError is raised from the AutoTokenizer, it means the model
                # does not use a tokenizer (e.g., vision models)
                continue

            tokenizer_results = []

            shared_texts = TOKENIZER_TEST_DATA["shared"]
            custom_texts = TOKENIZER_TEST_DATA["custom"].get(tokenizer_name, [])

            # Run tokenizer on test cases
            for text in shared_texts + custom_texts:
                # TODO: add with_pair option
                encoded = tokenizer(text).data
                decoded_with_special = tokenizer.decode(
                    encoded["input_ids"], skip_special_tokens=False)
                decoded_without_special = tokenizer.decode(
                    encoded["input_ids"], skip_special_tokens=True)

                tokenizer_results.append(dict(
                    input=text,
                    encoded=encoded,
                    decoded_with_special=decoded_with_special,
                    decoded_without_special=decoded_without_special,
                ))

            results[tokenizer_name] = tokenizer_results

    return results
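# Illustrative shape of one entry in the dict returned by
# generate_tokenizer_tests() (the token ids below are hypothetical, not real
# output from this tokenizer):
#
#   results["hf-internal-testing/llama-tokenizer"][0] == {
#       "input": "hello world",
#       "encoded": {"input_ids": [1, 22172, 3186], "attention_mask": [1, 1, 1]},
#       "decoded_with_special": "<s> hello world",
#       "decoded_without_special": "hello world",
#   }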
def generate_config_tests():
    results = {}
    for model_type, config_names in SUPPORTED_MODELS.items():
        for config_name in config_names:
            # Load config
            config = AutoConfig.from_pretrained(config_name)

            results[config_name] = config.to_dict()

            # TODO: Remove after https://github.com/huggingface/transformers/issues/23876 is fixed
            results[config_name].pop('torch_dtype', None)
    return results
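# Illustrative shape of one entry in the dict returned by
# generate_config_tests() (model name and keys are a hypothetical subset of a
# real Hugging Face config):
#
#   results["bert-base-uncased"] == {
#       "model_type": "bert",
#       "hidden_size": 768,
#       "num_attention_heads": 12,
#       ...
#   }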
def main():
    # TODO: add option to cache generated data + force build tests
    data_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "data",
    )
    # Make sure the output directory exists before writing to it
    os.makedirs(data_dir, exist_ok=True)

    tokenizer_tests = generate_tokenizer_tests()
    with open(os.path.join(data_dir, "tokenizer_tests.json"), "w", encoding="utf-8") as fp:
        json.dump(tokenizer_tests, fp)

    config_tests = generate_config_tests()
    with open(os.path.join(data_dir, "config_tests.json"), "w", encoding="utf-8") as fp:
        json.dump(config_tests, fp)


if __name__ == "__main__":
    main()
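# Example invocation (an assumption: run from a directory where the `scripts`
# package is importable, with network access to the Hugging Face Hub):
#
#   python generate_tests.py
#
# This writes data/tokenizer_tests.json and data/config_tests.json next to the
# script; the JavaScript test suite loads these files and compares its own
# tokenizer/config outputs against them.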