|
7 | 7 |
|
class TestTokenizeFunction(unittest.TestCase):
    """Tests for GeneralProcesser.tokenize_function across prompt/chat-template modes.

    NOTE(review): relies on network access to download tokenizers from the
    Hugging Face hub ('meta-llama/Llama-2-7b-hf', 'EleutherAI/gpt-j-6b',
    'google/gemma-2b-it') — confirm CI has hub access/credentials.
    """

    def setUp(self):
        # Default tokenizer; individual tests may swap in a different one.
        self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
        # Baseline processor config; tests mutate copies of these keys
        # (e.g. flip "gpt_base_model") to exercise each code path.
        self.config = {
            "gpt_base_model": True,
            "max_length": 512,
            "trust_remote_code": False,
            # Jinja chat template used when "gpt_base_model" is False and no
            # custom template is supplied.
            "chat_template": "Below is an instruction that describes a task. Write a response that appropriately "
            "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception("
            "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message["
            "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles "
            "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] "
            "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == "
            "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### "
            "End \n'}}",
        }
        self.processer = GeneralProcesser(self.config)

    def test_tokenize_function_with_gpt_model(self):
        """GPT base-model path: plain instruction/input/response prompt format."""
        self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b")

        examples = {
            "instruction": "Test instruction",
            "response": "Test response",
            "context": "Test context",
        }

        # Verify the format of the result
        expected_result = (
            "Below is an instruction that describes a task. Write a response that "
            "appropriately completes the request.\n"
            "\n"
            "### Instruction:\n"
            "Test instruction\n"
            "\n"
            "Input:\n"
            "Test context\n"
            "\n"
            "### Response:\n"
            "Test response\n"
            "\n"
            "### End"
        )

        result = self.processer.tokenize_function(examples, self.tokenizer)
        # Round-trip through the tokenizer to compare rendered text.
        self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result)

    def test_tokenize_function_with_custom_chat_template(self):
        """custom_chat_template overrides the config's default chat_template."""
        examples = {
            "instruction": "Test instruction",
            "response": "Test response",
            "context": "Test context",
        }

        # Verify the format of the result
        expected_result = (
            "<|im_start|>user\n"
            "###Instruction:\n"
            "Test instruction\n"
            "\n"
            "###context:\n"
            "Test context\n"
            "\n"
            "<|im_end|><|im_start|>assistant\n"
            "Test response\n"
            "\n"
            "<|im_end|>"
        )
        # Set custom chat template
        self.config["custom_chat_template"] = (
            "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'"
            "+ message['content'] + '<|im_end|>'}}{% endfor %}"
        )

        self.config["gpt_base_model"] = False
        result = self.processer.tokenize_function(examples, self.tokenizer)
        self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result)

    def test_tokenize_function_with_chat_template(self):
        """Non-GPT path using the chat_template supplied in setUp's config."""
        examples = {
            "instruction": "Test instruction",
            "response": "Test response",
            "context": "Test context",
        }

        # Verify the format of the result
        expected_result = (
            "Below is an instruction that describes a task. Write a response that "
            "appropriately completes the request\n"
            "### Instruction: ###Instruction:\n"
            "Test instruction\n"
            "\n"
            "###context:\n"
            "Test context\n"
            "\n"
            "### Response: Test response\n"
            "\n"
            "### End \n"
        )
        self.config["gpt_base_model"] = False
        result = self.processer.tokenize_function(examples, self.tokenizer)
        self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result)

    def test_tokenize_function_with_default_chat_template(self):
        """Non-GPT path falling back to the tokenizer's own built-in chat template."""
        self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
        examples = {
            "instruction": "Test instruction",
            "response": "Test response",
            "context": "Test context",
        }

        # The conversation the processor is expected to construct from `examples`.
        chat_example = [
            {
                "role": "user",
                "content": "###Instruction:\nTest instruction\n\n###context:\nTest context\n\n",
            },
            {
                "role": "assistant",
                "content": "Test response\n\n",
            },
        ]

        # Verify the format of the result: expected text comes from the
        # tokenizer's default template applied to the same conversation.
        expected_result = self.tokenizer.apply_chat_template(
            chat_example, tokenize=False, max_length=self.config.get("max_length")
        )

        self.config["gpt_base_model"] = False
        result = self.processer.tokenize_function(examples, self.tokenizer)
        self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result)
136 | 138 |
|
137 | 139 |
|
138 |
# Allow running this test module directly: `python <this_file>.py`.
if __name__ == "__main__":
    unittest.main()
|
0 commit comments