@@ -132,15 +132,19 @@ def create_data(self, examples):
             )
         else:
             new_messages = [
+                {
+                    "role": "system",
+                    "content": INTRO_BLURB + "\n",
+                },
                 {
                     "role": "user",
                     "content": examples["instruction"]
-                    + "\n\n"
+                    + "\n"
                     + INPUT_KEY
                     + examples["context"]
-                    + "\n\n",
+                    + "\n",
                 },
-                {"role": "assistant", "content": examples["response"] + "\n\n"},
+                {"role": "assistant", "content": examples["response"] + "\n"},
             ]
 
         return new_messages
@@ -162,7 +166,6 @@ def tokenize_func(self, tokenizer, message):
             message,
             tokenize=False,
         )
-        print(new_tokenizer)
         return tokenizer(
             new_tokenizer, add_special_tokens=False, max_length=self.config.get("max_length")
         )
@@ -251,21 +254,9 @@ def prepare_dataloader(self, tokenizer, dataset):
 
 
 class SlimOrcaDataPreprocess(ChatDataPreprocess):
-    chat_template = (
-        "{% for message in messages %}"
-        "{% if message['role'] == 'system' %}"
-        "{{ '### System: ' + message['content'] }}"
-        "{% elif message['role'] == 'user' %}"
-        "{{ '### User: ' + message['content'] }}"
-        "{% elif message['role'] == 'assistant' %}"
-        "{{ '### Assistant: ' + message['content'] }}"
-        "{% endif %}"
-        "{% endfor %}"
-    )
 
     def __init__(self, config):
         super().__init__(config)
-        self.config["chat_template"] = self.chat_template
         self.default_system = "You are a helpful, respectful and honest assistant."
 
     def create_data(self, data):
@@ -286,22 +277,26 @@ def create_data(self, data):
             examples[conv[j]["from"]] = conv[j]["value"]
             examples[conv[j + 1]["from"]] = conv[j + 1]["value"]
 
-        new_messages = [
-            {"role": "system", "content": examples["system"] + "\n"},
-            {
-                "role": "user",
-                "content": examples["human"] + "\n",
-            },
-            {"role": "assistant", "content": examples["gpt"] + "\n"},
-        ]
         if self.config.get("gpt_base_model"):
             if examples["human"]:
-                return SLIMORCA_PROMPT_DICT["prompt_with_input"].format(
-                    system=examples["system"], user=examples["human"], gpt=examples["gpt"]
+                return PROMPT_WITH_INPUT_FORMAT.format(
+                    instruction=examples["system"], response=examples["gpt"], input=examples["human"]
                 )
             else:
-                return SLIMORCA_PROMPT_DICT["prompt_with_input"].format(
-                    system=examples["human"], gpt=examples["gpt"]
+                return PROMPT_NO_INPUT_FORMAT.format(
+                    instruction=examples["system"], response=examples["gpt"]
                )
         else:
+            new_messages = [
+                {"role": "system", "content": INTRO_BLURB + "\n"},
+                {
+                    "role": "user",
+                    "content": examples["system"]
+                    + "\n"
+                    + INPUT_KEY
+                    + examples["human"]
+                    + "\n",
+                },
+                {"role": "assistant", "content": examples["gpt"] + "\n"},
+            ]
             return new_messages
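Note: a minimal sketch of how the message list built by create_data is consumed downstream via tokenize_func. The model name, the INTRO_BLURB / INPUT_KEY values, and the max_length are illustrative placeholders, not taken from this patch; truncation=True is added here only so the length cap actually applies, whereas the patch passes max_length alone.

    from transformers import AutoTokenizer

    # Placeholder values -- the real constants live in the project's prompt definitions.
    INTRO_BLURB = "Below is an instruction that describes a task."
    INPUT_KEY = "Input:"

    # Any tokenizer whose config ships a chat template works for this sketch.
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

    messages = [
        {"role": "system", "content": INTRO_BLURB + "\n"},
        {"role": "user", "content": "Summarize the passage." + "\n" + INPUT_KEY + "Some context." + "\n"},
        {"role": "assistant", "content": "A short summary." + "\n"},
    ]

    # Roughly mirrors tokenize_func: render the chat template to text, then tokenize it.
    text = tokenizer.apply_chat_template(messages, tokenize=False)
    encoded = tokenizer(text, add_special_tokens=False, truncation=True, max_length=512)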