Skip to content

Commit 9df84c4

Browse files
committed
Remove default chat templates
huggingface/transformers#31733
1 parent 38a3bf6 commit 9df84c4

File tree

1 file changed

+22
-53
lines changed

1 file changed

+22
-53
lines changed

src/tokenizers.js

Lines changed: 22 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -2502,8 +2502,6 @@ function truncateHelper(item, length) {
25022502
export class PreTrainedTokenizer extends Callable {
25032503
return_token_type_ids = false;
25042504

2505-
_default_chat_template = `{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}`;
2506-
25072505
padding_side = 'right';
25082506
/**
25092507
* Create a new PreTrainedTokenizer instance.
@@ -3059,26 +3057,10 @@ export class PreTrainedTokenizer extends Callable {
30593057

30603058
return decoded;
30613059
}
3062-
3063-
get default_chat_template() {
3064-
if (!this._warned_about_chat_template) {
3065-
console.warn(
3066-
"No chat template is defined for this tokenizer - using a default chat template " +
3067-
"that implements the ChatML format. If the default is not appropriate for " +
3068-
"your model, please set `tokenizer.chat_template` to an appropriate template. " +
3069-
"See https://huggingface.co/docs/transformers/main/chat_templating for more information."
3070-
)
3071-
this._warned_about_chat_template = true; // TODO move to logger.warning_once()
3072-
}
3073-
3074-
return this._default_chat_template;
3075-
}
3076-
30773060
/**
30783061
* Converts a list of message objects with `"role"` and `"content"` keys to a list of token
30793062
* ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
3080-
* determine the format and control tokens to use when converting. When chat_template is None, it will fall back
3081-
* to the default_chat_template specified at the class level.
3063+
* determine the format and control tokens to use when converting.
30823064
*
30833065
* See [here](https://huggingface.co/docs/transformers/chat_templating) for more information.
30843066
*
@@ -3105,7 +3087,7 @@ export class PreTrainedTokenizer extends Callable {
31053087
* @param {Message[]} conversation A list of message objects with `"role"` and `"content"` keys.
31063088
* @param {Object} options An optional object containing the following properties:
31073089
* @param {string} [options.chat_template=null] A Jinja template to use for this conversion. If
3108-
* this is not passed, the model's default chat template will be used instead.
3090+
* this is not passed, the model's chat template will be used instead.
31093091
* @param {boolean} [options.add_generation_prompt=false] Whether to end the prompt with the token(s) that indicate
31103092
* the start of an assistant message. This is useful when you want to generate a response from the model.
31113093
* Note that this argument will be passed to the chat template, and so it must be supported in the
@@ -3135,10 +3117,10 @@ export class PreTrainedTokenizer extends Callable {
31353117

31363118
// First, handle the cases when the model has a dict of multiple templates
31373119
if (
3138-
(this.chat_template && typeof this.chat_template === 'object') ||
3139-
(this.chat_template === null && this.default_chat_template && typeof this.default_chat_template === 'object')
3120+
(this.chat_template && typeof this.chat_template === 'object')
3121+
|| this.chat_template === null
31403122
) {
3141-
const template_dict = this.chat_template ?? this.default_chat_template; // Guaranteed to be a non-null object
3123+
const template_dict = this.chat_template;
31423124

31433125
if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
31443126
// The user can pass the name of a template to the chat template argument instead of an entire template
@@ -3154,8 +3136,17 @@ export class PreTrainedTokenizer extends Callable {
31543136
}
31553137
} else {
31563138
// These are the cases when the model has a single template
3157-
// priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template`
3158-
chat_template ??= this.chat_template ?? this.default_chat_template;
3139+
// priority: `chat_template` argument > `tokenizer.chat_template`
3140+
if (this.chat_template) {
3141+
chat_template = this.chat_template;
3142+
} else {
3143+
throw Error(
3144+
"Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
3145+
"argument was passed! For information about writing templates and setting the " +
3146+
"tokenizer.chat_template attribute, please see the documentation at " +
3147+
"https://huggingface.co/docs/transformers/main/en/chat_templating"
3148+
)
3149+
}
31593150
}
31603151
if (typeof chat_template !== 'string') {
31613152
throw Error(`chat_template must be a string, but got ${typeof chat_template}`);
@@ -3250,9 +3241,7 @@ export class ElectraTokenizer extends PreTrainedTokenizer {
32503241
}
32513242

32523243
export class T5Tokenizer extends PreTrainedTokenizer { }
3253-
export class GPT2Tokenizer extends PreTrainedTokenizer {
3254-
_default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}`
3255-
}
3244+
export class GPT2Tokenizer extends PreTrainedTokenizer { }
32563245
export class BartTokenizer extends PreTrainedTokenizer { }
32573246
export class MBartTokenizer extends PreTrainedTokenizer {
32583247
constructor(tokenizerJSON, tokenizerConfig) {
@@ -3278,7 +3267,7 @@ export class MBart50Tokenizer extends MBartTokenizer { } // NOTE: extends MBartT
32783267

32793268
export class RobertaTokenizer extends PreTrainedTokenizer { }
32803269

3281-
export class BloomTokenizer extends GPT2Tokenizer { // NOTE: `GPT2Tokenizer` to get the correct chat template
3270+
export class BloomTokenizer extends PreTrainedTokenizer {
32823271

32833272
constructor(tokenizerJSON, tokenizerConfig) {
32843273
// Override the default (invalid) regex of the pretokenizer.
@@ -3295,20 +3284,11 @@ export class BloomTokenizer extends GPT2Tokenizer { // NOTE: `GPT2Tokenizer` to
32953284
const SPIECE_UNDERLINE = "▁";
32963285

32973286
export class LlamaTokenizer extends PreTrainedTokenizer {
3298-
_default_chat_template = `{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}`
3299-
3300-
DEFAULT_SYSTEM_PROMPT =
3301-
"You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your " +
3302-
"answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure " +
3303-
"that your responses are socially unbiased and positive in nature.\n\n" +
3304-
"If a question does not make any sense, or is not factually coherent, explain why instead of answering something not " +
3305-
"correct. If you don't know the answer to a question, please don't share false information."
33063287

33073288
padding_side = 'left';
33083289

33093290
constructor(tokenizerJSON, tokenizerConfig) {
33103291
super(tokenizerJSON, tokenizerConfig);
3311-
this.use_default_system_prompt = tokenizerConfig.use_default_system_prompt ?? false;
33123292

33133293
this.legacy = tokenizerConfig.legacy ?? true;
33143294
if (!this.legacy) {
@@ -3341,14 +3321,8 @@ export class LlamaTokenizer extends PreTrainedTokenizer {
33413321
}
33423322
return tokens;
33433323
}
3344-
3345-
get default_chat_template() {
3346-
return super.default_chat_template
3347-
.replaceAll('USE_DEFAULT_PROMPT', this.use_default_system_prompt ? 'true' : 'false')
3348-
.replaceAll('DEFAULT_SYSTEM_MESSAGE', this.DEFAULT_SYSTEM_PROMPT.replaceAll("\n", "\\n").replaceAll("'", "\\'"));
3349-
}
33503324
}
3351-
export class CodeLlamaTokenizer extends LlamaTokenizer { } // NOTE: `LlamaTokenizer` to get the correct chat template
3325+
export class CodeLlamaTokenizer extends PreTrainedTokenizer { }
33523326

33533327
export class XLMRobertaTokenizer extends PreTrainedTokenizer { }
33543328
export class MPNetTokenizer extends PreTrainedTokenizer { }
@@ -3361,9 +3335,7 @@ export class EsmTokenizer extends PreTrainedTokenizer { }
33613335

33623336
export class Qwen2Tokenizer extends PreTrainedTokenizer { }
33633337

3364-
export class GemmaTokenizer extends PreTrainedTokenizer {
3365-
_default_chat_template = "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
3366-
}
3338+
export class GemmaTokenizer extends PreTrainedTokenizer { }
33673339

33683340
export class Grok1Tokenizer extends PreTrainedTokenizer { }
33693341

@@ -3491,7 +3463,6 @@ export class M2M100Tokenizer extends PreTrainedTokenizer {
34913463
* @extends PreTrainedTokenizer
34923464
*/
34933465
export class WhisperTokenizer extends PreTrainedTokenizer {
3494-
_default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}`;
34953466

34963467
get timestamp_begin() {
34973468
return this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0] + 1;
@@ -4284,10 +4255,8 @@ export class MarianTokenizer extends PreTrainedTokenizer {
42844255

42854256
export class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer { }
42864257

4287-
export class BlenderbotTokenizer extends PreTrainedTokenizer {
4288-
_default_chat_template = `{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}`;
4289-
}
4290-
export class BlenderbotSmallTokenizer extends BlenderbotTokenizer { } // NOTE `BlenderbotTokenizer` to get the correct chat template
4258+
export class BlenderbotTokenizer extends PreTrainedTokenizer { }
4259+
export class BlenderbotSmallTokenizer extends PreTrainedTokenizer { }
42914260

42924261
export class SpeechT5Tokenizer extends PreTrainedTokenizer { }
42934262

0 commit comments

Comments (0)