@@ -2502,8 +2502,6 @@ function truncateHelper(item, length) {
25022502export class PreTrainedTokenizer extends Callable {
25032503 return_token_type_ids = false ;
25042504
2505- _default_chat_template = `{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}` ;
2506-
25072505 padding_side = 'right' ;
25082506 /**
25092507 * Create a new PreTrainedTokenizer instance.
@@ -3059,26 +3057,10 @@ export class PreTrainedTokenizer extends Callable {
30593057
30603058 return decoded ;
30613059 }
3062-
3063- get default_chat_template ( ) {
3064- if ( ! this . _warned_about_chat_template ) {
3065- console . warn (
3066- "No chat template is defined for this tokenizer - using a default chat template " +
3067- "that implements the ChatML format. If the default is not appropriate for " +
3068- "your model, please set `tokenizer.chat_template` to an appropriate template. " +
3069- "See https://huggingface.co/docs/transformers/main/chat_templating for more information."
3070- )
3071- this . _warned_about_chat_template = true ; // TODO move to logger.warning_once()
3072- }
3073-
3074- return this . _default_chat_template ;
3075- }
3076-
30773060 /**
30783061 * Converts a list of message objects with `"role"` and `"content"` keys to a list of token
30793062 * ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
3080- * determine the format and control tokens to use when converting. When chat_template is None, it will fall back
3081- * to the default_chat_template specified at the class level.
3063+ * determine the format and control tokens to use when converting.
30823064 *
30833065 * See [here](https://huggingface.co/docs/transformers/chat_templating) for more information.
30843066 *
@@ -3105,7 +3087,7 @@ export class PreTrainedTokenizer extends Callable {
31053087 * @param {Message[] } conversation A list of message objects with `"role"` and `"content"` keys.
31063088 * @param {Object } options An optional object containing the following properties:
31073089 * @param {string } [options.chat_template=null] A Jinja template to use for this conversion. If
3108- * this is not passed, the model's default chat template will be used instead.
3090+ * this is not passed, the model's chat template will be used instead.
31093091 * @param {boolean } [options.add_generation_prompt=false] Whether to end the prompt with the token(s) that indicate
31103092 * the start of an assistant message. This is useful when you want to generate a response from the model.
31113093 * Note that this argument will be passed to the chat template, and so it must be supported in the
@@ -3135,10 +3117,10 @@ export class PreTrainedTokenizer extends Callable {
31353117
31363118 // First, handle the cases when the model has a dict of multiple templates
31373119 if (
3138- ( this . chat_template && typeof this . chat_template === 'object' ) ||
3139- ( this . chat_template === null && this . default_chat_template && typeof this . default_chat_template === 'object' )
3120+ this . chat_template !== null // an unset (null) template must NOT take the multi-template path
3121+ && typeof this . chat_template === 'object'
31403122 ) {
3141- const template_dict = this . chat_template ?? this . default_chat_template ; // Guaranteed to be a non-null object
3123+ const template_dict = this . chat_template ; // Guaranteed to be a non-null object
31423124
31433125 if ( chat_template !== null && Object . hasOwn ( template_dict , chat_template ) ) {
31443126 // The user can pass the name of a template to the chat template argument instead of an entire template
@@ -3154,8 +3136,17 @@ export class PreTrainedTokenizer extends Callable {
31543136 }
31553137 } else {
31563138 // These are the cases when the model has a single template
3157- // priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
3158- chat_template ??= this . chat_template ?? this . default_chat_template ;
3139+ // priority: `chat_template` argument > `tokenizer.chat_template` (fall back only when no argument was passed)
3140+ if ( chat_template === null && this . chat_template ) {
3141+ chat_template = this . chat_template ;
3142+ } else if ( chat_template === null ) {
3143+ throw Error (
3144+ "Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
3145+ "argument was passed! For information about writing templates and setting the " +
3146+ "tokenizer.chat_template attribute, please see the documentation at " +
3147+ "https://huggingface.co/docs/transformers/main/en/chat_templating"
3148+ )
3149+ }
31593150 }
31603151 if ( typeof chat_template !== 'string' ) {
31613152 throw Error ( `chat_template must be a string, but got ${ typeof chat_template } ` ) ;
@@ -3250,9 +3241,7 @@ export class ElectraTokenizer extends PreTrainedTokenizer {
32503241}
32513242
32523243export class T5Tokenizer extends PreTrainedTokenizer { }
3253- export class GPT2Tokenizer extends PreTrainedTokenizer {
3254- _default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}`
3255- }
3244+ export class GPT2Tokenizer extends PreTrainedTokenizer { }
32563245export class BartTokenizer extends PreTrainedTokenizer { }
32573246export class MBartTokenizer extends PreTrainedTokenizer {
32583247 constructor ( tokenizerJSON , tokenizerConfig ) {
@@ -3278,7 +3267,7 @@ export class MBart50Tokenizer extends MBartTokenizer { } // NOTE: extends MBartT
32783267
32793268export class RobertaTokenizer extends PreTrainedTokenizer { }
32803269
3281- export class BloomTokenizer extends GPT2Tokenizer { // NOTE: `GPT2Tokenizer` to get the correct chat template
3270+ export class BloomTokenizer extends PreTrainedTokenizer {
32823271
32833272 constructor ( tokenizerJSON , tokenizerConfig ) {
32843273 // Override the default (invalid) regex of the pretokenizer.
@@ -3295,20 +3284,11 @@ export class BloomTokenizer extends GPT2Tokenizer { // NOTE: `GPT2Tokenizer` to
32953284const SPIECE_UNDERLINE = "▁" ;
32963285
32973286export class LlamaTokenizer extends PreTrainedTokenizer {
3298- _default_chat_template = `{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}`
3299-
3300- DEFAULT_SYSTEM_PROMPT =
3301- "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your " +
3302- "answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure " +
3303- "that your responses are socially unbiased and positive in nature.\n\n" +
3304- "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not " +
3305- "correct. If you don't know the answer to a question, please don't share false information."
33063287
33073288 padding_side = 'left' ;
33083289
33093290 constructor ( tokenizerJSON , tokenizerConfig ) {
33103291 super ( tokenizerJSON , tokenizerConfig ) ;
3311- this . use_default_system_prompt = tokenizerConfig . use_default_system_prompt ?? false ;
33123292
33133293 this . legacy = tokenizerConfig . legacy ?? true ;
33143294 if ( ! this . legacy ) {
@@ -3341,14 +3321,8 @@ export class LlamaTokenizer extends PreTrainedTokenizer {
33413321 }
33423322 return tokens ;
33433323 }
3344-
3345- get default_chat_template ( ) {
3346- return super . default_chat_template
3347- . replaceAll ( 'USE_DEFAULT_PROMPT' , this . use_default_system_prompt ? 'true' : 'false' )
3348- . replaceAll ( 'DEFAULT_SYSTEM_MESSAGE' , this . DEFAULT_SYSTEM_PROMPT . replaceAll ( "\n" , "\\n" ) . replaceAll ( "'" , "\\'" ) ) ;
3349- }
33503324}
3351- export class CodeLlamaTokenizer extends LlamaTokenizer { } // NOTE: `LlamaTokenizer` to get the correct chat template
3325+ export class CodeLlamaTokenizer extends PreTrainedTokenizer { }
33523326
33533327export class XLMRobertaTokenizer extends PreTrainedTokenizer { }
33543328export class MPNetTokenizer extends PreTrainedTokenizer { }
@@ -3361,9 +3335,7 @@ export class EsmTokenizer extends PreTrainedTokenizer { }
33613335
33623336export class Qwen2Tokenizer extends PreTrainedTokenizer { }
33633337
3364- export class GemmaTokenizer extends PreTrainedTokenizer {
3365- _default_chat_template = "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
3366- }
3338+ export class GemmaTokenizer extends PreTrainedTokenizer { }
33673339
33683340export class Grok1Tokenizer extends PreTrainedTokenizer { }
33693341
@@ -3491,7 +3463,6 @@ export class M2M100Tokenizer extends PreTrainedTokenizer {
34913463 * @extends PreTrainedTokenizer
34923464 */
34933465export class WhisperTokenizer extends PreTrainedTokenizer {
3494- _default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}` ;
34953466
34963467 get timestamp_begin ( ) {
34973468 return this . model . convert_tokens_to_ids ( [ "<|notimestamps|>" ] ) [ 0 ] + 1 ;
@@ -4284,10 +4255,8 @@ export class MarianTokenizer extends PreTrainedTokenizer {
42844255
42854256export class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer { }
42864257
4287- export class BlenderbotTokenizer extends PreTrainedTokenizer {
4288- _default_chat_template = `{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}` ;
4289- }
4290- export class BlenderbotSmallTokenizer extends BlenderbotTokenizer { } // NOTE `BlenderbotTokenizer` to get the correct chat template
4258+ export class BlenderbotTokenizer extends PreTrainedTokenizer { }
4259+ export class BlenderbotSmallTokenizer extends PreTrainedTokenizer { }
42914260
42924261export class SpeechT5Tokenizer extends PreTrainedTokenizer { }
42934262
0 commit comments