From 82ce25ef50a87534f44ec6a83f466f600df11567 Mon Sep 17 00:00:00 2001
From: Yizhen
Date: Fri, 14 Jun 2024 23:06:03 +0800
Subject: [PATCH 1/6] [Typo] Shebang typo fix

---
 scripts/run_reward_modeling.sh           | 3 +--
 scripts/run_reward_modeling_with_lisa.sh | 3 +--
 scripts/run_reward_modeling_with_lora.sh | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/scripts/run_reward_modeling.sh b/scripts/run_reward_modeling.sh
index 938e10903..311384576 100644
--- a/scripts/run_reward_modeling.sh
+++ b/scripts/run_reward_modeling.sh
@@ -1,5 +1,4 @@
-#!/usr/bin/env python
-# coding=utf-8
+#!/bin/bash
 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
 # Parses arguments
 model_name_or_path=google/gemma-2b-it
diff --git a/scripts/run_reward_modeling_with_lisa.sh b/scripts/run_reward_modeling_with_lisa.sh
index 2fed71b0e..64716259a 100644
--- a/scripts/run_reward_modeling_with_lisa.sh
+++ b/scripts/run_reward_modeling_with_lisa.sh
@@ -1,5 +1,4 @@
-#!/usr/bin/env python
-# coding=utf-8
+#!/bin/bash
 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
 # Parses arguments
 model_name_or_path=google/gemma-2b-it
diff --git a/scripts/run_reward_modeling_with_lora.sh b/scripts/run_reward_modeling_with_lora.sh
index 2663a8f7e..1afbd9784 100644
--- a/scripts/run_reward_modeling_with_lora.sh
+++ b/scripts/run_reward_modeling_with_lora.sh
@@ -1,5 +1,4 @@
-#!/usr/bin/env python
-# coding=utf-8
+#!/bin/bash
 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
 # Parses arguments
 model_name_or_path=google/gemma-2b-it

From 9ae03273dadaaa5fbd38a6f326469d6c5e945c8b Mon Sep 17 00:00:00 2001
From: Yizhen
Date: Fri, 14 Jun 2024 23:07:34 +0800
Subject: [PATCH 2/6] [Usability] Add padding side

---
 src/lmflow/args.py                  | 12 ++++++++
 src/lmflow/models/hf_model_mixin.py | 48 +++++++++++++++++++++--------
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/src/lmflow/args.py b/src/lmflow/args.py
index f4be176cf..ef91ac1f2 100644
--- a/src/lmflow/args.py
+++ b/src/lmflow/args.py
@@ -93,6 +93,8 @@ class ModelArguments:
     arch_type : str
         Model architecture type.
+    padding_side : str
+        The side on which the tokenizer should have padding applied.
 
     """
 
     model_name_or_path: Optional[str] = field(
@@ -296,6 +298,16 @@ class ModelArguments:
             "choices": [None, "left", "right"],
         },
     )
+    padding_side: str = field(
+        default='right',
+        metadata={
+            "help": (
+                "The side on which the tokenizer should have padding applied. "
+                "LMFlow uses right padding by default. When set to `auto`, will "
+                "use padding_side from tokenizer.padding_side."),
+            "choices": ["right", "left", "auto"],
+        }
+    )
 
     def __post_init__(self):
         if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
diff --git a/src/lmflow/models/hf_model_mixin.py b/src/lmflow/models/hf_model_mixin.py
index ed1a63f94..0d7e73dd0 100644
--- a/src/lmflow/models/hf_model_mixin.py
+++ b/src/lmflow/models/hf_model_mixin.py
@@ -3,7 +3,7 @@
 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
import os import logging -from typing import Union, Optional +from typing import Union, Optional, Dict import torch import deepspeed @@ -30,6 +30,7 @@ from lmflow.utils.constants import ( LMFLOW_LORA_TARGET_MODULES_MAPPING ) +from lmflow.args import ModelArguments logger = logging.getLogger(__name__) @@ -51,11 +52,12 @@ class HFModelMixin(BaseModel): def __init__( self, - model_args, + model_args: ModelArguments, do_train: bool, ds_config=None, device: Optional[str]="gpu", use_accelerator: bool=False, + hf_auto_model_additional_args: Optional[Dict]=None, *args, **kwargs ): @@ -88,7 +90,7 @@ def __init__( self.model_args = model_args self.tokenizer = self.__prepare_tokenizer(model_args) self.torch_dtype = self.__prepare_dtype(model_args) - self.hf_model_config = self.__prepare_model_config(model_args) + self.hf_model_config = self.__prepare_model_config(model_args, hf_auto_model_additional_args) self.quant_config = self.__prepare_quant_config(model_args) self.peft_config = self.__prepare_peft_config(model_args) @@ -106,11 +108,13 @@ def __init__( self.tokenizer.eos_token_id = self.backend_model.config.eos_token_id if self.tokenizer.pad_token_id is None: self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + if self.backend_model.config.pad_token_id is None: + self.backend_model.config.pad_token_id = self.tokenizer.pad_token_id def __prepare_tokenizer( self, - model_args + model_args: ModelArguments, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: tokenizer_kwargs = { "cache_dir": model_args.cache_dir, @@ -119,6 +123,8 @@ def __prepare_tokenizer( "use_auth_token": True if model_args.use_auth_token else None, "trust_remote_code": model_args.trust_remote_code, } + if model_args.padding_side != 'auto': + tokenizer_kwargs["padding_side"] = model_args.padding_side try: if model_args.tokenizer_name: @@ -163,7 +169,7 @@ def __prepare_tokenizer( def __prepare_dtype( self, - model_args + model_args: ModelArguments, ) -> torch.dtype: if model_args.arch_type == 'text_regression': if model_args.torch_dtype in ["auto", None, "bf16", "bfloat16"]: @@ -189,8 +195,23 @@ def __prepare_dtype( def __prepare_model_config( self, - model_args + model_args: ModelArguments, + hf_auto_model_additional_args: Optional[Dict]=None, ): + """Prepare model configuration for hf auto register, + Parameters + ---------- + model_args : ModelArguments + LMFlow model arguments. + hf_auto_model_additional_args : Optional[Dict], optional + Special configurations such as `num_labels` in `AutoModelForSequenceClassification` + (commonly used in reward modeling) will not preset in __prepare_model_config, + so it should be passed in hf_auto_model_additional_args. + Returns + ------- + config : ModelConfig + hf model config. 
+ """ config_kwargs = { "torch_dtype": self.torch_dtype, "attn_implementation": "flash_attention_2" if model_args.use_flash_attention else None, @@ -200,6 +221,9 @@ def __prepare_model_config( "trust_remote_code": model_args.trust_remote_code, "from_tf": bool(".ckpt" in model_args.model_name_or_path), } + if hf_auto_model_additional_args is not None: + config_kwargs.update(hf_auto_model_additional_args) + if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) elif model_args.model_name_or_path: @@ -217,7 +241,7 @@ def __prepare_model_config( def __prepare_quant_config( self, - model_args + model_args: ModelArguments, ): quant_config = None if model_args.use_qlora: @@ -236,7 +260,7 @@ def __prepare_quant_config( def __prepare_peft_config( self, - model_args + model_args: ModelArguments, ): peft_config = None if model_args.use_lora: @@ -267,7 +291,7 @@ def __prepare_peft_config( def __model_module_inject( self, - model_args + model_args: ModelArguments, ) -> None: """Override some model modules with custom implementations. @@ -286,8 +310,8 @@ def __model_module_inject( def __prepare_model_for_training( self, - model_args, - hf_auto_model: HF_AUTOMODEL_TYPE + model_args: ModelArguments, + hf_auto_model: HF_AUTOMODEL_TYPE, ): # TODO: change to accelerate logger.info("Preparing model for training") @@ -326,7 +350,7 @@ def __prepare_model_for_training( def __prepare_model_for_inference( self, - model_args, + model_args: ModelArguments, hf_auto_model: HF_AUTOMODEL_TYPE, use_accelerator, ds_config From 911bbb636d87f7b172ed80251ed2e1258933beb7 Mon Sep 17 00:00:00 2001 From: Yizhen Date: Fri, 14 Jun 2024 23:10:15 +0800 Subject: [PATCH 3/6] [Usability] Add text_only and text2text data support --- src/lmflow/models/hf_text_regression_model.py | 46 +++++++++++++++---- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/src/lmflow/models/hf_text_regression_model.py b/src/lmflow/models/hf_text_regression_model.py index 0102c92f8..2bfcf245d 100644 --- a/src/lmflow/models/hf_text_regression_model.py +++ b/src/lmflow/models/hf_text_regression_model.py @@ -31,10 +31,12 @@ from lmflow.models.interfaces.tunable import Tunable from lmflow.models.hf_model_mixin import HFModelMixin from lmflow.models.text_regression_model import TextRegressionModel -from lmflow.tokenization.hf_text_regression_model import tokenize_function +from lmflow.tokenization.hf_text_regression_model import paired_conversation_tokenize_function, tokenize_function from lmflow.utils.conversation_template import PRESET_TEMPLATES from lmflow.utils.constants import ( PAIRED_CONVERSATION_DATASET_DESCRIPTION, + TEXT2TEXT_DATASET_DESCRIPTION, + TEXT_ONLY_DATASET_DESCRIPTION, CONVERSATION_ROLE_NAMES, ) @@ -81,6 +83,7 @@ def __init__( :param tune_strategy: tuning strategy: normal, none, lora or adapter :param ds_config: deepspeed configuration for distributed training """ + config_additional_args = {"num_labels": 1} HFModelMixin.__init__( self, model_args=model_args, @@ -88,6 +91,7 @@ def __init__( ds_config=ds_config, device=device, use_accelerator=use_accelerator, + hf_auto_model_additional_args=config_additional_args, *args, **kwargs ) @@ -133,14 +137,28 @@ def tokenize( raw_datasets = dataset hf_raw_datasets = dataset.get_backend_dataset() column_names = list(hf_raw_datasets.features) # in paired conversation, for example, would be 'chosen' and 'rejected' - - # since this will be pickled to avoid _LazyModule error in Hasher force - # logger loading before tokenize_function - tok_logger = 
transformers.utils.logging.get_logger("transformers.tokenization_utils_base") - data_args = raw_datasets.get_data_args() - if dataset_type == "paired_conversation": + # Requires three types of information for tokenizing different datasets + # 1) Which fields require tokenization, e.g. + # "text2float": "text", but not "float" + # "text2text": both "input" and "output" + # 2) How will there tokenized sequence concatenated together, e.g. + # "text_only": "text" -> "text" + # "text2text": "input", "output" -> "input" + "output" + # 3) Which fields require loss in final computation, e.g. + # "text_only": "text" + # "text2text": "output" only + tokenized_column_order = None # Handles 1) and 2) + label_columns = None # Handles 3) + if dataset_type == "text_only": + tokenized_column_order = ["text"] + label_columns = ["text"] + elif dataset_type == "text2text": + tokenized_column_order = ["input", "output"] + label_columns = ["output"] + add_special_tokens = False + elif dataset_type == "paired_conversation": if data_args.conversation_template: if data_args.conversation_template in PRESET_TEMPLATES.keys(): conversation_template = PRESET_TEMPLATES[data_args.conversation_template] @@ -157,7 +175,9 @@ def tokenize( raise NotImplementedError( f"Dataset type \"{dataset_type}\" is not supported, currently" " only support following data types for HFTextRegressionModel:\n" - f" {PAIRED_CONVERSATION_DATASET_DESCRIPTION}\n" + f" 1) {TEXT_ONLY_DATASET_DESCRIPTION}\n" + f" 2) {TEXT2TEXT_DATASET_DESCRIPTION}\n" + f" 3) {PAIRED_CONVERSATION_DATASET_DESCRIPTION}\n" ) # Whether to truncate long sequences to fit into max_length @@ -165,13 +185,19 @@ def tokenize( if model_args.use_lora or data_args.disable_group_texts: use_truncation = True - tokenize_fn = tokenize_function + tokenize_fn = paired_conversation_tokenize_function if "conversation" in dataset_type else tokenize_function tokenize_fn_kwargs = { "data_args": data_args, "tokenizer": self.tokenizer, "column_names": column_names, - "conversation_template": conversation_template } + if "conversation" in dataset_type: + tokenize_fn_kwargs["conversation_template"] = conversation_template + else: + tokenize_fn_kwargs["label_columns"] = label_columns + tokenize_fn_kwargs["tokenized_column_order"] = tokenized_column_order + tokenize_fn_kwargs["add_special_tokens"] = add_special_tokens + tokenize_fn_kwargs["use_truncation"] = use_truncation tokenize_kwargs = {} if not data_args.streaming: From 15002f6f7dd782d54a6b4bb3944a5db85c211647 Mon Sep 17 00:00:00 2001 From: Yizhen Date: Fri, 14 Jun 2024 23:10:54 +0800 Subject: [PATCH 4/6] [Usability] Disable use_cache when gradient_checkpointing --- src/lmflow/pipeline/rm_tuner.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/lmflow/pipeline/rm_tuner.py b/src/lmflow/pipeline/rm_tuner.py index 99c7923d6..2997aba2a 100644 --- a/src/lmflow/pipeline/rm_tuner.py +++ b/src/lmflow/pipeline/rm_tuner.py @@ -194,6 +194,13 @@ def switch_active_layers(self): elif last_checkpoint is not None: checkpoint = last_checkpoint + if self.finetuner_args.gradient_checkpointing: + if model.get_backend_model().config.use_cache: + logger.warning( + "Backend model config `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." 
+ ) + model.get_backend_model().config.use_cache = False + train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload From 4fb2e0a5729a3ec7e19ec1adaea6a558c19cf71b Mon Sep 17 00:00:00 2001 From: Yizhen Date: Fri, 14 Jun 2024 23:11:42 +0800 Subject: [PATCH 5/6] [Usability] Add padding side --- src/lmflow/tokenization/hf_decoder_model.py | 51 +++-- .../tokenization/hf_text_regression_model.py | 204 +++++++++++++++--- 2 files changed, 205 insertions(+), 50 deletions(-) diff --git a/src/lmflow/tokenization/hf_decoder_model.py b/src/lmflow/tokenization/hf_decoder_model.py index 0e75c6d88..00126ee12 100644 --- a/src/lmflow/tokenization/hf_decoder_model.py +++ b/src/lmflow/tokenization/hf_decoder_model.py @@ -4,13 +4,15 @@ import logging from logging import Logger -from typing import Dict +from typing import Dict, Union import transformers from transformers.testing_utils import CaptureLogger +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from lmflow.utils.conversation_template import ConversationTemplate from lmflow.utils.constants import CONVERSATION_ROLE_NAMES +from lmflow.args import DatasetArguments logger = logging.getLogger(__name__) @@ -22,6 +24,7 @@ def blocking( block_size: int, model_max_length: int, pad_token_id: int, + padding_side: str, ) -> Dict: block_size_warning_num = 0 num_example = len(token_dict[list(token_dict.keys())[0]]) @@ -35,16 +38,32 @@ def blocking( for key in ["input_ids", "attention_mask", "labels"]: token_dict[key][i] = token_dict[key][i][:pad_length] else: - # Pads too short samples - token_dict["input_ids"][i].extend( - [pad_token_id for _ in range(pad_length)] - ) - token_dict["attention_mask"][i].extend( - [0 for _ in range(pad_length)] - ) - token_dict["labels"][i].extend( - [-100 for _ in range(pad_length)] - ) + if padding_side == 'right': + # Pads too short samples + token_dict["input_ids"][i].extend( + [pad_token_id for _ in range(pad_length)] + ) + token_dict["attention_mask"][i].extend( + [0 for _ in range(pad_length)] + ) + token_dict["labels"][i].extend( + [-100 for _ in range(pad_length)] + ) + elif padding_side == 'left': + # Pads too short samples + token_dict["input_ids"][i] = ( + [pad_token_id for _ in range(pad_length)] + token_dict["input_ids"][i] + ) + token_dict["attention_mask"][i] = ( + [0 for _ in range(pad_length)] + token_dict["attention_mask"][i] + ) + token_dict["labels"][i] = ( + [-100 for _ in range(pad_length)] + token_dict["labels"][i] + ) + else: + raise ValueError( + f"padding_side should be either 'right' or 'left', got {padding_side}" + ) if block_size_warning_num > 0: logger.warning( f"There are {block_size_warning_num} of {num_example} samples where" @@ -58,8 +77,8 @@ def blocking( def tokenize_function( examples, - data_args, - tokenizer, + data_args: DatasetArguments, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, label_columns, tokenized_column_order, @@ -105,6 +124,7 @@ def tokenize_function( block_size=data_args.block_size, model_max_length=tokenizer.model_max_length, pad_token_id=tokenizer.pad_token_id, + padding_side=tokenizer.padding_side, ) # clm input could be much much longer than block_size @@ -118,8 +138,8 @@ def tokenize_function( def conversation_tokenize_function( examples, - data_args, - tokenizer, + data_args: DatasetArguments, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, conversation_template: ConversationTemplate, ) -> Dict: @@ -175,6 +195,7 @@ def 
conversation_tokenize_function( block_size=data_args.block_size, model_max_length=tokenizer.model_max_length, pad_token_id=tokenizer.pad_token_id, + padding_side=tokenizer.padding_side, ) # clm input could be much much longer than block_size diff --git a/src/lmflow/tokenization/hf_text_regression_model.py b/src/lmflow/tokenization/hf_text_regression_model.py index fc9da8ec3..483f9db58 100644 --- a/src/lmflow/tokenization/hf_text_regression_model.py +++ b/src/lmflow/tokenization/hf_text_regression_model.py @@ -4,23 +4,133 @@ import logging from logging import Logger -from typing import Dict, List +from typing import Dict, List, Union import transformers from transformers.testing_utils import CaptureLogger +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from lmflow.utils.conversation_template import ConversationTemplate from lmflow.utils.constants import CONVERSATION_ROLE_NAMES +from lmflow.args import DatasetArguments logger = logging.getLogger(__name__) tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") -def tokenize_function( +def blocking_paired( + token_dict: Dict, + column_names: List, + block_size: int, + model_max_length: int, + pad_token_id: int, + padding_side: str, +) -> Dict: + block_size_warning_num = 0 + num_example = len(token_dict[list(token_dict.keys())[0]]) + for i in range(num_example): + for column_name in column_names: + max_length = min(block_size, model_max_length) + pad_length = max_length - len(token_dict[f"input_ids_{column_name}"][i]) + if block_size < model_max_length: + block_size_warning_num += 1 + if pad_length < 0: + # Truncates too long samples + for key in [f"input_ids_{column_name}", f"attention_mask_{column_name}"]: + token_dict[key][i] = token_dict[key][i][:pad_length] + else: + if padding_side == 'right': + # Pads too short samples + token_dict[f"input_ids_{column_name}"][i].extend( + [pad_token_id for _ in range(pad_length)] + ) + token_dict[f"attention_mask_{column_name}"][i].extend( + [0 for _ in range(pad_length)] + ) + elif padding_side == 'left': + # Pads too short samples + token_dict[f"input_ids_{column_name}"][i] = ( + [pad_token_id for _ in range(pad_length)] + token_dict[f"input_ids_{column_name}"][i] + ) + token_dict[f"attention_mask_{column_name}"][i] = ( + [0 for _ in range(pad_length)] + token_dict[f"attention_mask_{column_name}"][i] + ) + else: + raise ValueError( + f"padding_side should be either 'right' or 'left', got {padding_side}" + ) + if block_size_warning_num > 0: + logger.warning( + f"There are {block_size_warning_num} of {num_example} samples where" + f"block_size {block_size} < model_max_length" + f" {model_max_length}, use block_size" + " for maximum tokenized sequence length" + ) + + return token_dict + + +def blocking( + token_dict: Dict, + block_size: int, + model_max_length: int, + pad_token_id: int, + padding_side: str, +) -> Dict: + block_size_warning_num = 0 + num_example = len(token_dict[list(token_dict.keys())[0]]) + for i in range(num_example): + max_length = min(block_size, model_max_length) + pad_length = max_length - len(token_dict["input_ids"][i]) + if block_size < model_max_length: + block_size_warning_num += 1 + if pad_length < 0: + # Truncates too long samples + for key in ["input_ids", "attention_mask", "labels"]: + token_dict[key][i] = token_dict[key][i][:pad_length] + else: + if padding_side == 'right': + # Pads too short samples + token_dict["input_ids"][i].extend( + [pad_token_id for _ in range(pad_length)] + ) + 
token_dict["attention_mask"][i].extend( + [0 for _ in range(pad_length)] + ) + token_dict["labels"][i].extend( + [-100 for _ in range(pad_length)] + ) + elif padding_side == 'left': + # Pads too short samples + token_dict["input_ids"][i] = ( + [pad_token_id for _ in range(pad_length)] + token_dict["input_ids"][i] + ) + token_dict["attention_mask"][i] = ( + [0 for _ in range(pad_length)] + token_dict["attention_mask"][i] + ) + token_dict["labels"][i] = ( + [-100 for _ in range(pad_length)] + token_dict["labels"][i] + ) + else: + raise ValueError( + f"padding_side should be either 'right' or 'left', got {padding_side}" + ) + if block_size_warning_num > 0: + logger.warning( + f"There are {block_size_warning_num} of {num_example} samples where" + f"block_size {block_size} < model_max_length" + f" {model_max_length}, use block_size" + " for maximum tokenized sequence length" + ) + + return token_dict + + +def paired_conversation_tokenize_function( examples, - data_args, - tokenizer, + data_args: DatasetArguments, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, conversation_template: ConversationTemplate, ) -> Dict: @@ -69,7 +179,8 @@ def tokenize_function( column_names=column_names, block_size=data_args.block_size, model_max_length=tokenizer.model_max_length, - pad_token_id=tokenizer.pad_token_id + pad_token_id=tokenizer.pad_token_id, + padding_side=tokenizer.padding_side, ) # clm input could be much much longer than block_size @@ -81,39 +192,62 @@ def tokenize_function( return token_dict -def blocking_paired( - token_dict: Dict, - column_names: List, - block_size: int, - model_max_length: int, - pad_token_id: int, +def tokenize_function( + examples, + data_args: DatasetArguments, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + column_names, + label_columns, + tokenized_column_order, + add_special_tokens, + use_truncation, ) -> Dict: - block_size_warning_num = 0 - num_example = len(token_dict[list(token_dict.keys())[0]]) - for i in range(num_example): - for column_name in column_names: - max_length = min(block_size, model_max_length) - pad_length = max_length - len(token_dict[f"input_ids_{column_name}"][i]) - if block_size < model_max_length: - block_size_warning_num += 1 - if pad_length < 0: - # Truncates too long samples - for key in [f"input_ids_{column_name}", f"attention_mask_{column_name}"]: - token_dict[key][i] = token_dict[key][i][:pad_length] + """Handels text_only and text2text datasets tokenization + """ + num_example = len(examples[column_names[0]]) + token_dict = { + "input_ids": [[] for _ in range(num_example)], + "attention_mask": [[] for _ in range(num_example)], + "labels": [[] for _ in range(num_example)], + } + with CaptureLogger(tok_logger) as cl: + for column_name in tokenized_column_order: + encoding = tokenizer( + examples[column_name], + add_special_tokens=add_special_tokens, + truncation=use_truncation, + ) + + if column_name in label_columns: + labels = encoding["input_ids"].copy() else: - # Pads too short samples - token_dict[f"input_ids_{column_name}"][i].extend( - [pad_token_id for _ in range(pad_length)] + labels = [ + [-100] * len(encoding["input_ids"][i]) + for i in range(num_example) + ] + + for i in range(num_example): + token_dict["input_ids"][i].extend( + encoding["input_ids"][i] ) - token_dict[f"attention_mask_{column_name}"][i].extend( - [0 for _ in range(pad_length)] + token_dict["attention_mask"][i].extend( + encoding["attention_mask"][i] ) - if block_size_warning_num > 0: - logger.warning( - f"There are 
{block_size_warning_num} of {num_example} samples where" - f"block_size {block_size} < model_max_length" - f" {model_max_length}, use block_size" - " for maximum tokenized sequence length" + token_dict["labels"][i].extend(labels[i]) + + if data_args.disable_group_texts: + token_dict = blocking( + token_dict=token_dict, + block_size=data_args.block_size, + model_max_length=tokenizer.model_max_length, + pad_token_id=tokenizer.pad_token_id, + padding_side=tokenizer.padding_side, + ) + + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" + " before being passed to the model." ) - return token_dict \ No newline at end of file From f9d99e16bf91e9909f6eb1bd173b3e45223c898a Mon Sep 17 00:00:00 2001 From: Eric Date: Sat, 15 Jun 2024 00:14:49 +0800 Subject: [PATCH 6/6] [Usability] Add padding_side to dataset fingerprint --- src/lmflow/models/hf_decoder_model.py | 1 + src/lmflow/models/hf_text_regression_model.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/lmflow/models/hf_decoder_model.py b/src/lmflow/models/hf_decoder_model.py index fa571413c..0f5b1c4d5 100644 --- a/src/lmflow/models/hf_decoder_model.py +++ b/src/lmflow/models/hf_decoder_model.py @@ -246,6 +246,7 @@ def tokenize(self, dataset, add_special_tokens=True, *args, **kwargs): ( raw_datasets.get_fingerprint() + str(self.tokenizer) + + f'###padding_side={self.tokenizer.padding_side}' + ('###conversation_template=' + str(conversation_template) if "conversation" in dataset_type else "") + f'###disable_group_texts={data_args.disable_group_texts}' + f'###block_size={data_args.block_size}' diff --git a/src/lmflow/models/hf_text_regression_model.py b/src/lmflow/models/hf_text_regression_model.py index 2bfcf245d..4bce86306 100644 --- a/src/lmflow/models/hf_text_regression_model.py +++ b/src/lmflow/models/hf_text_regression_model.py @@ -205,6 +205,7 @@ def tokenize( ( raw_datasets.get_fingerprint() + str(self.tokenizer) + + f'###padding_side={self.tokenizer.padding_side}' + ('###conversation_template=' + str(conversation_template) if "conversation" in dataset_type else "") + f'###disable_group_texts={data_args.disable_group_texts}' + f'###block_size={data_args.block_size}'
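
A minimal, self-contained sketch of the padding behaviour that the new `padding_side` option selects; it mirrors the `blocking()` helpers patched in above, but the function `pad_block` and its signature are illustrative only, not part of the LMFlow API:

    def pad_block(input_ids, attention_mask, labels, max_length, pad_token_id, padding_side="right"):
        # Mirrors blocking(): truncate over-long samples, pad short ones on the chosen side.
        pad_length = max_length - len(input_ids)
        if pad_length < 0:
            # A negative pad_length drops the trailing |pad_length| tokens.
            return input_ids[:pad_length], attention_mask[:pad_length], labels[:pad_length]
        pads, zeros, ignore = [pad_token_id] * pad_length, [0] * pad_length, [-100] * pad_length
        if padding_side == "right":
            return input_ids + pads, attention_mask + zeros, labels + ignore
        if padding_side == "left":
            return pads + input_ids, zeros + attention_mask, ignore + labels
        raise ValueError(f"padding_side should be either 'right' or 'left', got {padding_side}")

For example, pad_block([5, 6, 7], [1, 1, 1], [5, 6, 7], 6, 0, "left") returns ([0, 0, 0, 5, 6, 7], [0, 0, 0, 1, 1, 1], [-100, -100, -100, 5, 6, 7]); with the default padding_side='right' from ModelArguments the pads are appended instead, and 'auto' defers to the tokenizer's own padding_side. The same tokenizer.padding_side value is also folded into the dataset fingerprint by PATCH 6/6, so cached tokenizations are not reused across different padding sides.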