From acf23ac79a4d9f48d3470db72325041db9453fe4 Mon Sep 17 00:00:00 2001
From: Qingyi Si <43233342+PhoebusSi@users.noreply.github.com>
Date: Thu, 30 Mar 2023 03:41:32 +0800
Subject: [PATCH] Update and rename finetune_glm.py to uniform_finetune.py

---
 finetune_glm.py => uniform_finetune.py | 91 +++++++++++++++++---------
 1 file changed, 61 insertions(+), 30 deletions(-)
 rename finetune_glm.py => uniform_finetune.py (70%)

diff --git a/finetune_glm.py b/uniform_finetune.py
similarity index 70%
rename from finetune_glm.py
rename to uniform_finetune.py
index e16385a..a2bfa95 100644
--- a/finetune_glm.py
+++ b/uniform_finetune.py
@@ -16,7 +16,7 @@
                          LlamaForCausalLM, LlamaTokenizer,
                          AutoModel, AutoTokenizer,
                          BloomForCausalLM, BloomTokenizerFast)
-from model_chatglm import ChatGLMForConditionalGeneration, ChatGLMTokenizer
+

 from peft import (
     prepare_model_for_int8_training,
@@ -42,8 +42,8 @@
     }),
     "chatglm": ModelClass(**{
-        "tokenizer": ChatGLMTokenizer,
-        "model": ChatGLMForConditionalGeneration,
+        "tokenizer": AutoTokenizer, #ChatGLMTokenizer,
+        "model": AutoModel, #ChatGLMForConditionalGeneration,
     }),
     "bloom": ModelClass(**{
         "tokenizer": BloomTokenizerFast,
@@ -57,12 +57,13 @@

 # add the custom dataset
 DATA_PATH = {
-    "alpaca": "data/alpaca_data_cleaned.json",
-    "belle": "data/belle_data_cn.json",
-    "alpaca-belle": "data/alpaca_plus_belle_data.json",
-    "cot": "data/CoT_data.json",
-    "alpaca-cot": "data/alcapa_plus_cot.json",
-    "alpaca-belle-cot": "data/alcapa_plus_belle_plus_cot.json"
+    "alpaca": "alpaca_data_cleaned.json",
+    "belle": "/mnt/bn/qingyi-bn-lq/llama/belle-0.5M-cn/belle_data_cn.json",
+    "alpaca-belle": "/mnt/bn/qingyi-bn-lq/llama/belle-0.5M-cn/alpaca_plus_belle_data.json",
+    "cot": "/mnt/bn/qingyi-bn-lq/llama/all_formatted_data/CoT_data.json",
+    "alpaca-cot": "/mnt/bn/qingyi-bn-lq/llama/all_formatted_data/alcapa_plus_cot.json",
+    "alpaca-belle-cot": "/mnt/bn/qingyi-bn-lq/llama/all_formatted_data/alcapa_plus_belle_plus_cot.json",
+    "belle1.5m": "/mnt/bn/qingyi-bn-lq/llama/all_formatted_data/belle_data1.5M_cn.json.json"
 }

 PROMPT_DICT = {
@@ -104,10 +105,18 @@ def get_model_class(model_type):

     model_class = get_model_class(args.model_type)
-    model = model_class.model.from_pretrained(args.model_name_or_path,
-                                              load_in_8bit=True,
-                                              device_map=device_map)
-    tokenizer = model_class.tokenizer.from_pretrained(args.model_name_or_path)  # default add_eos_token=False
+    if args.model_type == "chatglm":
+        # chatglm can not set load_in_8bit=True: ChatGLMForConditionalGeneration does not support gradient checkpointing.
+        model = model_class.model.from_pretrained(args.model_name_or_path,
+                                                  trust_remote_code=True,
+                                                  device_map=device_map)
+        tokenizer = model_class.tokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)  # default add_eos_token=False
+    else:
+        model = model_class.model.from_pretrained(args.model_name_or_path,
+                                                  load_in_8bit=True,
+                                                  device_map=device_map)
+
+        tokenizer = model_class.tokenizer.from_pretrained(args.model_name_or_path)  # default add_eos_token=False

     # llama has no pad_id, maybe copy the stanford_alpaca's handling ?
     if args.model_type == 'llama':
@@ -136,32 +145,55 @@ def train(args):
     # 1. load data & model_class
     data, model, tokenizer = get_data_model(args)

-    def tokenize(prompt):
-        result = tokenizer(prompt,
-                           truncation=True,
-                           max_length=args.cutoff_len,
-                           # padding="max_length",
-                           padding=False,
-                           )
+    if "chatglm" in args.model_type:
+        def prompt_tokenize(prompt):
+            input_ids = tokenizer.encode(prompt)
+            return {
+                "input_ids": input_ids,
+                "labels": copy.deepcopy(input_ids)
+            }
+        def completion_tokenize(completion):
+            if completion[-4:] == '</s>':
+                input_ids = tokenizer.encode(completion[:-4])  #, add_special_tokens=False)
+            else:
+                input_ids = tokenizer.encode(completion)  #, add_special_tokens=False)
+            return {
+                "input_ids": input_ids,
+                "labels": copy.deepcopy(input_ids)
+            }
+    else:
+        def tokenize(prompt):
+            result = tokenizer(prompt,
+                               truncation=True,
+                               max_length=args.cutoff_len,
+                               # padding="max_length",
+                               padding=False,
+                               )

-        return {
-            "input_ids": result["input_ids"],
-            "attention_mask": result["attention_mask"],
-            "labels": copy.deepcopy(result["input_ids"])
-        }
+            return {
+                "input_ids": result["input_ids"],
+                "attention_mask": result["attention_mask"],
+                "labels": copy.deepcopy(result["input_ids"])
+            }

     def generate_and_tokenize_prompt(data_point):
         prompt_no_resp = generate_prompt(data_point)
-        tokenized_result = tokenize(prompt_no_resp)
+        if "chatglm" in args.model_type:
+            tokenized_result = prompt_tokenize(prompt_no_resp)
+        else:
+            tokenized_result = tokenize(prompt_no_resp)

         source_len = len(tokenized_result['input_ids'])
         prompt_with_response = prompt_no_resp + " " + data_point["output"]
-        if "llama" in args.model_type:
-            prompt_with_response += " " + tokenizer.eos_token
+        # if "llama" in args.model_type:
+        prompt_with_response += " " + tokenizer.eos_token

-        tokenized_with_response = tokenize(prompt_with_response)
+        if "chatglm" in args.model_type:
+            tokenized_with_response = completion_tokenize(prompt_with_response)
+        else:
+            tokenized_with_response = tokenize(prompt_with_response)

         tokenized_with_response["labels"] = [IGNORE_INDEX] * source_len + tokenized_with_response["labels"][source_len:]
@@ -258,4 +290,3 @@ def generate_and_tokenize_prompt(data_point):

     print(args)
     train(args)
-
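
For readers skimming the diff, here is a minimal sketch of the model-loading branch this patch introduces, kept outside the patch so the diff above stays intact. The `load_model_and_tokenizer` helper and the "auto" device map default are illustrative assumptions, not part of the commit; only the chatglm-vs-other branching, the `trust_remote_code=True` keyword, and the `load_in_8bit=True` path mirror the change.

# Sketch of the loading logic added by this patch (not the full uniform_finetune.py).
# ChatGLM goes through AutoModel/AutoTokenizer with trust_remote_code=True and
# WITHOUT load_in_8bit (per the patch comment, its remote code does not support
# gradient checkpointing); other model types keep the original int8 path.
from transformers import AutoModel, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer


def load_model_and_tokenizer(model_type: str, model_name_or_path: str, device_map="auto"):
    if model_type == "chatglm":
        model = AutoModel.from_pretrained(model_name_or_path,
                                          trust_remote_code=True,
                                          device_map=device_map)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
                                                  trust_remote_code=True)
    else:
        # LLaMA shown here; the script picks the class from its MODEL_CLASSES table.
        model = LlamaForCausalLM.from_pretrained(model_name_or_path,
                                                 load_in_8bit=True,
                                                 device_map=device_map)
        tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path)
    return model, tokenizer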
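
The other substantive change is how the combined prompt + response is turned into training labels: the first source_len positions (the prompt) are overwritten with IGNORE_INDEX so the loss is computed only on the response and the EOS token. The sketch below shows that masking idea in isolation, with a hypothetical whitespace "tokenizer" standing in for the real one so it runs without any model weights; everything except the IGNORE_INDEX constant and the prefix masking is an illustrative assumption.

# Self-contained sketch of the label-masking scheme used in generate_and_tokenize_prompt.
import copy

IGNORE_INDEX = -100  # same constant the training script uses for masked label positions


def toy_encode(text: str) -> list[int]:
    # Hypothetical stand-in: one fake "token id" per whitespace-separated word.
    return [hash(w) % 1000 for w in text.split()]


def build_example(prompt: str, output: str, eos_token: str = "</s>"):
    source_ids = toy_encode(prompt)                                # prompt only
    full_ids = toy_encode(prompt + " " + output + " " + eos_token)  # prompt + response + EOS
    labels = copy.deepcopy(full_ids)
    # Mask everything that belongs to the prompt; keep response + EOS as targets.
    labels[:len(source_ids)] = [IGNORE_INDEX] * len(source_ids)
    return {"input_ids": full_ids, "labels": labels}


if __name__ == "__main__":
    ex = build_example("Instruction: add 2 and 3. Response:", "5")
    print(ex["labels"])  # IGNORE_INDEX for the prompt part, real ids for the response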